summaryrefslogtreecommitdiff
path: root/storage
diff options
context:
space:
mode:
authorunknown <jani@hynda.mysql.fi>2007-09-27 17:05:07 +0300
committerunknown <jani@hynda.mysql.fi>2007-09-27 17:05:07 +0300
commitf4afcec393e9c75531220c1fe1f5d387891549fa (patch)
treef199672f3b02bf9f3109ee96272febb917f2e036 /storage
parent9a640f848e465f2bebd27072fa651a230e9632f9 (diff)
parent8b5dddbc006afe8f6dae8408cea7481c17dade72 (diff)
downloadmariadb-git-f4afcec393e9c75531220c1fe1f5d387891549fa.tar.gz
Merge hynda.mysql.fi:/home/my/mysql-5.1-main
into hynda.mysql.fi:/home/my/mysql-maria BitKeeper/etc/ignore: auto-union BUILD/SETUP.sh: Auto merged BitKeeper/deleted/.del-CMakeLists.txt~99a50df6: Auto merged Makefile.am: Auto merged client/mysqldump.c: Auto merged configure.in: Auto merged include/Makefile.am: Auto merged include/keycache.h: Auto merged include/m_string.h: Auto merged include/my_sys.h: Auto merged libmysqld/Makefile.am: Auto merged mysql-test/extra/rpl_tests/rpl_insert_delayed.test: Auto merged mysql-test/lib/mtr_cases.pl: Auto merged mysql-test/lib/mtr_misc.pl: Auto merged mysql-test/lib/mtr_process.pl: Auto merged mysql-test/lib/mtr_report.pl: Auto merged mysql-test/lib/mtr_timer.pl: Auto merged mysql-test/mysql-test-run.pl: Auto merged mysql-test/r/alter_table.result: Auto merged mysql-test/r/merge.result: Auto merged mysql-test/r/mysqldump.result: Auto merged mysql-test/r/query_cache.result: Auto merged mysql-test/r/subselect.result: Auto merged mysql-test/r/view.result: Auto merged mysql-test/suite/ndb/r/ps_7ndb.result: Auto merged mysql-test/suite/rpl/r/rpl_events.result: Auto merged mysql-test/suite/rpl/r/rpl_insert.result: Auto merged mysql-test/suite/rpl/r/rpl_row_insert_delayed.result: Auto merged mysql-test/suite/rpl/r/rpl_stm_flsh_tbls.result: Auto merged mysql-test/suite/rpl/r/rpl_stm_insert_delayed.result: Auto merged mysql-test/suite/rpl/r/rpl_switch_stm_row_mixed.result: Auto merged mysql-test/suite/rpl/t/rpl_insert.test: Auto merged mysql-test/suite/rpl/t/rpl_stm_flsh_tbls.test: Auto merged mysql-test/suite/rpl/t/rpl_switch_stm_row_mixed.test: Auto merged mysql-test/t/alter_table.test: Auto merged mysql-test/t/myisam.test: Auto merged mysql-test/t/mysqldump.test: Auto merged mysql-test/t/query_cache.test: Auto merged mysql-test/t/subselect.test: Auto merged mysql-test/t/view.test: Auto merged mysys/array.c: Auto merged mysys/mf_keycache.c: Auto merged mysys/my_init.c: Auto merged mysys/my_symlink2.c: Auto merged mysys/safemalloc.c: Auto merged mysys/thr_lock.c: Auto merged sql/Makefile.am: Auto merged sql/filesort.cc: Auto merged sql/ha_ndbcluster.cc: Auto merged sql/ha_partition.cc: Auto merged sql/ha_partition.h: Auto merged sql/handler.h: Auto merged sql/item_func.cc: Auto merged sql/item_func.h: Auto merged sql/item_xmlfunc.cc: Auto merged sql/lock.cc: Auto merged sql/log.cc: Auto merged sql/mysql_priv.h: Auto merged sql/mysqld.cc: Auto merged sql/net_serv.cc: Auto merged sql/opt_range.cc: Auto merged sql/set_var.cc: Auto merged sql/slave.cc: Auto merged sql/slave.h: Auto merged sql/sql_class.cc: Auto merged sql/sql_delete.cc: Auto merged sql/sql_insert.cc: Auto merged sql/sql_parse.cc: Auto merged sql/sql_select.cc: Auto merged sql/sql_table.cc: Auto merged sql/share/errmsg.txt: Auto merged sql/sql_test.cc: Auto merged sql/table.h: Auto merged sql/udf_example.c: Auto merged sql/uniques.cc: Auto merged sql/unireg.cc: Auto merged storage/csv/ha_tina.h: Auto merged storage/myisam/ft_boolean_search.c: Auto merged storage/myisam/ft_nlq_search.c: Auto merged storage/myisam/ft_parser.c: Auto merged storage/myisam/ft_stopwords.c: Auto merged storage/myisam/ha_myisam.cc: Auto merged storage/myisam/ha_myisam.h: Auto merged storage/myisam/mi_check.c: Auto merged storage/myisam/mi_create.c: Auto merged storage/myisam/mi_delete.c: Auto merged storage/myisam/mi_dynrec.c: Auto merged storage/myisam/mi_extra.c: Auto merged storage/myisam/mi_key.c: Auto merged storage/myisam/mi_locking.c: Auto merged storage/myisam/mi_log.c: Auto merged storage/myisam/mi_packrec.c: Auto merged storage/myisam/mi_rkey.c: Auto merged storage/myisam/mi_search.c: Auto merged storage/myisam/mi_test1.c: Auto merged storage/myisam/mi_test2.c: Auto merged storage/myisam/mi_update.c: Auto merged storage/myisam/mi_write.c: Auto merged storage/myisam/myisamchk.c: Auto merged storage/myisam/myisamlog.c: Auto merged storage/myisam/myisampack.c: Auto merged storage/myisam/sort.c: Auto merged storage/myisam/sp_test.c: Auto merged storage/myisammrg/ha_myisammrg.cc: Auto merged storage/myisammrg/ha_myisammrg.h: Auto merged storage/ndb/src/mgmapi/mgmapi.cpp: Auto merged support-files/compiler_warnings.supp: Auto merged client/mysqltest.c: Manual merge between mysql-5.1 and mysql-maria include/my_base.h: Manual merge between mysql-5.1 and mysql-maria include/my_global.h: Manual merge between mysql-5.1 and mysql-maria include/myisam.h: Manual merge between mysql-5.1 and mysql-maria libmysql/Makefile.shared: Manual merge between mysql-5.1 and mysql-maria mysql-test/r/events_logs_tests.result: Manual merge between mysql-5.1 and mysql-maria mysql-test/suite/rpl/r/rpl_row_flsh_tbls.result: Manual merge between mysql-5.1 and mysql-maria mysql-test/suite/rpl/t/rpl_row_flsh_tbls.test: Manual merge between mysql-5.1 and mysql-maria mysql-test/t/disabled.def: Manual merge between mysql-5.1 and mysql-maria mysql-test/t/events_logs_tests.test: Manual merge between mysql-5.1 and mysql-maria mysys/my_compress.c: Manual merge between mysql-5.1 and mysql-maria mysys/my_getsystime.c: Manual merge between mysql-5.1 and mysql-maria mysys/my_open.c: Manual merge between mysql-5.1 and mysql-maria sql/handler.cc: Manual merge between mysql-5.1 and mysql-maria sql/set_var.h: Manual merge between mysql-5.1 and mysql-maria sql/sql_class.h: Manual merge between mysql-5.1 and mysql-maria sql/sql_show.cc: Manual merge between mysql-5.1 and mysql-maria sql/sql_sort.h: Manual merge between mysql-5.1 and mysql-maria sql/sql_yacc.yy: Manual merge between mysql-5.1 and mysql-maria sql/table.cc: Manual merge between mysql-5.1 and mysql-maria storage/csv/ha_tina.cc: Manual merge between mysql-5.1 and mysql-maria storage/myisam/mi_open.c: Manual merge between mysql-5.1 and mysql-maria storage/myisam/myisamdef.h: Manual merge between mysql-5.1 and mysql-maria unittest/mysys/my_atomic-t.c: Manual merge between mysql-5.1 and mysql-maria
Diffstat (limited to 'storage')
-rw-r--r--storage/Makefile.am7
-rw-r--r--storage/csv/ha_tina.cc12
-rw-r--r--storage/csv/ha_tina.h4
-rw-r--r--storage/maria/CMakeLists.txt1
-rw-r--r--storage/maria/Makefile.am172
-rw-r--r--storage/maria/ft_maria.c48
-rw-r--r--storage/maria/ha_maria.cc2436
-rw-r--r--storage/maria/ha_maria.h151
-rw-r--r--storage/maria/lockman.c786
-rw-r--r--storage/maria/lockman.h76
-rw-r--r--storage/maria/ma_bitmap.c2077
-rw-r--r--storage/maria/ma_blockrec.c5279
-rw-r--r--storage/maria/ma_blockrec.h195
-rw-r--r--storage/maria/ma_cache.c107
-rw-r--r--storage/maria/ma_changed.c33
-rw-r--r--storage/maria/ma_check.c5633
-rw-r--r--storage/maria/ma_checkpoint.c1108
-rw-r--r--storage/maria/ma_checkpoint.h81
-rw-r--r--storage/maria/ma_checksum.c72
-rw-r--r--storage/maria/ma_close.c156
-rw-r--r--storage/maria/ma_commit.c124
-rw-r--r--storage/maria/ma_commit.h18
-rw-r--r--storage/maria/ma_control_file.c325
-rw-r--r--storage/maria/ma_control_file.h80
-rw-r--r--storage/maria/ma_create.c1279
-rw-r--r--storage/maria/ma_dbug.c193
-rw-r--r--storage/maria/ma_delete.c891
-rw-r--r--storage/maria/ma_delete_all.c161
-rw-r--r--storage/maria/ma_delete_table.c111
-rw-r--r--storage/maria/ma_dynrec.c1972
-rw-r--r--storage/maria/ma_extra.c623
-rw-r--r--storage/maria/ma_ft_boolean_search.c975
-rw-r--r--storage/maria/ma_ft_eval.c254
-rw-r--r--storage/maria/ma_ft_eval.h41
-rw-r--r--storage/maria/ma_ft_nlq_search.c374
-rw-r--r--storage/maria/ma_ft_parser.c426
-rw-r--r--storage/maria/ma_ft_stem.c18
-rw-r--r--storage/maria/ma_ft_test1.c317
-rw-r--r--storage/maria/ma_ft_test1.h420
-rw-r--r--storage/maria/ma_ft_update.c352
-rw-r--r--storage/maria/ma_ftdefs.h152
-rw-r--r--storage/maria/ma_fulltext.h27
-rw-r--r--storage/maria/ma_info.c141
-rw-r--r--storage/maria/ma_init.c67
-rw-r--r--storage/maria/ma_key.c569
-rw-r--r--storage/maria/ma_keycache.c163
-rw-r--r--storage/maria/ma_locking.c570
-rw-r--r--storage/maria/ma_loghandler.c6778
-rw-r--r--storage/maria/ma_loghandler.h364
-rw-r--r--storage/maria/ma_loghandler_lsn.h100
-rw-r--r--storage/maria/ma_open.c1577
-rw-r--r--storage/maria/ma_packrec.c1717
-rw-r--r--storage/maria/ma_page.c188
-rwxr-xr-xstorage/maria/ma_pagecache.c4197
-rw-r--r--storage/maria/ma_pagecache.h267
-rw-r--r--storage/maria/ma_pagecaches.c105
-rw-r--r--storage/maria/ma_panic.c134
-rw-r--r--storage/maria/ma_preload.c133
-rw-r--r--storage/maria/ma_range.c295
-rw-r--r--storage/maria/ma_recovery.c2249
-rw-r--r--storage/maria/ma_recovery.h30
-rw-r--r--storage/maria/ma_rename.c139
-rw-r--r--storage/maria/ma_rfirst.c26
-rw-r--r--storage/maria/ma_rkey.c178
-rw-r--r--storage/maria/ma_rlast.c26
-rw-r--r--storage/maria/ma_rnext.c122
-rw-r--r--storage/maria/ma_rnext_same.c107
-rw-r--r--storage/maria/ma_rprev.c88
-rw-r--r--storage/maria/ma_rrnd.c44
-rw-r--r--storage/maria/ma_rsame.c69
-rw-r--r--storage/maria/ma_rsamepos.c58
-rw-r--r--storage/maria/ma_rt_index.c1140
-rw-r--r--storage/maria/ma_rt_index.h49
-rw-r--r--storage/maria/ma_rt_key.c109
-rw-r--r--storage/maria/ma_rt_key.h32
-rw-r--r--storage/maria/ma_rt_mbr.c806
-rw-r--r--storage/maria/ma_rt_mbr.h38
-rw-r--r--storage/maria/ma_rt_split.c362
-rw-r--r--storage/maria/ma_rt_test.c473
-rw-r--r--storage/maria/ma_scan.c60
-rw-r--r--storage/maria/ma_search.c1934
-rw-r--r--storage/maria/ma_sort.c1058
-rw-r--r--storage/maria/ma_sp_defs.h47
-rw-r--r--storage/maria/ma_sp_key.c299
-rw-r--r--storage/maria/ma_sp_test.c568
-rw-r--r--storage/maria/ma_static.c79
-rw-r--r--storage/maria/ma_statrec.c294
-rw-r--r--storage/maria/ma_test1.c846
-rw-r--r--storage/maria/ma_test2.c1180
-rw-r--r--storage/maria/ma_test3.c500
-rw-r--r--storage/maria/ma_test_all.res62
-rwxr-xr-xstorage/maria/ma_test_all.sh245
-rwxr-xr-xstorage/maria/ma_test_recovery210
-rw-r--r--storage/maria/ma_test_recovery.expected1123
-rw-r--r--storage/maria/ma_unique.c235
-rw-r--r--storage/maria/ma_update.c250
-rw-r--r--storage/maria/ma_write.c1102
-rw-r--r--storage/maria/maria_chk.c1841
-rw-r--r--storage/maria/maria_def.h958
-rw-r--r--storage/maria/maria_ftdump.c279
-rw-r--r--storage/maria/maria_pack.c3227
-rw-r--r--storage/maria/maria_read_log.c200
-rwxr-xr-xstorage/maria/maria_rename.sh17
-rw-r--r--storage/maria/plug.in8
-rw-r--r--storage/maria/tablockman.c676
-rw-r--r--storage/maria/tablockman.h87
-rwxr-xr-xstorage/maria/test_pack10
-rw-r--r--storage/maria/trnman.c743
-rw-r--r--storage/maria/trnman.h59
-rw-r--r--storage/maria/trnman_public.h60
-rw-r--r--storage/maria/unittest/Makefile.am97
-rw-r--r--storage/maria/unittest/lockman-t.c309
-rw-r--r--storage/maria/unittest/lockman1-t.c335
-rw-r--r--storage/maria/unittest/lockman2-t.c361
-rw-r--r--storage/maria/unittest/ma_control_file-t.c447
-rw-r--r--storage/maria/unittest/ma_maria_log_cleanup.c45
-rw-r--r--storage/maria/unittest/ma_pagecache_consist.c459
-rw-r--r--storage/maria/unittest/ma_pagecache_single.c588
-rw-r--r--storage/maria/unittest/ma_test_loghandler-t.c617
-rw-r--r--storage/maria/unittest/ma_test_loghandler_first_lsn-t.c147
-rw-r--r--storage/maria/unittest/ma_test_loghandler_max_lsn-t.c140
-rw-r--r--storage/maria/unittest/ma_test_loghandler_multigroup-t.c641
-rw-r--r--storage/maria/unittest/ma_test_loghandler_multithread-t.c479
-rw-r--r--storage/maria/unittest/ma_test_loghandler_noflush-t.c132
-rw-r--r--storage/maria/unittest/ma_test_loghandler_pagecache-t.c159
-rw-r--r--storage/maria/unittest/ma_test_loghandler_purge-t.c176
-rw-r--r--storage/maria/unittest/test_file.c68
-rw-r--r--storage/maria/unittest/test_file.h14
-rw-r--r--storage/maria/unittest/trnman-t.c194
-rw-r--r--storage/myisam/Makefile.am4
-rw-r--r--storage/myisam/ft_boolean_search.c11
-rw-r--r--storage/myisam/ft_eval.c2
-rw-r--r--storage/myisam/ft_myisam.c36
-rw-r--r--storage/myisam/ft_nlq_search.c2
-rw-r--r--storage/myisam/ft_parser.c2
-rw-r--r--storage/myisam/ft_static.c14
-rw-r--r--storage/myisam/ft_stopwords.c19
-rw-r--r--storage/myisam/ft_test1.c4
-rw-r--r--storage/myisam/ft_update.c4
-rw-r--r--storage/myisam/fulltext.h10
-rw-r--r--storage/myisam/ha_myisam.cc45
-rw-r--r--storage/myisam/ha_myisam.h7
-rw-r--r--storage/myisam/mi_check.c169
-rw-r--r--storage/myisam/mi_checksum.c4
-rw-r--r--storage/myisam/mi_close.c1
-rw-r--r--storage/myisam/mi_create.c46
-rw-r--r--storage/myisam/mi_dbug.c1
-rw-r--r--storage/myisam/mi_delete.c14
-rw-r--r--storage/myisam/mi_dynrec.c18
-rw-r--r--storage/myisam/mi_extra.c7
-rw-r--r--storage/myisam/mi_key.c2
-rw-r--r--storage/myisam/mi_locking.c52
-rw-r--r--storage/myisam/mi_log.c2
-rw-r--r--storage/myisam/mi_open.c17
-rw-r--r--storage/myisam/mi_packrec.c7
-rw-r--r--storage/myisam/mi_range.c2
-rw-r--r--storage/myisam/mi_rkey.c2
-rw-r--r--storage/myisam/mi_search.c10
-rw-r--r--storage/myisam/mi_test1.c8
-rw-r--r--storage/myisam/mi_test2.c8
-rw-r--r--storage/myisam/mi_unique.c2
-rw-r--r--storage/myisam/mi_update.c2
-rw-r--r--storage/myisam/mi_write.c12
-rw-r--r--storage/myisam/myisamchk.c34
-rw-r--r--storage/myisam/myisamdef.h874
-rw-r--r--storage/myisam/myisamlog.c2
-rw-r--r--storage/myisam/myisampack.c8
-rw-r--r--storage/myisam/rt_index.c4
-rw-r--r--storage/myisam/sort.c14
-rw-r--r--storage/myisam/sp_test.c2
-rw-r--r--storage/myisammrg/ha_myisammrg.cc9
-rw-r--r--storage/myisammrg/ha_myisammrg.h4
172 files changed, 75489 insertions, 750 deletions
diff --git a/storage/Makefile.am b/storage/Makefile.am
index b978453d29d..eec499aabf4 100644
--- a/storage/Makefile.am
+++ b/storage/Makefile.am
@@ -19,7 +19,12 @@ AUTOMAKE_OPTIONS = foreign
# These are built from source in the Docs directory
EXTRA_DIST =
-SUBDIRS = @mysql_se_dirs@
+# Until we remove fulltext-related references from Maria to MyISAM
+# MyISAM must be built before Maria, which is not the case by default
+# because of alphabetical order
+# So we put myisam first; this is very ugly regarding plugins' logic
+# but it works, and we'll remove it soon.
+SUBDIRS = myisam @mysql_se_dirs@
# Don't update the files from bitkeeper
%::SCCS/s.%
diff --git a/storage/csv/ha_tina.cc b/storage/csv/ha_tina.cc
index 9a7781e017d..394d00d20b7 100644
--- a/storage/csv/ha_tina.cc
+++ b/storage/csv/ha_tina.cc
@@ -445,7 +445,7 @@ ha_tina::ha_tina(handlerton *hton, TABLE_SHARE *table_arg)
*/
current_position(0), next_position(0), local_saved_data_file_length(0),
file_buff(0), chain_alloced(0), chain_size(DEFAULT_CHAIN_LENGTH),
- local_data_file_version(0), records_is_known(0)
+ local_data_file_version(0), records_is_known(0), curr_lock_type(F_UNLCK)
{
/* Set our original buffers from pre-allocated memory */
buffer.set((char*)byte_buffer, IO_SIZE, &my_charset_bin);
@@ -1454,6 +1454,14 @@ int ha_tina::delete_all_rows()
DBUG_RETURN(rc);
}
+int ha_tina::external_lock(THD *thd __attribute__((unused)), int lock_type)
+{
+ if (lock_type==F_UNLCK && curr_lock_type == F_WRLCK)
+ update_status();
+ curr_lock_type= lock_type;
+ return 0;
+}
+
/*
Called by the database to lock the table. Keep in mind that this
is an internal lock.
@@ -1468,7 +1476,7 @@ THR_LOCK_DATA **ha_tina::store_lock(THD *thd,
return to;
}
-/*
+/*
Create a table. You do not want to leave the table open after a call to
this (the database will call ::open() if it needs to).
*/
diff --git a/storage/csv/ha_tina.h b/storage/csv/ha_tina.h
index 5ce09783b9b..9a9c2399745 100644
--- a/storage/csv/ha_tina.h
+++ b/storage/csv/ha_tina.h
@@ -84,6 +84,8 @@ class ha_tina: public handler
bool records_is_known;
private:
+ int curr_lock_type;
+
bool get_write_pos(off_t *end_pos, tina_set *closest_hole);
int open_update_temp_file_if_needed();
int init_tina_writer();
@@ -154,6 +156,8 @@ public:
bool check_if_incompatible_data(HA_CREATE_INFO *info,
uint table_changes);
+ int external_lock(THD *thd, int lock_type);
+
THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to,
enum thr_lock_type lock_type);
diff --git a/storage/maria/CMakeLists.txt b/storage/maria/CMakeLists.txt
new file mode 100644
index 00000000000..cfe23054e2f
--- /dev/null
+++ b/storage/maria/CMakeLists.txt
@@ -0,0 +1 @@
+# empty for the moment; will fill it when we build under Windows
diff --git a/storage/maria/Makefile.am b/storage/maria/Makefile.am
new file mode 100644
index 00000000000..2bd9b7db922
--- /dev/null
+++ b/storage/maria/Makefile.am
@@ -0,0 +1,172 @@
+# Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+MYSQLDATAdir = $(localstatedir)
+MYSQLSHAREdir = $(pkgdatadir)
+MYSQLBASEdir= $(prefix)
+MYSQLLIBdir= $(pkglibdir)
+INCLUDES = -I$(top_srcdir)/include -I$(top_builddir)/include \
+ -I$(top_srcdir)/regex \
+ -I$(top_srcdir)/sql \
+ -I$(srcdir)
+WRAPLIBS=
+
+LDADD =
+
+DEFS = @DEFS@
+
+# "." is needed first because tests in unittest need libmaria
+SUBDIRS = . unittest
+
+EXTRA_DIST = ma_test_all.sh ma_test_all.res ma_ft_stem.c CMakeLists.txt plug.in ma_test_recovery
+pkgdata_DATA = ma_test_all ma_test_all.res ma_test_recovery
+pkglib_LIBRARIES = libmaria.a
+bin_PROGRAMS = maria_chk maria_pack maria_ftdump maria_read_log
+maria_chk_DEPENDENCIES= $(LIBRARIES)
+# Only reason to link with libmyisam.a here is that it's where some fulltext
+# pieces are (but soon we'll remove fulltext dependencies from Maria).
+# For now, it imposes that storage/myisam be built before storage/maria.
+maria_chk_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+maria_pack_DEPENDENCIES=$(LIBRARIES)
+maria_pack_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+maria_read_log_DEPENDENCIES=$(LIBRARIES)
+maria_read_log_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+noinst_PROGRAMS = ma_test1 ma_test2 ma_test3 ma_rt_test ma_sp_test
+noinst_HEADERS = maria_def.h ma_rt_index.h ma_rt_key.h ma_rt_mbr.h \
+ ma_sp_defs.h ma_fulltext.h ma_ftdefs.h ma_ft_test1.h \
+ ma_ft_eval.h trnman.h lockman.h tablockman.h \
+ ma_control_file.h ha_maria.h ma_blockrec.h \
+ ma_loghandler.h ma_loghandler_lsn.h ma_pagecache.h \
+ ma_checkpoint.h ma_recovery.h ma_commit.h \
+ trnman_public.h
+ma_test1_DEPENDENCIES= $(LIBRARIES)
+ma_test1_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+ma_test2_DEPENDENCIES= $(LIBRARIES)
+ma_test2_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+ma_test3_DEPENDENCIES= $(LIBRARIES)
+ma_test3_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+#ma_ft_test1_DEPENDENCIES= $(LIBRARIES)
+#ma_ft_eval_DEPENDENCIES= $(LIBRARIES)
+maria_ftdump_DEPENDENCIES= $(LIBRARIES)
+maria_ftdump_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+ma_rt_test_DEPENDENCIES= $(LIBRARIES)
+ma_rt_test_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+ma_sp_test_DEPENDENCIES= $(LIBRARIES)
+ma_sp_test_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+libmaria_a_SOURCES = ma_init.c ma_open.c ma_extra.c ma_info.c ma_rkey.c \
+ ma_rnext.c ma_rnext_same.c \
+ ma_search.c ma_page.c ma_key.c ma_locking.c \
+ ma_rrnd.c ma_scan.c ma_cache.c \
+ ma_statrec.c ma_packrec.c ma_dynrec.c \
+ ma_blockrec.c ma_bitmap.c \
+ ma_update.c ma_write.c ma_unique.c \
+ ma_delete.c \
+ ma_rprev.c ma_rfirst.c ma_rlast.c ma_rsame.c \
+ ma_rsamepos.c ma_panic.c ma_close.c ma_create.c\
+ ma_range.c ma_dbug.c ma_checksum.c \
+ ma_changed.c ma_static.c ma_delete_all.c \
+ ma_delete_table.c ma_rename.c ma_check.c \
+ ma_keycache.c ma_preload.c ma_ft_parser.c \
+ ma_ft_update.c ma_ft_boolean_search.c \
+ ma_ft_nlq_search.c ft_maria.c ma_sort.c \
+ ha_maria.cc trnman.c lockman.c tablockman.c \
+ ma_rt_index.c ma_rt_key.c ma_rt_mbr.c ma_rt_split.c \
+ ma_sp_key.c ma_control_file.c ma_loghandler.c \
+ ma_pagecache.c ma_pagecaches.c \
+ ma_checkpoint.c ma_recovery.c ma_commit.c
+CLEANFILES = test?.MA? FT?.MA? isam.log ma_test_all ma_rt_test.MA? sp_test.MA?
+
+SUFFIXES = .sh
+
+.sh:
+ @RM@ -f $@ $@-t
+ @SED@ \
+ -e 's!@''bindir''@!$(bindir)!g' \
+ -e 's!@''scriptdir''@!$(bindir)!g' \
+ -e 's!@''prefix''@!$(prefix)!g' \
+ -e 's!@''datadir''@!$(datadir)!g' \
+ -e 's!@''localstatedir''@!$(localstatedir)!g' \
+ -e 's!@''libexecdir''@!$(libexecdir)!g' \
+ -e 's!@''CC''@!@CC@!'\
+ -e 's!@''CXX''@!@CXX@!'\
+ -e 's!@''GXX''@!@GXX@!'\
+ -e 's!@''PERL''@!@PERL@!' \
+ -e 's!@''CFLAGS''@!@SAVE_CFLAGS@!'\
+ -e 's!@''CXXFLAGS''@!@SAVE_CXXFLAGS@!'\
+ -e 's!@''LDFLAGS''@!@SAVE_LDFLAGS@!'\
+ -e 's!@''VERSION''@!@VERSION@!' \
+ -e 's!@''MYSQL_SERVER_SUFFIX''@!@MYSQL_SERVER_SUFFIX@!' \
+ -e 's!@''COMPILATION_COMMENT''@!@COMPILATION_COMMENT@!' \
+ -e 's!@''MACHINE_TYPE''@!@MACHINE_TYPE@!' \
+ -e 's!@''HOSTNAME''@!@HOSTNAME@!' \
+ -e 's!@''SYSTEM_TYPE''@!@SYSTEM_TYPE@!' \
+ -e 's!@''CHECK_PID''@!@CHECK_PID@!' \
+ -e 's!@''FIND_PROC''@!@FIND_PROC@!' \
+ -e 's!@''MYSQLD_DEFAULT_SWITCHES''@!@MYSQLD_DEFAULT_SWITCHES@!' \
+ -e 's!@''MYSQL_UNIX_ADDR''@!@MYSQL_UNIX_ADDR@!' \
+ -e 's!@''TARGET_LINUX''@!@TARGET_LINUX@!' \
+ -e "s!@""CONF_COMMAND""@!@CONF_COMMAND@!" \
+ -e 's!@''MYSQLD_USER''@!@MYSQLD_USER@!' \
+ -e 's!@''sysconfdir''@!@sysconfdir@!' \
+ -e 's!@''SHORT_MYSQL_INTRO''@!@SHORT_MYSQL_INTRO@!' \
+ -e 's!@''SHARED_LIB_VERSION''@!@SHARED_LIB_VERSION@!' \
+ -e 's!@''MYSQL_BASE_VERSION''@!@MYSQL_BASE_VERSION@!' \
+ -e 's!@''MYSQL_NO_DASH_VERSION''@!@MYSQL_NO_DASH_VERSION@!' \
+ -e 's!@''MYSQL_TCP_PORT''@!@MYSQL_TCP_PORT@!' \
+ -e 's!@''PERL_DBI_VERSION''@!@PERL_DBI_VERSION@!' \
+ -e 's!@''PERL_DBD_VERSION''@!@PERL_DBD_VERSION@!' \
+ -e 's!@''PERL_DATA_DUMPER''@!@PERL_DATA_DUMPER@!' \
+ $< > $@-t
+ @CHMOD@ +x $@-t
+ @MV@ $@-t $@
+
+# Don't update the files from bitkeeper
+%::SCCS/s.%
diff --git a/storage/maria/ft_maria.c b/storage/maria/ft_maria.c
new file mode 100644
index 00000000000..1b082f904d0
--- /dev/null
+++ b/storage/maria/ft_maria.c
@@ -0,0 +1,48 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/*
+ This function is for interface functions between fulltext and maria
+*/
+
+#include "ma_ftdefs.h"
+
+FT_INFO *maria_ft_init_search(uint flags, void *info, uint keynr,
+ uchar *query, uint query_len, CHARSET_INFO *cs,
+ uchar *record)
+{
+ FT_INFO *res;
+ if (flags & FT_BOOL)
+ res= maria_ft_init_boolean_search((MARIA_HA *) info, keynr, query,
+ query_len, cs);
+ else
+ res= maria_ft_init_nlq_search((MARIA_HA *) info, keynr, query, query_len,
+ flags, record);
+ return res;
+}
+
+const struct _ft_vft _ma_ft_vft_nlq = {
+ maria_ft_nlq_read_next, maria_ft_nlq_find_relevance,
+ maria_ft_nlq_close_search, maria_ft_nlq_get_relevance,
+ maria_ft_nlq_reinit_search
+};
+const struct _ft_vft _ma_ft_vft_boolean = {
+ maria_ft_boolean_read_next, maria_ft_boolean_find_relevance,
+ maria_ft_boolean_close_search, maria_ft_boolean_get_relevance,
+ maria_ft_boolean_reinit_search
+};
+
diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc
new file mode 100644
index 00000000000..678b88063db
--- /dev/null
+++ b/storage/maria/ha_maria.cc
@@ -0,0 +1,2436 @@
+/* Copyright (C) 2006,2004 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation // gcc: Class implementation
+#endif
+
+#define MYSQL_SERVER 1
+#include "mysql_priv.h"
+#include <mysql/plugin.h>
+#include <m_ctype.h>
+#include <myisampack.h>
+#include <my_bit.h>
+#include "ha_maria.h"
+#include "trnman_public.h"
+
+C_MODE_START
+#include "maria_def.h"
+#include "ma_rt_index.h"
+#include "ma_blockrec.h"
+#include "ma_checkpoint.h"
+#include "ma_recovery.h"
+C_MODE_END
+
+/*
+ Note that in future versions, only *transactional* Maria tables can
+ rollback, so this flag should be up or down conditionally.
+*/
+#define MARIA_CANNOT_ROLLBACK HA_NO_TRANSACTIONS
+#ifdef MARIA_CANNOT_ROLLBACK
+#define trans_register_ha(A, B, C) do { /* nothing */ } while(0)
+#endif
+
+/**
+ @todo For now there is no way for a user to set a different value of
+ maria_recover_options, i.e. auto-check-and-repair is always disabled.
+ We could enable it. As the auto-repair is initiated when opened from the
+ SQL layer (open_unireg_entry(), check_and_repair()), it does not happen
+ when Maria's Recovery internally opens the table to apply log records to
+ it, which is good. It would happen only after Recovery, if the table is
+ still corrupted.
+*/
+ulong maria_recover_options= HA_RECOVER_NONE;
+static handlerton *maria_hton;
+
+/* bits in maria_recover_options */
+const char *maria_recover_names[]=
+{
+ "DEFAULT", "BACKUP", "FORCE", "QUICK", NullS
+};
+TYPELIB maria_recover_typelib=
+{
+ array_elements(maria_recover_names) - 1, "",
+ maria_recover_names, NULL
+};
+
+const char *maria_stats_method_names[]=
+{
+ "nulls_unequal", "nulls_equal",
+ "nulls_ignored", NullS
+};
+TYPELIB maria_stats_method_typelib=
+{
+ array_elements(maria_stats_method_names) - 1, "",
+ maria_stats_method_names, NULL
+};
+
+
+/*****************************************************************************
+** MARIA tables
+*****************************************************************************/
+
+static handler *maria_create_handler(handlerton *hton,
+ TABLE_SHARE * table,
+ MEM_ROOT *mem_root)
+{
+ return new (mem_root) ha_maria(hton, table);
+}
+
+
+// collect errors printed by maria_check routines
+
+static void _ma_check_print_msg(HA_CHECK *param, const char *msg_type,
+ const char *fmt, va_list args)
+{
+ THD *thd= (THD *) param->thd;
+ Protocol *protocol= thd->protocol;
+ uint length, msg_length;
+ char msgbuf[MARIA_MAX_MSG_BUF];
+ char name[NAME_LEN * 2 + 2];
+
+ msg_length= my_vsnprintf(msgbuf, sizeof(msgbuf), fmt, args);
+ msgbuf[sizeof(msgbuf) - 1]= 0; // healthy paranoia
+
+ DBUG_PRINT(msg_type, ("message: %s", msgbuf));
+
+ if (!thd->vio_ok())
+ {
+ sql_print_error(msgbuf);
+ return;
+ }
+
+ if (param->testflag &
+ (T_CREATE_MISSING_KEYS | T_SAFE_REPAIR | T_AUTO_REPAIR))
+ {
+ my_message(ER_NOT_KEYFILE, msgbuf, MYF(MY_WME));
+ return;
+ }
+ length= (uint) (strxmov(name, param->db_name, ".", param->table_name,
+ NullS) - name);
+ /*
+ TODO: switch from protocol to push_warning here. The main reason we didn't
+ it yet is parallel repair. Due to following trace:
+ ma_check_print_msg/push_warning/sql_alloc/my_pthread_getspecific_ptr.
+
+ Also we likely need to lock mutex here (in both cases with protocol and
+ push_warning).
+ */
+ protocol->prepare_for_resend();
+ protocol->store(name, length, system_charset_info);
+ protocol->store(param->op_name, system_charset_info);
+ protocol->store(msg_type, system_charset_info);
+ protocol->store(msgbuf, msg_length, system_charset_info);
+ if (protocol->write())
+ sql_print_error("Failed on my_net_write, writing to stderr instead: %s\n",
+ msgbuf);
+ return;
+}
+
+
+/*
+ Convert TABLE object to Maria key and column definition
+
+ SYNOPSIS
+ table2maria()
+ table_arg in TABLE object.
+ keydef_out out Maria key definition.
+ recinfo_out out Maria column definition.
+ records_out out Number of fields.
+
+ DESCRIPTION
+ This function will allocate and initialize Maria key and column
+ definition for further use in ma_create or for a check for underlying
+ table conformance in merge engine.
+
+ RETURN VALUE
+ 0 OK
+ # error code
+*/
+
+int table2maria(TABLE *table_arg, MARIA_KEYDEF **keydef_out,
+ MARIA_COLUMNDEF **recinfo_out, uint *records_out)
+{
+ uint i, j, recpos, minpos, fieldpos, temp_length, length;
+ enum ha_base_keytype type= HA_KEYTYPE_BINARY;
+ uchar *record;
+ KEY *pos;
+ MARIA_KEYDEF *keydef;
+ MARIA_COLUMNDEF *recinfo, *recinfo_pos;
+ HA_KEYSEG *keyseg;
+ TABLE_SHARE *share= table_arg->s;
+ uint options= share->db_options_in_use;
+ DBUG_ENTER("table2maria");
+
+ if (!(my_multi_malloc(MYF(MY_WME),
+ recinfo_out, (share->fields * 2 + 2) * sizeof(MARIA_COLUMNDEF),
+ keydef_out, share->keys * sizeof(MARIA_KEYDEF),
+ &keyseg,
+ (share->key_parts + share->keys) * sizeof(HA_KEYSEG),
+ NullS)))
+ DBUG_RETURN(HA_ERR_OUT_OF_MEM); /* purecov: inspected */
+ keydef= *keydef_out;
+ recinfo= *recinfo_out;
+ pos= table_arg->key_info;
+ for (i= 0; i < share->keys; i++, pos++)
+ {
+ keydef[i].flag= (pos->flags & (HA_NOSAME | HA_FULLTEXT | HA_SPATIAL));
+ keydef[i].key_alg= pos->algorithm == HA_KEY_ALG_UNDEF ?
+ (pos->flags & HA_SPATIAL ? HA_KEY_ALG_RTREE : HA_KEY_ALG_BTREE) :
+ pos->algorithm;
+ keydef[i].block_length= pos->block_size;
+ keydef[i].seg= keyseg;
+ keydef[i].keysegs= pos->key_parts;
+ for (j= 0; j < pos->key_parts; j++)
+ {
+ Field *field= pos->key_part[j].field;
+ type= field->key_type();
+ keydef[i].seg[j].flag= pos->key_part[j].key_part_flag;
+
+ if (options & HA_OPTION_PACK_KEYS ||
+ (pos->flags & (HA_PACK_KEY | HA_BINARY_PACK_KEY |
+ HA_SPACE_PACK_USED)))
+ {
+ if (pos->key_part[j].length > 8 &&
+ (type == HA_KEYTYPE_TEXT ||
+ type == HA_KEYTYPE_NUM ||
+ (type == HA_KEYTYPE_BINARY && !field->zero_pack())))
+ {
+ /* No blobs here */
+ if (j == 0)
+ keydef[i].flag|= HA_PACK_KEY;
+ if (!(field->flags & ZEROFILL_FLAG) &&
+ (field->type() == MYSQL_TYPE_STRING ||
+ field->type() == MYSQL_TYPE_VAR_STRING ||
+ ((int) (pos->key_part[j].length - field->decimals())) >= 4))
+ keydef[i].seg[j].flag|= HA_SPACE_PACK;
+ }
+ else if (j == 0 && (!(pos->flags & HA_NOSAME) || pos->key_length > 16))
+ keydef[i].flag|= HA_BINARY_PACK_KEY;
+ }
+ keydef[i].seg[j].type= (int) type;
+ keydef[i].seg[j].start= pos->key_part[j].offset;
+ keydef[i].seg[j].length= pos->key_part[j].length;
+ keydef[i].seg[j].bit_start= keydef[i].seg[j].bit_end=
+ keydef[i].seg[j].bit_length= 0;
+ keydef[i].seg[j].bit_pos= 0;
+ keydef[i].seg[j].language= field->charset()->number;
+
+ if (field->null_ptr)
+ {
+ keydef[i].seg[j].null_bit= field->null_bit;
+ keydef[i].seg[j].null_pos= (uint) (field->null_ptr-
+ (uchar*) table_arg->record[0]);
+ }
+ else
+ {
+ keydef[i].seg[j].null_bit= 0;
+ keydef[i].seg[j].null_pos= 0;
+ }
+ if (field->type() == MYSQL_TYPE_BLOB ||
+ field->type() == MYSQL_TYPE_GEOMETRY)
+ {
+ keydef[i].seg[j].flag|= HA_BLOB_PART;
+ /* save number of bytes used to pack length */
+ keydef[i].seg[j].bit_start= (uint) (field->pack_length() -
+ share->blob_ptr_size);
+ }
+ else if (field->type() == MYSQL_TYPE_BIT)
+ {
+ keydef[i].seg[j].bit_length= ((Field_bit *) field)->bit_len;
+ keydef[i].seg[j].bit_start= ((Field_bit *) field)->bit_ofs;
+ keydef[i].seg[j].bit_pos= (uint) (((Field_bit *) field)->bit_ptr -
+ (uchar*) table_arg->record[0]);
+ }
+ }
+ keyseg+= pos->key_parts;
+ }
+ if (table_arg->found_next_number_field)
+ keydef[share->next_number_index].flag|= HA_AUTO_KEY;
+ record= table_arg->record[0];
+ recpos= 0;
+ recinfo_pos= recinfo;
+ while (recpos < (uint) share->reclength)
+ {
+ Field **field, *found= 0;
+ minpos= share->reclength;
+ length= 0;
+
+ for (field= table_arg->field; *field; field++)
+ {
+ if ((fieldpos= (*field)->offset(record)) >= recpos &&
+ fieldpos <= minpos)
+ {
+ /* skip null fields */
+ if (!(temp_length= (*field)->pack_length_in_rec()))
+ continue; /* Skip null-fields */
+ if (! found || fieldpos < minpos ||
+ (fieldpos == minpos && temp_length < length))
+ {
+ minpos= fieldpos;
+ found= *field;
+ length= temp_length;
+ }
+ }
+ }
+ DBUG_PRINT("loop", ("found: 0x%lx recpos: %d minpos: %d length: %d",
+ (long) found, recpos, minpos, length));
+ if (recpos != minpos)
+ { // Reserved space (Null bits?)
+ bzero((char*) recinfo_pos, sizeof(*recinfo_pos));
+ recinfo_pos->type= FIELD_NORMAL;
+ recinfo_pos++->length= (uint16) (minpos - recpos);
+ }
+ if (!found)
+ break;
+
+ if (found->flags & BLOB_FLAG)
+ recinfo_pos->type= FIELD_BLOB;
+ else if (found->type() == MYSQL_TYPE_VARCHAR)
+ recinfo_pos->type= FIELD_VARCHAR;
+ else if (!(options & HA_OPTION_PACK_RECORD) ||
+ (found->zero_pack() && (found->flags & PRI_KEY_FLAG)))
+ recinfo_pos->type= FIELD_NORMAL;
+ else if (found->zero_pack())
+ recinfo_pos->type= FIELD_SKIP_ZERO;
+ else
+ recinfo_pos->type= ((length <= 3 ||
+ (found->flags & ZEROFILL_FLAG)) ?
+ FIELD_NORMAL :
+ found->type() == MYSQL_TYPE_STRING ||
+ found->type() == MYSQL_TYPE_VAR_STRING ?
+ FIELD_SKIP_ENDSPACE :
+ FIELD_SKIP_PRESPACE);
+ if (found->null_ptr)
+ {
+ recinfo_pos->null_bit= found->null_bit;
+ recinfo_pos->null_pos= (uint) (found->null_ptr -
+ (uchar*) table_arg->record[0]);
+ }
+ else
+ {
+ recinfo_pos->null_bit= 0;
+ recinfo_pos->null_pos= 0;
+ }
+ (recinfo_pos++)->length= (uint16) length;
+ recpos= minpos + length;
+ DBUG_PRINT("loop", ("length: %d type: %d",
+ recinfo_pos[-1].length,recinfo_pos[-1].type));
+ }
+ *records_out= (uint) (recinfo_pos - recinfo);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Check for underlying table conformance
+
+ SYNOPSIS
+ maria_check_definition()
+ t1_keyinfo in First table key definition
+ t1_recinfo in First table record definition
+ t1_keys in Number of keys in first table
+ t1_recs in Number of records in first table
+ t2_keyinfo in Second table key definition
+ t2_recinfo in Second table record definition
+ t2_keys in Number of keys in second table
+ t2_recs in Number of records in second table
+ strict in Strict check switch
+
+ DESCRIPTION
+ This function compares two Maria definitions. By intention it was done
+ to compare merge table definition against underlying table definition.
+ It may also be used to compare dot-frm and MAI definitions of Maria
+ table as well to compare different Maria table definitions.
+
+ For merge table it is not required that number of keys in merge table
+ must exactly match number of keys in underlying table. When calling this
+ function for underlying table conformance check, 'strict' flag must be
+ set to false, and converted merge definition must be passed as t1_*.
+
+ Otherwise 'strict' flag must be set to 1 and it is not required to pass
+ converted dot-frm definition as t1_*.
+
+ RETURN VALUE
+ 0 - Equal definitions.
+ 1 - Different definitions.
+
+ TODO
+ - compare FULLTEXT keys;
+ - compare SPATIAL keys;
+ - compare FIELD_SKIP_ZERO which is converted to FIELD_NORMAL correctly
+ (should be corretly detected in table2maria).
+*/
+int maria_check_definition(MARIA_KEYDEF *t1_keyinfo,
+ MARIA_COLUMNDEF *t1_recinfo,
+ uint t1_keys, uint t1_recs,
+ MARIA_KEYDEF *t2_keyinfo,
+ MARIA_COLUMNDEF *t2_recinfo,
+ uint t2_keys, uint t2_recs, bool strict)
+{
+ uint i, j;
+ DBUG_ENTER("maria_check_definition");
+ if ((strict ? t1_keys != t2_keys : t1_keys > t2_keys))
+ {
+ DBUG_PRINT("error", ("Number of keys differs: t1_keys=%u, t2_keys=%u",
+ t1_keys, t2_keys));
+ DBUG_RETURN(1);
+ }
+ if (t1_recs != t2_recs)
+ {
+ DBUG_PRINT("error", ("Number of recs differs: t1_recs=%u, t2_recs=%u",
+ t1_recs, t2_recs));
+ DBUG_RETURN(1);
+ }
+ for (i= 0; i < t1_keys; i++)
+ {
+ HA_KEYSEG *t1_keysegs= t1_keyinfo[i].seg;
+ HA_KEYSEG *t2_keysegs= t2_keyinfo[i].seg;
+ if (t1_keyinfo[i].flag & HA_FULLTEXT && t2_keyinfo[i].flag & HA_FULLTEXT)
+ continue;
+ else if (t1_keyinfo[i].flag & HA_FULLTEXT ||
+ t2_keyinfo[i].flag & HA_FULLTEXT)
+ {
+ DBUG_PRINT("error", ("Key %d has different definition", i));
+ DBUG_PRINT("error", ("t1_fulltext= %d, t2_fulltext=%d",
+ test(t1_keyinfo[i].flag & HA_FULLTEXT),
+ test(t2_keyinfo[i].flag & HA_FULLTEXT)));
+ DBUG_RETURN(1);
+ }
+ if (t1_keyinfo[i].flag & HA_SPATIAL && t2_keyinfo[i].flag & HA_SPATIAL)
+ continue;
+ else if (t1_keyinfo[i].flag & HA_SPATIAL ||
+ t2_keyinfo[i].flag & HA_SPATIAL)
+ {
+ DBUG_PRINT("error", ("Key %d has different definition", i));
+ DBUG_PRINT("error", ("t1_spatial= %d, t2_spatial=%d",
+ test(t1_keyinfo[i].flag & HA_SPATIAL),
+ test(t2_keyinfo[i].flag & HA_SPATIAL)));
+ DBUG_RETURN(1);
+ }
+ if (t1_keyinfo[i].keysegs != t2_keyinfo[i].keysegs ||
+ t1_keyinfo[i].key_alg != t2_keyinfo[i].key_alg)
+ {
+ DBUG_PRINT("error", ("Key %d has different definition", i));
+ DBUG_PRINT("error", ("t1_keysegs=%d, t1_key_alg=%d",
+ t1_keyinfo[i].keysegs, t1_keyinfo[i].key_alg));
+ DBUG_PRINT("error", ("t2_keysegs=%d, t2_key_alg=%d",
+ t2_keyinfo[i].keysegs, t2_keyinfo[i].key_alg));
+ DBUG_RETURN(1);
+ }
+ for (j= t1_keyinfo[i].keysegs; j--;)
+ {
+ if (t1_keysegs[j].type != t2_keysegs[j].type ||
+ t1_keysegs[j].language != t2_keysegs[j].language ||
+ t1_keysegs[j].null_bit != t2_keysegs[j].null_bit ||
+ t1_keysegs[j].length != t2_keysegs[j].length)
+ {
+ DBUG_PRINT("error", ("Key segment %d (key %d) has different "
+ "definition", j, i));
+ DBUG_PRINT("error", ("t1_type=%d, t1_language=%d, t1_null_bit=%d, "
+ "t1_length=%d",
+ t1_keysegs[j].type, t1_keysegs[j].language,
+ t1_keysegs[j].null_bit, t1_keysegs[j].length));
+ DBUG_PRINT("error", ("t2_type=%d, t2_language=%d, t2_null_bit=%d, "
+ "t2_length=%d",
+ t2_keysegs[j].type, t2_keysegs[j].language,
+ t2_keysegs[j].null_bit, t2_keysegs[j].length));
+
+ DBUG_RETURN(1);
+ }
+ }
+ }
+ for (i= 0; i < t1_recs; i++)
+ {
+ MARIA_COLUMNDEF *t1_rec= &t1_recinfo[i];
+ MARIA_COLUMNDEF *t2_rec= &t2_recinfo[i];
+ /*
+ FIELD_SKIP_ZERO can be changed to FIELD_NORMAL in maria_create,
+ see NOTE1 in ma_create.c
+ */
+ if ((t1_rec->type != t2_rec->type &&
+ !(t1_rec->type == (int) FIELD_SKIP_ZERO &&
+ t1_rec->length == 1 &&
+ t2_rec->type == (int) FIELD_NORMAL)) ||
+ t1_rec->length != t2_rec->length ||
+ t1_rec->null_bit != t2_rec->null_bit)
+ {
+ DBUG_PRINT("error", ("Field %d has different definition", i));
+ DBUG_PRINT("error", ("t1_type=%d, t1_length=%d, t1_null_bit=%d",
+ t1_rec->type, t1_rec->length, t1_rec->null_bit));
+ DBUG_PRINT("error", ("t2_type=%d, t2_length=%d, t2_null_bit=%d",
+ t2_rec->type, t2_rec->length, t2_rec->null_bit));
+ DBUG_RETURN(1);
+ }
+ }
+ DBUG_RETURN(0);
+}
+
+
+extern "C" {
+
+volatile int *_ma_killed_ptr(HA_CHECK *param)
+{
+ /* In theory Unsafe conversion, but should be ok for now */
+ return (int*) &(((THD *) (param->thd))->killed);
+}
+
+
+void _ma_check_print_error(HA_CHECK *param, const char *fmt, ...)
+{
+ va_list args;
+ DBUG_ENTER("_ma_check_print_error");
+ param->error_printed |= 1;
+ param->out_flag |= O_DATA_LOST;
+ va_start(args, fmt);
+ _ma_check_print_msg(param, "error", fmt, args);
+ va_end(args);
+ DBUG_VOID_RETURN;
+}
+
+
+void _ma_check_print_info(HA_CHECK *param, const char *fmt, ...)
+{
+ va_list args;
+ DBUG_ENTER("_ma_check_print_info");
+ va_start(args, fmt);
+ _ma_check_print_msg(param, "info", fmt, args);
+ va_end(args);
+ DBUG_VOID_RETURN;
+}
+
+
+void _ma_check_print_warning(HA_CHECK *param, const char *fmt, ...)
+{
+ va_list args;
+ DBUG_ENTER("_ma_check_print_warning");
+ param->warning_printed= 1;
+ param->out_flag |= O_DATA_LOST;
+ va_start(args, fmt);
+ _ma_check_print_msg(param, "warning", fmt, args);
+ va_end(args);
+ DBUG_VOID_RETURN;
+}
+
+}
+
+
+ha_maria::ha_maria(handlerton *hton, TABLE_SHARE *table_arg):
+handler(hton, table_arg), file(0),
+int_table_flags(HA_NULL_IN_KEY | HA_CAN_FULLTEXT | HA_CAN_SQL_HANDLER |
+ HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE |
+ HA_DUPLICATE_POS | HA_CAN_INDEX_BLOBS | HA_AUTO_PART_KEY |
+ HA_FILE_BASED | HA_CAN_GEOMETRY | MARIA_CANNOT_ROLLBACK |
+ HA_CAN_BIT_FIELD | HA_CAN_RTREEKEYS |
+ HA_HAS_RECORDS | HA_STATS_RECORDS_IS_EXACT),
+can_enable_indexes(1)
+{}
+
+
+handler *ha_maria::clone(MEM_ROOT *mem_root)
+{
+ ha_maria *new_handler= static_cast <ha_maria *>(handler::clone(mem_root));
+ if (new_handler)
+ new_handler->file->state= file->state;
+ return new_handler;
+}
+
+
+static const char *ha_maria_exts[]=
+{
+ MARIA_NAME_IEXT,
+ MARIA_NAME_DEXT,
+ NullS
+};
+
+
+const char **ha_maria::bas_ext() const
+{
+ return ha_maria_exts;
+}
+
+
+const char *ha_maria::index_type(uint key_number)
+{
+ return ((table->key_info[key_number].flags & HA_FULLTEXT) ?
+ "FULLTEXT" :
+ (table->key_info[key_number].flags & HA_SPATIAL) ?
+ "SPATIAL" :
+ (table->key_info[key_number].algorithm == HA_KEY_ALG_RTREE) ?
+ "RTREE" : "BTREE");
+}
+
+
+double ha_maria::scan_time()
+{
+ if (file->s->data_file_type == BLOCK_RECORD)
+ return ulonglong2double(stats.data_file_length - file->s->block_size) / max(file->s->block_size / 2, IO_SIZE) + 2;
+ return handler::scan_time();
+}
+
+/*
+ We need to be able to store at least two keys on an index page as the
+ splitting algorithms depends on this. (With only one key on a page
+ we also can't use any compression, which may make the index file much
+ larger)
+ We use HA_MAX_KEY_BUFF as this is a stack restriction imposed by the
+ handler interface.
+
+ We also need to reserve place for a record pointer (8) and 3 bytes
+ per key segment to store the length of the segment + possible null bytes.
+ These extra bytes are required here so that maria_create() will surely
+ accept any keys created which the returned key data storage length.
+*/
+
+uint ha_maria::max_supported_key_length() const
+{
+ uint tmp= (maria_max_key_length() - 8 - HA_MAX_KEY_SEG*3);
+ return min(HA_MAX_KEY_BUFF, tmp);
+}
+
+
+#ifdef HAVE_REPLICATION
+int ha_maria::net_read_dump(NET * net)
+{
+ int data_fd= file->dfile.file;
+ int error= 0;
+
+ my_seek(data_fd, 0L, MY_SEEK_SET, MYF(MY_WME));
+ for (;;)
+ {
+ ulong packet_len= my_net_read(net);
+ if (!packet_len)
+ break; // end of file
+ if (packet_len == packet_error)
+ {
+ sql_print_error("ha_maria::net_read_dump - read error ");
+ error= -1;
+ goto err;
+ }
+ if (my_write(data_fd, (uchar *) net->read_pos, (uint) packet_len,
+ MYF(MY_WME | MY_FNABP)))
+ {
+ error= errno;
+ goto err;
+ }
+ }
+err:
+ return error;
+}
+
+
+int ha_maria::dump(THD * thd, int fd)
+{
+ MARIA_SHARE *share= file->s;
+ NET *net= &thd->net;
+ uint block_size= share->block_size;
+ my_off_t bytes_to_read= share->state.state.data_file_length;
+ int data_fd= file->dfile.file;
+ uchar *buf= (uchar *) my_malloc(block_size, MYF(MY_WME));
+ if (!buf)
+ return ENOMEM;
+
+ int error= 0;
+ my_seek(data_fd, 0L, MY_SEEK_SET, MYF(MY_WME));
+ for (; bytes_to_read > 0;)
+ {
+ size_t bytes= my_read(data_fd, buf, block_size, MYF(MY_WME));
+ if (bytes == MY_FILE_ERROR)
+ {
+ error= errno;
+ goto err;
+ }
+
+ if (fd >= 0)
+ {
+ if (my_write(fd, buf, bytes, MYF(MY_WME | MY_FNABP)))
+ {
+ error= errno ? errno : EPIPE;
+ goto err;
+ }
+ }
+ else
+ {
+ if (my_net_write(net, buf, bytes))
+ {
+ error= errno ? errno : EPIPE;
+ goto err;
+ }
+ }
+ bytes_to_read -= bytes;
+ }
+
+ if (fd < 0)
+ {
+ if (my_net_write(net, (uchar*) "", 0))
+ error= errno ? errno : EPIPE;
+ net_flush(net);
+ }
+
+err:
+ my_free((uchar*) buf, MYF(0));
+ return error;
+}
+#endif /* HAVE_REPLICATION */
+
+
+bool ha_maria::check_if_locking_is_allowed(uint sql_command,
+ ulong type, TABLE *table,
+ uint count, uint current,
+ uint *system_count,
+ bool called_by_privileged_thread)
+{
+ /*
+ To be able to open and lock for reading system tables like 'mysql.proc',
+ when we already have some tables opened and locked, and avoid deadlocks
+ we have to disallow write-locking of these tables with any other tables.
+ */
+ if (table->s->system_table &&
+ table->reginfo.lock_type >= TL_WRITE_ALLOW_WRITE)
+ (*system_count)++;
+
+ /* 'current' is an index, that's why '<=' below. */
+ if (*system_count > 0 && *system_count <= current)
+ {
+ my_error(ER_WRONG_LOCK_OF_SYSTEM_TABLE, MYF(0));
+ return FALSE;
+ }
+
+ /*
+ Deny locking of the log tables, which is incompatible with
+ concurrent insert. Unless called from a logger THD (general_log_thd
+ or slow_log_thd) or by a privileged thread.
+ */
+ if (!called_by_privileged_thread)
+ return check_if_log_table_locking_is_allowed(sql_command, type, table);
+
+ return TRUE;
+}
+
+
+ /* Name is here without an extension */
+
+int ha_maria::open(const char *name, int mode, uint test_if_locked)
+{
+ MARIA_KEYDEF *keyinfo;
+ MARIA_COLUMNDEF *recinfo= 0;
+ uint recs;
+ uint i;
+
+#ifdef NOT_USED
+ /*
+ If the user wants to have memory mapped data files, add an
+ open_flag. Do not memory map temporary tables because they are
+ expected to be inserted and thus extended a lot. Memory mapping is
+ efficient for files that keep their size, but very inefficient for
+ growing files. Using an open_flag instead of calling ma_extra(...
+ HA_EXTRA_MMAP ...) after maxs_open() has the advantage that the
+ mapping is not repeated for every open, but just done on the initial
+ open, when the MyISAM share is created. Everytime the server
+ requires to open a new instance of a table it calls this method. We
+ will always supply HA_OPEN_MMAP for a permanent table. However, the
+ Maria storage engine will ignore this flag if this is a secondary
+ open of a table that is in use by other threads already (if the
+ Maria share exists already).
+ */
+ if (!(test_if_locked & HA_OPEN_TMP_TABLE) && opt_maria_use_mmap)
+ test_if_locked|= HA_OPEN_MMAP;
+#endif
+
+ if (!(file= maria_open(name, mode, test_if_locked | HA_OPEN_FROM_SQL_LAYER)))
+ return (my_errno ? my_errno : -1);
+
+ /**
+ @todo ASK_MONTY
+ This is a protection for the case of a frm and MAI containing incompatible
+ table definitions (as in BUG#25908). This was merged from MyISAM.
+ But it breaks maria.test and ps_maria.test ("incorrect key file") if the
+ table is BLOCK_RECORD (does it have to do with column reordering done in
+ ma_create.c ?).
+ */
+ if (!table->s->tmp_table) /* No need to perform a check for tmp table */
+ {
+ if ((my_errno= table2maria(table, &keyinfo, &recinfo, &recs)))
+ {
+ /* purecov: begin inspected */
+ DBUG_PRINT("error", ("Failed to convert TABLE object to Maria "
+ "key and column definition"));
+ goto err;
+ /* purecov: end */
+ }
+#ifdef ASK_MONTY
+ if (maria_check_definition(keyinfo, recinfo, table->s->keys, recs,
+ file->s->keyinfo, file->s->columndef,
+ file->s->base.keys, file->s->base.fields, true))
+#else
+ if (0)
+#endif
+ {
+ /* purecov: begin inspected */
+ my_errno= HA_ERR_CRASHED;
+ goto err;
+ /* purecov: end */
+ }
+ }
+
+ if (test_if_locked & (HA_OPEN_IGNORE_IF_LOCKED | HA_OPEN_TMP_TABLE))
+ VOID(maria_extra(file, HA_EXTRA_NO_WAIT_LOCK, 0));
+
+ info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
+ if (!(test_if_locked & HA_OPEN_WAIT_IF_LOCKED))
+ VOID(maria_extra(file, HA_EXTRA_WAIT_LOCK, 0));
+ if ((data_file_type= file->s->data_file_type) != STATIC_RECORD)
+ int_table_flags |= HA_REC_NOT_IN_SEQ;
+ if (!file->s->base.born_transactional)
+ {
+ /*
+ INSERT DELAYED cannot work with transactional tables (because it cannot
+ stand up to "when client gets ok the data is safe on disk": the record
+ may not even be inserted). In the future, we could enable it back (as a
+ client doing INSERT DELAYED knows the specificities; but we then should
+ make sure to regularly commit in the delayed_insert thread).
+ */
+ int_table_flags|= HA_CAN_INSERT_DELAYED;
+ }
+ if (file->s->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))
+ int_table_flags |= HA_HAS_CHECKSUM;
+
+ for (i= 0; i < table->s->keys; i++)
+ {
+ plugin_ref parser= table->key_info[i].parser;
+ if (table->key_info[i].flags & HA_USES_PARSER)
+ file->s->keyinfo[i].parser=
+ (struct st_mysql_ftparser *)plugin_decl(parser)->info;
+ table->key_info[i].block_size= file->s->keyinfo[i].block_length;
+ }
+ my_errno= 0;
+ goto end;
+ err:
+ this->close();
+ end:
+ /*
+ Both recinfo and keydef are allocated by my_multi_malloc(), thus only
+ recinfo must be freed.
+ */
+ if (recinfo)
+ my_free((uchar*) recinfo, MYF(0));
+ return my_errno;
+}
+
+
+int ha_maria::close(void)
+{
+ MARIA_HA *tmp= file;
+ file= 0;
+ return maria_close(tmp);
+}
+
+
+int ha_maria::write_row(uchar * buf)
+{
+ ha_statistic_increment(&SSV::ha_write_count);
+
+ /* If we have a timestamp column, update it to the current time */
+ if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT)
+ table->timestamp_field->set_time();
+
+ /*
+ If we have an auto_increment column and we are writing a changed row
+ or a new row, then update the auto_increment value in the record.
+ */
+ if (table->next_number_field && buf == table->record[0])
+ {
+ int error;
+ if ((error= update_auto_increment()))
+ return error;
+ }
+ return maria_write(file, buf);
+}
+
+
+int ha_maria::check(THD * thd, HA_CHECK_OPT * check_opt)
+{
+ if (!file)
+ return HA_ADMIN_INTERNAL_ERROR;
+ int error;
+ HA_CHECK param;
+ MARIA_SHARE *share= file->s;
+ const char *old_proc_info= thd->proc_info;
+
+ thd->proc_info= "Checking table";
+ maria_chk_init(&param);
+ param.thd= thd;
+ param.op_name= "check";
+ param.db_name= table->s->db.str;
+ param.table_name= table->alias;
+ param.testflag= check_opt->flags | T_CHECK | T_SILENT;
+ param.stats_method= (enum_handler_stats_method) thd->variables.
+ maria_stats_method;
+
+ if (!(table->db_stat & HA_READ_ONLY))
+ param.testflag |= T_STATISTICS;
+ param.using_global_keycache= 1;
+
+ if (!maria_is_crashed(file) &&
+ (((param.testflag & T_CHECK_ONLY_CHANGED) &&
+ !(share->state.changed & (STATE_CHANGED | STATE_CRASHED |
+ STATE_CRASHED_ON_REPAIR)) &&
+ share->state.open_count == 0) ||
+ ((param.testflag & T_FAST) && (share->state.open_count ==
+ (uint) (share->global_changed ? 1 :
+ 0)))))
+ return HA_ADMIN_ALREADY_DONE;
+
+ error= maria_chk_status(&param, file); // Not fatal
+ error= maria_chk_size(&param, file);
+ if (!error)
+ error |= maria_chk_del(&param, file, param.testflag);
+ if (!error)
+ error= maria_chk_key(&param, file);
+ if (!error)
+ {
+ if ((!(param.testflag & T_QUICK) &&
+ ((share->options &
+ (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)) ||
+ (param.testflag & (T_EXTEND | T_MEDIUM)))) || maria_is_crashed(file))
+ {
+ uint old_testflag= param.testflag;
+ param.testflag |= T_MEDIUM;
+ if (!(error= init_io_cache(&param.read_cache, file->dfile.file,
+ my_default_record_cache_size, READ_CACHE,
+ share->pack.header_length, 1, MYF(MY_WME))))
+ {
+ error= maria_chk_data_link(&param, file, param.testflag & T_EXTEND);
+ end_io_cache(&(param.read_cache));
+ }
+ param.testflag= old_testflag;
+ }
+ }
+ if (!error)
+ {
+ if ((share->state.changed & (STATE_CHANGED |
+ STATE_CRASHED_ON_REPAIR |
+ STATE_CRASHED | STATE_NOT_ANALYZED)) ||
+ (param.testflag & T_STATISTICS) || maria_is_crashed(file))
+ {
+ file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+ pthread_mutex_lock(&share->intern_lock);
+ share->state.changed &= ~(STATE_CHANGED | STATE_CRASHED |
+ STATE_CRASHED_ON_REPAIR);
+ if (!(table->db_stat & HA_READ_ONLY))
+ error= maria_update_state_info(&param, file, UPDATE_TIME | UPDATE_OPEN_COUNT |
+ UPDATE_STAT);
+ pthread_mutex_unlock(&share->intern_lock);
+ info(HA_STATUS_NO_LOCK | HA_STATUS_TIME | HA_STATUS_VARIABLE |
+ HA_STATUS_CONST);
+ }
+ }
+ else if (!maria_is_crashed(file) && !thd->killed)
+ {
+ maria_mark_crashed(file);
+ file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+ }
+
+ thd->proc_info= old_proc_info;
+ return error ? HA_ADMIN_CORRUPT : HA_ADMIN_OK;
+}
+
+
+/*
+ Analyze the key distribution in the table
+ As the table may be only locked for read, we have to take into account that
+ two threads may do an analyze at the same time!
+*/
+
+int ha_maria::analyze(THD *thd, HA_CHECK_OPT * check_opt)
+{
+ int error= 0;
+ HA_CHECK param;
+ MARIA_SHARE *share= file->s;
+
+ maria_chk_init(&param);
+ param.thd= thd;
+ param.op_name= "analyze";
+ param.db_name= table->s->db.str;
+ param.table_name= table->alias;
+ param.testflag= (T_FAST | T_CHECK | T_SILENT | T_STATISTICS |
+ T_DONT_CHECK_CHECKSUM);
+ param.using_global_keycache= 1;
+ param.stats_method= (enum_handler_stats_method) thd->variables.
+ maria_stats_method;
+
+ if (!(share->state.changed & STATE_NOT_ANALYZED))
+ return HA_ADMIN_ALREADY_DONE;
+
+ error= maria_chk_key(&param, file);
+ if (!error)
+ {
+ pthread_mutex_lock(&share->intern_lock);
+ error= maria_update_state_info(&param, file, UPDATE_STAT);
+ pthread_mutex_unlock(&share->intern_lock);
+ }
+ else if (!maria_is_crashed(file) && !thd->killed)
+ maria_mark_crashed(file);
+ return error ? HA_ADMIN_CORRUPT : HA_ADMIN_OK;
+}
+
+
+int ha_maria::restore(THD * thd, HA_CHECK_OPT *check_opt)
+{
+ HA_CHECK_OPT tmp_check_opt;
+ char *backup_dir= thd->lex->backup_dir;
+ char src_path[FN_REFLEN], dst_path[FN_REFLEN];
+ char table_name[FN_REFLEN];
+ int error;
+ const char *errmsg;
+ DBUG_ENTER("restore");
+
+ VOID(tablename_to_filename(table->s->table_name.str, table_name,
+ sizeof(table_name)));
+
+ if (fn_format_relative_to_data_home(src_path, table_name, backup_dir,
+ MARIA_NAME_DEXT))
+ DBUG_RETURN(HA_ADMIN_INVALID);
+
+ strxmov(dst_path, table->s->normalized_path.str, MARIA_NAME_DEXT, NullS);
+ if (my_copy(src_path, dst_path, MYF(MY_WME)))
+ {
+ error= HA_ADMIN_FAILED;
+ errmsg= "Failed in my_copy (Error %d)";
+ goto err;
+ }
+
+ tmp_check_opt.init();
+ tmp_check_opt.flags |= T_VERY_SILENT | T_CALC_CHECKSUM | T_QUICK;
+ DBUG_RETURN(repair(thd, &tmp_check_opt));
+
+err:
+ {
+ HA_CHECK param;
+ maria_chk_init(&param);
+ param.thd= thd;
+ param.op_name= "restore";
+ param.db_name= table->s->db.str;
+ param.table_name= table->s->table_name.str;
+ param.testflag= 0;
+ _ma_check_print_error(&param, errmsg, my_errno);
+ DBUG_RETURN(error);
+ }
+}
+
+
+int ha_maria::backup(THD * thd, HA_CHECK_OPT *check_opt)
+{
+ char *backup_dir= thd->lex->backup_dir;
+ char src_path[FN_REFLEN], dst_path[FN_REFLEN];
+ char table_name[FN_REFLEN];
+ int error;
+ const char *errmsg;
+ DBUG_ENTER("ha_maria::backup");
+
+ VOID(tablename_to_filename(table->s->table_name.str, table_name,
+ sizeof(table_name)));
+
+ if (fn_format_relative_to_data_home(dst_path, table_name, backup_dir,
+ reg_ext))
+ {
+ errmsg= "Failed in fn_format() for .frm file (errno: %d)";
+ error= HA_ADMIN_INVALID;
+ goto err;
+ }
+
+ strxmov(src_path, table->s->normalized_path.str, reg_ext, NullS);
+ if (my_copy(src_path, dst_path,
+ MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_DONT_OVERWRITE_FILE)))
+ {
+ error= HA_ADMIN_FAILED;
+ errmsg= "Failed copying .frm file (errno: %d)";
+ goto err;
+ }
+
+ /* Change extension */
+ if (fn_format_relative_to_data_home(dst_path, table_name, backup_dir,
+ MARIA_NAME_DEXT))
+ {
+ errmsg= "Failed in fn_format() for .MYD file (errno: %d)";
+ error= HA_ADMIN_INVALID;
+ goto err;
+ }
+
+ strxmov(src_path, table->s->normalized_path.str, MARIA_NAME_DEXT, NullS);
+ if (_ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_FORCE_WRITE,
+ FLUSH_KEEP))
+ {
+ error= HA_ADMIN_FAILED;
+ errmsg= "Failed in flush (Error %d)";
+ goto err;
+ }
+ if (my_copy(src_path, dst_path,
+ MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_DONT_OVERWRITE_FILE)))
+ {
+ errmsg= "Failed copying .MYD file (errno: %d)";
+ error= HA_ADMIN_FAILED;
+ goto err;
+ }
+ DBUG_RETURN(HA_ADMIN_OK);
+
+err:
+ {
+ HA_CHECK param;
+ maria_chk_init(&param);
+ param.thd= thd;
+ param.op_name= "backup";
+ param.db_name= table->s->db.str;
+ param.table_name= table->s->table_name.str;
+ param.testflag= 0;
+ _ma_check_print_error(&param, errmsg, my_errno);
+ DBUG_RETURN(error);
+ }
+}
+
+
+int ha_maria::repair(THD * thd, HA_CHECK_OPT *check_opt)
+{
+ int error;
+ HA_CHECK param;
+ ha_rows start_records;
+
+ if (!file)
+ return HA_ADMIN_INTERNAL_ERROR;
+
+ maria_chk_init(&param);
+ param.thd= thd;
+ param.op_name= "repair";
+ param.testflag= ((check_opt->flags & ~(T_EXTEND)) |
+ T_SILENT | T_FORCE_CREATE | T_CALC_CHECKSUM |
+ (check_opt->flags & T_EXTEND ? T_REP : T_REP_BY_SORT));
+ param.sort_buffer_length= check_opt->sort_buffer_size;
+ start_records= file->state->records;
+ while ((error= repair(thd, param, 0)) && param.retry_repair)
+ {
+ param.retry_repair= 0;
+ if (test_all_bits(param.testflag,
+ (uint) (T_RETRY_WITHOUT_QUICK | T_QUICK)))
+ {
+ param.testflag &= ~T_RETRY_WITHOUT_QUICK;
+ sql_print_information("Retrying repair of: '%s' without quick",
+ table->s->path.str);
+ continue;
+ }
+ param.testflag &= ~T_QUICK;
+ if ((param.testflag & T_REP_BY_SORT))
+ {
+ param.testflag= (param.testflag & ~T_REP_BY_SORT) | T_REP;
+ sql_print_information("Retrying repair of: '%s' with keycache",
+ table->s->path.str);
+ continue;
+ }
+ break;
+ }
+ if (!error && start_records != file->state->records &&
+ !(check_opt->flags & T_VERY_SILENT))
+ {
+ char llbuff[22], llbuff2[22];
+ sql_print_information("Found %s of %s rows when repairing '%s'",
+ llstr(file->state->records, llbuff),
+ llstr(start_records, llbuff2),
+ table->s->path.str);
+ }
+ return error;
+}
+
+int ha_maria::optimize(THD * thd, HA_CHECK_OPT *check_opt)
+{
+ int error;
+ if (!file)
+ return HA_ADMIN_INTERNAL_ERROR;
+ HA_CHECK param;
+
+ maria_chk_init(&param);
+ param.thd= thd;
+ param.op_name= "optimize";
+ param.testflag= (check_opt->flags | T_SILENT | T_FORCE_CREATE |
+ T_REP_BY_SORT | T_STATISTICS | T_SORT_INDEX);
+ param.sort_buffer_length= check_opt->sort_buffer_size;
+ if ((error= repair(thd, param, 1)) && param.retry_repair)
+ {
+ sql_print_warning("Warning: Optimize table got errno %d on %s.%s, retrying",
+ my_errno, param.db_name, param.table_name);
+ param.testflag &= ~T_REP_BY_SORT;
+ error= repair(thd, param, 1);
+ }
+ return error;
+}
+
+
+int ha_maria::repair(THD *thd, HA_CHECK &param, bool do_optimize)
+{
+ int error= 0;
+ uint local_testflag= param.testflag;
+ bool optimize_done= !do_optimize, statistics_done= 0;
+ const char *old_proc_info= thd->proc_info;
+ char fixed_name[FN_REFLEN];
+ MARIA_SHARE *share= file->s;
+ ha_rows rows= file->state->records;
+ DBUG_ENTER("ha_maria::repair");
+
+ /*
+ Normally this method is entered with a properly opened table. If the
+ repair fails, it can be repeated with more elaborate options. Under
+ special circumstances it can happen that a repair fails so that it
+ closed the data file and cannot re-open it. In this case file->dfile
+ is set to -1. We must not try another repair without an open data
+ file. (Bug #25289)
+ */
+ if (file->dfile.file == -1)
+ {
+ sql_print_information("Retrying repair of: '%s' failed. "
+ "Please try REPAIR EXTENDED or maria_chk",
+ table->s->path.str);
+ DBUG_RETURN(HA_ADMIN_FAILED);
+ }
+
+ param.db_name= table->s->db.str;
+ param.table_name= table->alias;
+ param.tmpfile_createflag= O_RDWR | O_TRUNC;
+ param.using_global_keycache= 1;
+ param.thd= thd;
+ param.tmpdir= &mysql_tmpdir_list;
+ param.out_flag= 0;
+ strmov(fixed_name, file->s->open_file_name);
+
+ // Don't lock tables if we have used LOCK TABLE
+ if (!thd->locked_tables &&
+ maria_lock_database(file, table->s->tmp_table ? F_EXTRA_LCK : F_WRLCK))
+ {
+ _ma_check_print_error(&param, ER(ER_CANT_LOCK), my_errno);
+ DBUG_RETURN(HA_ADMIN_FAILED);
+ }
+
+ if (!do_optimize ||
+ ((file->state->del ||
+ ((file->s->data_file_type != BLOCK_RECORD) &&
+ share->state.split != file->state->records)) &&
+ (!(param.testflag & T_QUICK) ||
+ (share->state.changed & (STATE_NOT_OPTIMIZED_KEYS |
+ STATE_NOT_OPTIMIZED_ROWS)))))
+ {
+ ulonglong key_map= ((local_testflag & T_CREATE_MISSING_KEYS) ?
+ maria_get_mask_all_keys_active(share->base.keys) :
+ share->state.key_map);
+ uint testflag= param.testflag;
+ if (maria_test_if_sort_rep(file, file->state->records, key_map, 0) &&
+ (local_testflag & T_REP_BY_SORT))
+ {
+ local_testflag |= T_STATISTICS;
+ param.testflag |= T_STATISTICS; // We get this for free
+ statistics_done= 1;
+ /* TODO: Remove BLOCK_RECORD test when parallel works with blocks */
+ if (thd->variables.maria_repair_threads > 1 &&
+ file->s->data_file_type != BLOCK_RECORD)
+ {
+ char buf[40];
+ /* TODO: respect maria_repair_threads variable */
+ my_snprintf(buf, 40, "Repair with %d threads", my_count_bits(key_map));
+ thd->proc_info= buf;
+ error= maria_repair_parallel(&param, file, fixed_name,
+ param.testflag & T_QUICK);
+ thd->proc_info= "Repair done"; // to reset proc_info, as
+ // it was pointing to local buffer
+ }
+ else
+ {
+ thd->proc_info= "Repair by sorting";
+ error= maria_repair_by_sort(&param, file, fixed_name,
+ param.testflag & T_QUICK);
+ }
+ }
+ else
+ {
+ thd->proc_info= "Repair with keycache";
+ param.testflag &= ~T_REP_BY_SORT;
+ error= maria_repair(&param, file, fixed_name, param.testflag & T_QUICK);
+ }
+ param.testflag= testflag;
+ optimize_done= 1;
+ }
+ if (!error)
+ {
+ if ((local_testflag & T_SORT_INDEX) &&
+ (share->state.changed & STATE_NOT_SORTED_PAGES))
+ {
+ optimize_done= 1;
+ thd->proc_info= "Sorting index";
+ error= maria_sort_index(&param, file, fixed_name);
+ }
+ if (!statistics_done && (local_testflag & T_STATISTICS))
+ {
+ if (share->state.changed & STATE_NOT_ANALYZED)
+ {
+ optimize_done= 1;
+ thd->proc_info= "Analyzing";
+ error= maria_chk_key(&param, file);
+ }
+ else
+ local_testflag &= ~T_STATISTICS; // Don't update statistics
+ }
+ }
+ thd->proc_info= "Saving state";
+ pthread_mutex_lock(&share->intern_lock);
+ if (!error)
+ {
+ if ((share->state.changed & STATE_CHANGED) || maria_is_crashed(file))
+ {
+ share->state.changed &= ~(STATE_CHANGED | STATE_CRASHED |
+ STATE_CRASHED_ON_REPAIR);
+ file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+ }
+ /*
+ the following 'if', thought conceptually wrong,
+ is a useful optimization nevertheless.
+ */
+ if (file->state != &file->s->state.state)
+ file->s->state.state= *file->state;
+ if (file->s->base.auto_key)
+ _ma_update_auto_increment_key(&param, file, 1);
+ if (optimize_done)
+ error= maria_update_state_info(&param, file,
+ UPDATE_TIME | UPDATE_OPEN_COUNT |
+ (local_testflag &
+ T_STATISTICS ? UPDATE_STAT : 0));
+ info(HA_STATUS_NO_LOCK | HA_STATUS_TIME | HA_STATUS_VARIABLE |
+ HA_STATUS_CONST);
+ if (rows != file->state->records && !(param.testflag & T_VERY_SILENT))
+ {
+ char llbuff[22], llbuff2[22];
+ _ma_check_print_warning(&param, "Number of rows changed from %s to %s",
+ llstr(rows, llbuff),
+ llstr(file->state->records, llbuff2));
+ }
+ }
+ else
+ {
+ maria_mark_crashed_on_repair(file);
+ file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+ maria_update_state_info(&param, file, 0);
+ }
+ pthread_mutex_unlock(&share->intern_lock);
+ thd->proc_info= old_proc_info;
+ if (!thd->locked_tables)
+ {
+ _ma_reenable_logging_for_table(file->s);
+ maria_lock_database(file, F_UNLCK);
+ }
+ DBUG_RETURN(error ? HA_ADMIN_FAILED :
+ !optimize_done ? HA_ADMIN_ALREADY_DONE : HA_ADMIN_OK);
+}
+
+
+/*
+ Assign table indexes to a specific key cache.
+*/
+
+int ha_maria::assign_to_keycache(THD * thd, HA_CHECK_OPT *check_opt)
+{
+ PAGECACHE *new_pagecache= check_opt->pagecache;
+ const char *errmsg= 0;
+ int error= HA_ADMIN_OK;
+ ulonglong map;
+ TABLE_LIST *table_list= table->pos_in_table_list;
+ DBUG_ENTER("ha_maria::assign_to_keycache");
+
+ /* for now, it is disabled */
+ DBUG_RETURN(HA_ADMIN_NOT_IMPLEMENTED);
+
+ table->keys_in_use_for_query.clear_all();
+
+ if (table_list->process_index_hints(table))
+ DBUG_RETURN(HA_ADMIN_FAILED);
+ map= ~(ulonglong) 0;
+ if (!table->keys_in_use_for_query.is_clear_all())
+ /* use all keys if there's no list specified by the user through hints */
+ map= table->keys_in_use_for_query.to_ulonglong();
+
+ if ((error= maria_assign_to_pagecache(file, map, new_pagecache)))
+ {
+ char buf[STRING_BUFFER_USUAL_SIZE];
+ my_snprintf(buf, sizeof(buf),
+ "Failed to flush to index file (errno: %d)", error);
+ errmsg= buf;
+ error= HA_ADMIN_CORRUPT;
+ }
+
+ if (error != HA_ADMIN_OK)
+ {
+ /* Send error to user */
+ HA_CHECK param;
+ maria_chk_init(&param);
+ param.thd= thd;
+ param.op_name= "assign_to_keycache";
+ param.db_name= table->s->db.str;
+ param.table_name= table->s->table_name.str;
+ param.testflag= 0;
+ _ma_check_print_error(&param, errmsg);
+ }
+ DBUG_RETURN(error);
+}
+
+
+/*
+ Preload pages of the index file for a table into the key cache.
+*/
+
+int ha_maria::preload_keys(THD * thd, HA_CHECK_OPT *check_opt)
+{
+ int error;
+ const char *errmsg;
+ ulonglong map;
+ TABLE_LIST *table_list= table->pos_in_table_list;
+ my_bool ignore_leaves= table_list->ignore_leaves;
+ char buf[ERRMSGSIZE+20];
+
+ DBUG_ENTER("ha_maria::preload_keys");
+
+ table->keys_in_use_for_query.clear_all();
+
+ if (table_list->process_index_hints(table))
+ DBUG_RETURN(HA_ADMIN_FAILED);
+
+ map= ~(ulonglong) 0;
+ /* Check validity of the index references */
+ if (!table->keys_in_use_for_query.is_clear_all())
+ /* use all keys if there's no list specified by the user through hints */
+ map= table->keys_in_use_for_query.to_ulonglong();
+
+ maria_extra(file, HA_EXTRA_PRELOAD_BUFFER_SIZE,
+ (void*) &thd->variables.preload_buff_size);
+
+ if ((error= maria_preload(file, map, ignore_leaves)))
+ {
+ switch (error) {
+ case HA_ERR_NON_UNIQUE_BLOCK_SIZE:
+ errmsg= "Indexes use different block sizes";
+ break;
+ case HA_ERR_OUT_OF_MEM:
+ errmsg= "Failed to allocate buffer";
+ break;
+ default:
+ my_snprintf(buf, ERRMSGSIZE,
+ "Failed to read from index file (errno: %d)", my_errno);
+ errmsg= buf;
+ }
+ error= HA_ADMIN_FAILED;
+ goto err;
+ }
+
+ DBUG_RETURN(HA_ADMIN_OK);
+
+err:
+ {
+ HA_CHECK param;
+ maria_chk_init(&param);
+ param.thd= thd;
+ param.op_name= "preload_keys";
+ param.db_name= table->s->db.str;
+ param.table_name= table->s->table_name.str;
+ param.testflag= 0;
+ _ma_check_print_error(&param, errmsg);
+ DBUG_RETURN(error);
+ }
+}
+
+
+/*
+ Disable indexes, making it persistent if requested.
+
+ SYNOPSIS
+ disable_indexes()
+ mode mode of operation:
+ HA_KEY_SWITCH_NONUNIQ disable all non-unique keys
+ HA_KEY_SWITCH_ALL disable all keys
+ HA_KEY_SWITCH_NONUNIQ_SAVE dis. non-uni. and make persistent
+ HA_KEY_SWITCH_ALL_SAVE dis. all keys and make persistent
+
+ IMPLEMENTATION
+ HA_KEY_SWITCH_NONUNIQ is not implemented.
+ HA_KEY_SWITCH_ALL_SAVE is not implemented.
+
+ RETURN
+ 0 ok
+ HA_ERR_WRONG_COMMAND mode not implemented.
+*/
+
+int ha_maria::disable_indexes(uint mode)
+{
+ int error;
+
+ if (mode == HA_KEY_SWITCH_ALL)
+ {
+ /* call a storage engine function to switch the key map */
+ error= maria_disable_indexes(file);
+ }
+ else if (mode == HA_KEY_SWITCH_NONUNIQ_SAVE)
+ {
+ maria_extra(file, HA_EXTRA_NO_KEYS, 0);
+ info(HA_STATUS_CONST); // Read new key info
+ error= 0;
+ }
+ else
+ {
+ /* mode not implemented */
+ error= HA_ERR_WRONG_COMMAND;
+ }
+ return error;
+}
+
+
+/*
+ Enable indexes, making it persistent if requested.
+
+ SYNOPSIS
+ enable_indexes()
+ mode mode of operation:
+ HA_KEY_SWITCH_NONUNIQ enable all non-unique keys
+ HA_KEY_SWITCH_ALL enable all keys
+ HA_KEY_SWITCH_NONUNIQ_SAVE en. non-uni. and make persistent
+ HA_KEY_SWITCH_ALL_SAVE en. all keys and make persistent
+
+ DESCRIPTION
+ Enable indexes, which might have been disabled by disable_index() before.
+ The modes without _SAVE work only if both data and indexes are empty,
+ since the MARIA repair would enable them persistently.
+ To be sure in these cases, call handler::delete_all_rows() before.
+
+ IMPLEMENTATION
+ HA_KEY_SWITCH_NONUNIQ is not implemented.
+ HA_KEY_SWITCH_ALL_SAVE is not implemented.
+
+ RETURN
+ 0 ok
+ !=0 Error, among others:
+ HA_ERR_CRASHED data or index is non-empty. Delete all rows and retry.
+ HA_ERR_WRONG_COMMAND mode not implemented.
+*/
+
+int ha_maria::enable_indexes(uint mode)
+{
+ int error;
+
+ if (maria_is_all_keys_active(file->s->state.key_map, file->s->base.keys))
+ {
+ /* All indexes are enabled already. */
+ return 0;
+ }
+
+ if (mode == HA_KEY_SWITCH_ALL)
+ {
+ error= maria_enable_indexes(file);
+ /*
+ Do not try to repair on error,
+ as this could make the enabled state persistent,
+ but mode==HA_KEY_SWITCH_ALL forbids it.
+ */
+ }
+ else if (mode == HA_KEY_SWITCH_NONUNIQ_SAVE)
+ {
+ THD *thd= current_thd;
+ HA_CHECK param;
+ const char *save_proc_info= thd->proc_info;
+ thd->proc_info= "Creating index";
+ maria_chk_init(&param);
+ param.op_name= "recreating_index";
+ param.testflag= (T_SILENT | T_REP_BY_SORT | T_QUICK |
+ T_CREATE_MISSING_KEYS);
+ param.myf_rw &= ~MY_WAIT_IF_FULL;
+ param.sort_buffer_length= thd->variables.maria_sort_buff_size;
+ param.stats_method=
+ (enum_handler_stats_method) thd->variables.maria_stats_method;
+ param.tmpdir= &mysql_tmpdir_list;
+ if ((error= (repair(thd, param, 0) != HA_ADMIN_OK)) && param.retry_repair)
+ {
+ sql_print_warning("Warning: Enabling keys got errno %d on %s.%s, retrying",
+ my_errno, param.db_name, param.table_name);
+ /* Repairing by sort failed. Now try standard repair method. */
+ param.testflag &= ~(T_REP_BY_SORT | T_QUICK);
+ error= (repair(thd, param, 0) != HA_ADMIN_OK);
+ /*
+ If the standard repair succeeded, clear all error messages which
+ might have been set by the first repair. They can still be seen
+ with SHOW WARNINGS then.
+ */
+#ifndef EMBEDDED_LIBRARY
+ if (!error)
+ thd->clear_error();
+#endif /* EMBEDDED_LIBRARY */
+ }
+ info(HA_STATUS_CONST);
+ thd->proc_info= save_proc_info;
+ }
+ else
+ {
+ /* mode not implemented */
+ error= HA_ERR_WRONG_COMMAND;
+ }
+ return error;
+}
+
+
+/*
+ Test if indexes are disabled.
+
+
+ SYNOPSIS
+ indexes_are_disabled()
+ no parameters
+
+
+ RETURN
+ 0 indexes are not disabled
+ 1 all indexes are disabled
+ [2 non-unique indexes are disabled - NOT YET IMPLEMENTED]
+*/
+
+int ha_maria::indexes_are_disabled(void)
+{
+ return maria_indexes_are_disabled(file);
+}
+
+
+/*
+ prepare for a many-rows insert operation
+ e.g. - disable indexes (if they can be recreated fast) or
+ activate special bulk-insert optimizations
+
+ SYNOPSIS
+ start_bulk_insert(rows)
+ rows Rows to be inserted
+ 0 if we don't know
+
+ NOTICE
+ Do not forget to call end_bulk_insert() later!
+*/
+
+void ha_maria::start_bulk_insert(ha_rows rows)
+{
+ DBUG_ENTER("ha_maria::start_bulk_insert");
+ THD *thd= current_thd;
+ ulong size= min(thd->variables.read_buff_size,
+ table->s->avg_row_length * rows);
+ DBUG_PRINT("info", ("start_bulk_insert: rows %lu size %lu",
+ (ulong) rows, size));
+
+ /* don't enable row cache if too few rows */
+ if (!rows || (rows > MARIA_MIN_ROWS_TO_USE_WRITE_CACHE))
+ maria_extra(file, HA_EXTRA_WRITE_CACHE, (void*) &size);
+
+ can_enable_indexes= (maria_is_all_keys_active(file->s->state.key_map,
+ file->s->base.keys));
+
+ if (!(specialflag & SPECIAL_SAFE_MODE))
+ {
+ /*
+ Only disable old index if the table was empty and we are inserting
+ a lot of rows.
+ We should not do this for only a few rows as this is slower and
+ we don't want to update the key statistics based of only a few rows.
+ */
+ if (file->state->records == 0 && can_enable_indexes &&
+ (!rows || rows >= MARIA_MIN_ROWS_TO_DISABLE_INDEXES))
+ maria_disable_non_unique_index(file, rows);
+ else if (!file->bulk_insert &&
+ (!rows || rows >= MARIA_MIN_ROWS_TO_USE_BULK_INSERT))
+ {
+ maria_init_bulk_insert(file, thd->variables.bulk_insert_buff_size, rows);
+ }
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ end special bulk-insert optimizations,
+ which have been activated by start_bulk_insert().
+
+ SYNOPSIS
+ end_bulk_insert()
+ no arguments
+
+ RETURN
+ 0 OK
+ != 0 Error
+*/
+
+int ha_maria::end_bulk_insert()
+{
+ int err;
+ DBUG_ENTER("ha_maria::end_bulk_insert");
+ maria_end_bulk_insert(file);
+ err= maria_extra(file, HA_EXTRA_NO_CACHE, 0);
+ DBUG_RETURN(err ? err : can_enable_indexes ?
+ enable_indexes(HA_KEY_SWITCH_NONUNIQ_SAVE) : 0);
+}
+
+
+bool ha_maria::check_and_repair(THD *thd)
+{
+ int error= 0;
+ int marked_crashed;
+ char *old_query;
+ uint old_query_length;
+ HA_CHECK_OPT check_opt;
+ DBUG_ENTER("ha_maria::check_and_repair");
+
+ check_opt.init();
+ check_opt.flags= T_MEDIUM | T_AUTO_REPAIR;
+ // Don't use quick if deleted rows
+ if (!file->state->del && (maria_recover_options & HA_RECOVER_QUICK))
+ check_opt.flags |= T_QUICK;
+ sql_print_warning("Checking table: '%s'", table->s->path.str);
+
+ old_query= thd->query;
+ old_query_length= thd->query_length;
+ pthread_mutex_lock(&LOCK_thread_count);
+ thd->query= table->s->table_name.str;
+ thd->query_length= table->s->table_name.length;
+ pthread_mutex_unlock(&LOCK_thread_count);
+
+ if ((marked_crashed= maria_is_crashed(file)) || check(thd, &check_opt))
+ {
+ sql_print_warning("Recovering table: '%s'", table->s->path.str);
+ check_opt.flags=
+ ((maria_recover_options & HA_RECOVER_BACKUP ? T_BACKUP_DATA : 0) |
+ (marked_crashed ? 0 : T_QUICK) |
+ (maria_recover_options & HA_RECOVER_FORCE ? 0 : T_SAFE_REPAIR) |
+ T_AUTO_REPAIR);
+ if (repair(thd, &check_opt))
+ error= 1;
+ }
+ pthread_mutex_lock(&LOCK_thread_count);
+ thd->query= old_query;
+ thd->query_length= old_query_length;
+ pthread_mutex_unlock(&LOCK_thread_count);
+ DBUG_RETURN(error);
+}
+
+
+bool ha_maria::is_crashed() const
+{
+ return (file->s->state.changed & STATE_CRASHED ||
+ (my_disable_locking && file->s->state.open_count));
+}
+
+
+int ha_maria::update_row(const uchar * old_data, uchar * new_data)
+{
+ ha_statistic_increment(&SSV::ha_update_count);
+ if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE)
+ table->timestamp_field->set_time();
+ return maria_update(file, old_data, new_data);
+}
+
+
+int ha_maria::delete_row(const uchar * buf)
+{
+ ha_statistic_increment(&SSV::ha_delete_count);
+ return maria_delete(file, buf);
+}
+
+
+int ha_maria::index_read(uchar * buf, const uchar * key,
+ key_part_map keypart_map,
+ enum ha_rkey_function find_flag)
+{
+ DBUG_ASSERT(inited == INDEX);
+ ha_statistic_increment(&SSV::ha_read_key_count);
+ int error= maria_rkey(file, buf, active_index, key, keypart_map, find_flag);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+int ha_maria::index_read_idx(uchar * buf, uint index, const uchar * key,
+ key_part_map keypart_map,
+ enum ha_rkey_function find_flag)
+{
+ ha_statistic_increment(&SSV::ha_read_key_count);
+ int error= maria_rkey(file, buf, index, key, keypart_map, find_flag);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+int ha_maria::index_read_last(uchar * buf, const uchar * key,
+ key_part_map keypart_map)
+{
+ DBUG_ENTER("ha_maria::index_read_last");
+ DBUG_ASSERT(inited == INDEX);
+ ha_statistic_increment(&SSV::ha_read_key_count);
+ int error= maria_rkey(file, buf, active_index, key, keypart_map,
+ HA_READ_PREFIX_LAST);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ DBUG_RETURN(error);
+}
+
+
+int ha_maria::index_next(uchar * buf)
+{
+ DBUG_ASSERT(inited == INDEX);
+ ha_statistic_increment(&SSV::ha_read_next_count);
+ int error= maria_rnext(file, buf, active_index);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+int ha_maria::index_prev(uchar * buf)
+{
+ DBUG_ASSERT(inited == INDEX);
+ ha_statistic_increment(&SSV::ha_read_prev_count);
+ int error= maria_rprev(file, buf, active_index);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+int ha_maria::index_first(uchar * buf)
+{
+ DBUG_ASSERT(inited == INDEX);
+ ha_statistic_increment(&SSV::ha_read_first_count);
+ int error= maria_rfirst(file, buf, active_index);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+int ha_maria::index_last(uchar * buf)
+{
+ DBUG_ASSERT(inited == INDEX);
+ ha_statistic_increment(&SSV::ha_read_last_count);
+ int error= maria_rlast(file, buf, active_index);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+int ha_maria::index_next_same(uchar * buf,
+ const uchar *key __attribute__ ((unused)),
+ uint length __attribute__ ((unused)))
+{
+ DBUG_ASSERT(inited == INDEX);
+ ha_statistic_increment(&SSV::ha_read_next_count);
+ int error= maria_rnext_same(file, buf);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+int ha_maria::rnd_init(bool scan)
+{
+ if (scan)
+ return maria_scan_init(file);
+ return maria_reset(file); // Free buffers
+}
+
+
+int ha_maria::rnd_end()
+{
+ /* Safe to call even if we don't have started a scan */
+ maria_scan_end(file);
+ return 0;
+}
+
+
+int ha_maria::rnd_next(uchar *buf)
+{
+ ha_statistic_increment(&SSV::ha_read_rnd_next_count);
+ int error= maria_scan(file, buf);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+int ha_maria::restart_rnd_next(uchar *buf, uchar *pos)
+{
+ return rnd_pos(buf, pos);
+}
+
+
+int ha_maria::rnd_pos(uchar * buf, uchar *pos)
+{
+ ha_statistic_increment(&SSV::ha_read_rnd_count);
+ int error= maria_rrnd(file, buf, my_get_ptr(pos, ref_length));
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+void ha_maria::position(const uchar * record)
+{
+ my_off_t row_position= maria_position(file);
+ my_store_ptr(ref, ref_length, row_position);
+}
+
+
+int ha_maria::info(uint flag)
+{
+ MARIA_INFO maria_info;
+ char name_buff[FN_REFLEN];
+
+ (void) maria_status(file, &maria_info, flag);
+ if (flag & HA_STATUS_VARIABLE)
+ {
+ stats.records= maria_info.records;
+ stats.deleted= maria_info.deleted;
+ stats.data_file_length= maria_info.data_file_length;
+ stats.index_file_length= maria_info.index_file_length;
+ stats.delete_length= maria_info.delete_length;
+ stats.check_time= maria_info.check_time;
+ stats.mean_rec_length= maria_info.mean_reclength;
+ }
+ if (flag & HA_STATUS_CONST)
+ {
+ TABLE_SHARE *share= table->s;
+ stats.max_data_file_length= maria_info.max_data_file_length;
+ stats.max_index_file_length= maria_info.max_index_file_length;
+ stats.create_time= maria_info.create_time;
+ ref_length= maria_info.reflength;
+ share->db_options_in_use= maria_info.options;
+ stats.block_size= maria_block_size;
+
+ /* Update share */
+ if (share->tmp_table == NO_TMP_TABLE)
+ pthread_mutex_lock(&share->mutex);
+ share->keys_in_use.set_prefix(share->keys);
+ share->keys_in_use.intersect_extended(maria_info.key_map);
+ share->keys_for_keyread.intersect(share->keys_in_use);
+ share->db_record_offset= maria_info.record_offset;
+ if (share->key_parts)
+ memcpy((char*) table->key_info[0].rec_per_key,
+ (char*) maria_info.rec_per_key,
+ sizeof(table->key_info[0].rec_per_key) * share->key_parts);
+ if (share->tmp_table == NO_TMP_TABLE)
+ pthread_mutex_unlock(&share->mutex);
+
+ /*
+ Set data_file_name and index_file_name to point at the symlink value
+ if table is symlinked (Ie; Real name is not same as generated name)
+ */
+ data_file_name= index_file_name= 0;
+ fn_format(name_buff, file->s->open_file_name, "", MARIA_NAME_DEXT,
+ MY_APPEND_EXT | MY_UNPACK_FILENAME);
+ if (strcmp(name_buff, maria_info.data_file_name))
+ data_file_name=maria_info.data_file_name;
+ fn_format(name_buff, file->s->open_file_name, "", MARIA_NAME_IEXT,
+ MY_APPEND_EXT | MY_UNPACK_FILENAME);
+ if (strcmp(name_buff, maria_info.index_file_name))
+ index_file_name=maria_info.index_file_name;
+ }
+ if (flag & HA_STATUS_ERRKEY)
+ {
+ errkey= maria_info.errkey;
+ my_store_ptr(dup_ref, ref_length, maria_info.dup_key_pos);
+ }
+ /* Faster to always update, than to do it based on flag */
+ stats.update_time= maria_info.update_time;
+ stats.auto_increment_value= maria_info.auto_increment;
+
+ return 0;
+}
+
+
+int ha_maria::extra(enum ha_extra_function operation)
+{
+ if ((specialflag & SPECIAL_SAFE_MODE) && operation == HA_EXTRA_KEYREAD)
+ return 0;
+ return maria_extra(file, operation, 0);
+}
+
+int ha_maria::reset(void)
+{
+ return maria_reset(file);
+}
+
+/* To be used with WRITE_CACHE and EXTRA_CACHE */
+
+int ha_maria::extra_opt(enum ha_extra_function operation, ulong cache_size)
+{
+ if ((specialflag & SPECIAL_SAFE_MODE) && operation == HA_EXTRA_WRITE_CACHE)
+ return 0;
+ return maria_extra(file, operation, (void*) &cache_size);
+}
+
+
+int ha_maria::delete_all_rows()
+{
+ return maria_delete_all_rows(file);
+}
+
+
+int ha_maria::delete_table(const char *name)
+{
+ return maria_delete_table(name);
+}
+
+#define THD_TRN (*(TRN **)thd_ha_data(thd, maria_hton))
+
+int ha_maria::external_lock(THD *thd, int lock_type)
+{
+ TRN *trn= THD_TRN;
+ DBUG_ENTER("ha_maria::external_lock");
+ /*
+ We don't test now_transactional because it may vary between lock/unlock
+ and thus confuse our reference counting.
+ It is critical to skip non-transactional tables: user-visible temporary
+ tables get an external_lock() when read/written for the first time, but no
+ corresponding unlock (they just stay locked and are later dropped while
+ locked); if a tmp table was transactional, "SELECT FROM non_tmp, tmp"
+ would never commit as its "locked_tables" count would stay 1.
+ When Maria has has_transactions()==TRUE, open_temporary_table()
+ (sql_base.cc) will use TRANSACTIONAL_TMP_TABLE and thus the
+ external_lock(F_UNLCK) will happen and we can then allow the user to
+ create transactional temporary tables.
+ */
+ if (!file->s->base.born_transactional)
+ goto skip_transaction;
+ if (lock_type != F_UNLCK)
+ {
+ if (!thd->transaction.on)
+ {
+ /*
+ No need to log REDOs/UNDOs. If this is an internal temporary table
+ which will be renamed to a permanent table (like in ALTER TABLE),
+ the rename happens after unlocking so will be durable (and the table
+ will get its create_rename_lsn).
+ Note: if we wanted to enable users to have an old backup and apply
+ tons of archived logs to roll-forward, we could then not disable
+ REDOs/UNDOs in this case.
+ */
+ DBUG_PRINT("info", ("Disabling logging for table"));
+ _ma_tmp_disable_logging_for_table(file->s);
+ }
+ if (!trn) /* no transaction yet - open it now */
+ {
+ trn= trnman_new_trn(& thd->mysys_var->mutex,
+ & thd->mysys_var->suspend,
+ thd->thread_stack + STACK_DIRECTION *
+ (my_thread_stack_size - STACK_MIN_SIZE));
+ if (unlikely(!trn))
+ DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+
+ DBUG_PRINT("info", ("THD_TRN set to 0x%lx", (ulong)trn));
+ THD_TRN= trn;
+ if (thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
+ trans_register_ha(thd, TRUE, maria_hton);
+ }
+ this->file->trn= trn;
+ if (!trnman_increment_locked_tables(trn))
+ {
+ trans_register_ha(thd, FALSE, maria_hton);
+ trnman_new_statement(trn);
+ }
+ }
+ else
+ {
+ _ma_reenable_logging_for_table(file->s);
+ this->file->trn= 0; /* TODO: remove it also in commit and rollback */
+ if (trn && trnman_has_locked_tables(trn))
+ {
+ if (!trnman_decrement_locked_tables(trn))
+ {
+ /* autocommit ? rollback a transaction */
+#ifdef MARIA_CANNOT_ROLLBACK
+ if (ma_commit(trn))
+ DBUG_RETURN(1);
+ THD_TRN= 0;
+#else
+ if (!(thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)))
+ {
+ trnman_rollback_trn(trn);
+ DBUG_PRINT("info", ("THD_TRN set to 0x0"));
+ THD_TRN= 0;
+ }
+#endif
+ }
+ }
+ }
+skip_transaction:
+ DBUG_RETURN(maria_lock_database(file, !table->s->tmp_table ?
+ lock_type : ((lock_type == F_UNLCK) ?
+ F_UNLCK : F_EXTRA_LCK)));
+}
+
+int ha_maria::start_stmt(THD *thd, thr_lock_type lock_type)
+{
+ TRN *trn= THD_TRN;
+ if (file->s->base.born_transactional)
+ {
+ DBUG_ASSERT(trn); // this may be called only after external_lock()
+ DBUG_ASSERT(trnman_has_locked_tables(trn));
+ DBUG_ASSERT(lock_type != F_UNLCK);
+ /*
+ As external_lock() was already called, don't increment locked_tables.
+ Note that we call the function below possibly several times when
+ statement starts (once per table). This is ok as long as that function
+ does cheap operations. Otherwise, we will need to do it only on first
+ call to start_stmt().
+ */
+ trnman_new_statement(trn);
+ }
+ return 0;
+}
+
+THR_LOCK_DATA **ha_maria::store_lock(THD *thd,
+ THR_LOCK_DATA **to,
+ enum thr_lock_type lock_type)
+{
+ if (lock_type != TL_IGNORE && file->lock.type == TL_UNLOCK)
+ file->lock.type= lock_type;
+ *to++= &file->lock;
+ return to;
+}
+
+
+void ha_maria::update_create_info(HA_CREATE_INFO *create_info)
+{
+ ha_maria::info(HA_STATUS_AUTO | HA_STATUS_CONST);
+ if (!(create_info->used_fields & HA_CREATE_USED_AUTO))
+ {
+ create_info->auto_increment_value= stats.auto_increment_value;
+ }
+ create_info->data_file_name= data_file_name;
+ create_info->index_file_name= index_file_name;
+}
+
+
+enum row_type ha_maria::get_row_type() const
+{
+ switch (file->s->data_file_type) {
+ case STATIC_RECORD: return ROW_TYPE_FIXED;
+ case DYNAMIC_RECORD: return ROW_TYPE_DYNAMIC;
+ case BLOCK_RECORD: return ROW_TYPE_PAGE;
+ case COMPRESSED_RECORD: return ROW_TYPE_COMPRESSED;
+ default: return ROW_TYPE_NOT_USED;
+ }
+}
+
+
+static enum data_file_type maria_row_type(HA_CREATE_INFO *info)
+{
+ if (info->transactional == HA_CHOICE_YES)
+ return BLOCK_RECORD;
+ switch (info->row_type) {
+ case ROW_TYPE_FIXED: return STATIC_RECORD;
+ case ROW_TYPE_DYNAMIC: return DYNAMIC_RECORD;
+ default: return BLOCK_RECORD;
+ }
+}
+
+
+int ha_maria::create(const char *name, register TABLE *table_arg,
+ HA_CREATE_INFO *ha_create_info)
+{
+ int error;
+ uint create_flags= 0, records, i;
+ char buff[FN_REFLEN];
+ MARIA_KEYDEF *keydef;
+ MARIA_COLUMNDEF *recinfo;
+ MARIA_CREATE_INFO create_info;
+ TABLE_SHARE *share= table_arg->s;
+ uint options= share->db_options_in_use;
+ enum data_file_type row_type;
+ DBUG_ENTER("ha_maria::create");
+
+ for (i= 0; i < share->keys; i++)
+ {
+ if (table_arg->key_info[i].flags & HA_USES_PARSER)
+ {
+ create_flags|= HA_CREATE_RELIES_ON_SQL_LAYER;
+ break;
+ }
+ }
+ row_type= maria_row_type(ha_create_info);
+ if ((error= table2maria(table_arg, &keydef, &recinfo, &records)))
+ DBUG_RETURN(error); /* purecov: inspected */
+ bzero((char*) &create_info, sizeof(create_info));
+ create_info.max_rows= share->max_rows;
+ create_info.reloc_rows= share->min_rows;
+ create_info.with_auto_increment= share->next_number_key_offset == 0;
+ create_info.auto_increment= (ha_create_info->auto_increment_value ?
+ ha_create_info->auto_increment_value -1 :
+ (ulonglong) 0);
+ create_info.data_file_length= ((ulonglong) share->max_rows *
+ share->avg_row_length);
+ create_info.data_file_name= ha_create_info->data_file_name;
+ create_info.index_file_name= ha_create_info->index_file_name;
+#ifdef ASK_MONTY
+ /**
+ @todo ASK_MONTY
+ Where "transactional" in the frm and in the engine can go out of sync.
+ Don't we want to do, after the setting, this test:
+ if (!create_info.transactional &&
+ ha_create_info->transactional == HA_CHOICE_YES)
+ error;
+ ?
+ Why fool the user?
+ */
+#endif
+ create_info.transactional= (row_type == BLOCK_RECORD &&
+ ha_create_info->transactional != HA_CHOICE_NO);
+
+ if (ha_create_info->options & HA_LEX_CREATE_TMP_TABLE)
+ create_flags|= HA_CREATE_TMP_TABLE;
+ if (options & HA_OPTION_PACK_RECORD)
+ create_flags|= HA_PACK_RECORD;
+ if (options & HA_OPTION_CHECKSUM)
+ create_flags|= HA_CREATE_CHECKSUM;
+ if (options & HA_OPTION_DELAY_KEY_WRITE)
+ create_flags|= HA_CREATE_DELAY_KEY_WRITE;
+
+ /* TODO: Check that the following fn_format is really needed */
+ error=
+ maria_create(fn_format(buff, name, "", "",
+ MY_UNPACK_FILENAME | MY_APPEND_EXT),
+ row_type, share->keys, keydef,
+ records, recinfo,
+ 0, (MARIA_UNIQUEDEF *) 0,
+ &create_info, create_flags);
+
+ my_free((uchar*) recinfo, MYF(0));
+ DBUG_RETURN(error);
+}
+
+
+int ha_maria::rename_table(const char *from, const char *to)
+{
+ return maria_rename(from, to);
+}
+
+
+void ha_maria::get_auto_increment(ulonglong offset, ulonglong increment,
+ ulonglong nb_desired_values,
+ ulonglong *first_value,
+ ulonglong *nb_reserved_values)
+{
+ ulonglong nr;
+ int error;
+ uchar key[HA_MAX_KEY_LENGTH];
+
+ if (!table->s->next_number_key_offset)
+ { // Autoincrement at key-start
+ ha_maria::info(HA_STATUS_AUTO);
+ *first_value= stats.auto_increment_value;
+ /* Maria has only table-level lock for now, so reserves to +inf */
+ *nb_reserved_values= ULONGLONG_MAX;
+ return;
+ }
+
+ /* it's safe to call the following if bulk_insert isn't on */
+ maria_flush_bulk_insert(file, table->s->next_number_index);
+
+ (void) extra(HA_EXTRA_KEYREAD);
+ key_copy(key, table->record[0],
+ table->key_info + table->s->next_number_index,
+ table->s->next_number_key_offset);
+ error= maria_rkey(file, table->record[1], (int) table->s->next_number_index,
+ key, make_prev_keypart_map(table->s->next_number_keypart),
+ HA_READ_PREFIX_LAST);
+ if (error)
+ nr= 1;
+ else
+ {
+ /* Get data from record[1] */
+ nr= ((ulonglong) table->next_number_field->
+ val_int_offset(table->s->rec_buff_length) + 1);
+ }
+ extra(HA_EXTRA_NO_KEYREAD);
+ *first_value= nr;
+ /*
+ MySQL needs to call us for next row: assume we are inserting ("a",null)
+ here, we return 3, and next this statement will want to insert ("b",null):
+ there is no reason why ("b",3+1) would be the good row to insert: maybe it
+ already exists, maybe 3+1 is too large...
+ */
+ *nb_reserved_values= 1;
+}
+
+
+/*
+ Find out how many rows there is in the given range
+
+ SYNOPSIS
+ records_in_range()
+ inx Index to use
+ min_key Start of range. Null pointer if from first key
+ max_key End of range. Null pointer if to last key
+
+ NOTES
+ min_key.flag can have one of the following values:
+ HA_READ_KEY_EXACT Include the key in the range
+ HA_READ_AFTER_KEY Don't include key in range
+
+ max_key.flag can have one of the following values:
+ HA_READ_BEFORE_KEY Don't include key in range
+ HA_READ_AFTER_KEY Include all 'end_key' values in the range
+
+ RETURN
+ HA_POS_ERROR Something is wrong with the index tree.
+ 0 There is no matching keys in the given range
+ number > 0 There is approximately 'number' matching rows in
+ the range.
+*/
+
+ha_rows ha_maria::records_in_range(uint inx, key_range *min_key,
+ key_range *max_key)
+{
+ return (ha_rows) maria_records_in_range(file, (int) inx, min_key, max_key);
+}
+
+
+int ha_maria::ft_read(uchar * buf)
+{
+ int error;
+
+ if (!ft_handler)
+ return -1;
+
+ thread_safe_increment(table->in_use->status_var.ha_read_next_count,
+ &LOCK_status); // why ?
+
+ error= ft_handler->please->read_next(ft_handler, (char*) buf);
+
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+uint ha_maria::checksum() const
+{
+ return (uint) file->state->checksum;
+}
+
+
+bool ha_maria::check_if_incompatible_data(HA_CREATE_INFO *info,
+ uint table_changes)
+{
+ uint options= table->s->db_options_in_use;
+
+ if (info->auto_increment_value != stats.auto_increment_value ||
+ info->data_file_name != data_file_name ||
+ info->index_file_name != index_file_name ||
+ maria_row_type(info) != data_file_type ||
+ table_changes == IS_EQUAL_NO ||
+ table_changes & IS_EQUAL_PACK_LENGTH) // Not implemented yet
+ return COMPATIBLE_DATA_NO;
+
+ if ((options & (HA_OPTION_PACK_RECORD | HA_OPTION_CHECKSUM |
+ HA_OPTION_DELAY_KEY_WRITE)) !=
+ (info->table_options & (HA_OPTION_PACK_RECORD | HA_OPTION_CHECKSUM |
+ HA_OPTION_DELAY_KEY_WRITE)))
+ return COMPATIBLE_DATA_NO;
+ return COMPATIBLE_DATA_YES;
+}
+
+
+static int maria_hton_panic(handlerton *hton, ha_panic_function flag)
+{
+ ma_checkpoint_execute(CHECKPOINT_FULL, FALSE); /* can't catch error */
+ return maria_panic(flag);
+}
+
+
+static int maria_commit(handlerton *hton __attribute__ ((unused)),
+ THD *thd, bool all)
+{
+ TRN *trn= THD_TRN;
+ DBUG_ENTER("maria_commit");
+ trnman_reset_locked_tables(trn);
+ /* statement or transaction ? */
+ if ((thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) && !all)
+ DBUG_RETURN(0); // end of statement
+ DBUG_PRINT("info", ("THD_TRN set to 0x0"));
+ THD_TRN= 0;
+ DBUG_RETURN(ma_commit(trn)); // end of transaction
+}
+
+
+static int maria_rollback(handlerton *hton __attribute__ ((unused)),
+ THD *thd, bool all)
+{
+ TRN *trn= THD_TRN;
+ DBUG_ENTER("maria_rollback");
+ trnman_reset_locked_tables(trn);
+ /* statement or transaction ? */
+ if ((thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) && !all)
+ {
+ trnman_rollback_statement(trn);
+ DBUG_RETURN(0); // end of statement
+ }
+ DBUG_PRINT("info", ("THD_TRN set to 0x0"));
+ THD_TRN= 0;
+ DBUG_RETURN(trnman_rollback_trn(trn) ?
+ HA_ERR_OUT_OF_MEM : 0); // end of transaction
+}
+
+
+static int ha_maria_init(void *p)
+{
+ int res;
+ maria_hton= (handlerton *)p;
+ maria_hton->state= SHOW_OPTION_YES;
+ maria_hton->db_type= DB_TYPE_MARIA;
+ maria_hton->create= maria_create_handler;
+ maria_hton->panic= maria_hton_panic;
+ maria_hton->commit= maria_commit;
+ maria_hton->rollback= maria_rollback;
+ /* TODO: decide if we support Maria being used for log tables */
+ maria_hton->flags= HTON_CAN_RECREATE | HTON_SUPPORT_LOG_TABLES;
+ bzero(maria_log_pagecache, sizeof(*maria_log_pagecache));
+ maria_data_root= mysql_real_data_home;
+ res= maria_init() || ma_control_file_create_or_open() ||
+ (init_pagecache(maria_log_pagecache,
+ TRANSLOG_PAGECACHE_SIZE, 0, 0,
+ TRANSLOG_PAGE_SIZE) == 0) ||
+ translog_init(maria_data_root, TRANSLOG_FILE_SIZE,
+ MYSQL_VERSION_ID, server_id, maria_log_pagecache,
+ TRANSLOG_DEFAULT_FLAGS) ||
+ maria_recover() ||
+ ma_checkpoint_init(FALSE) ||
+ /* One checkpoint after Recovery */
+ ma_checkpoint_execute(CHECKPOINT_FULL, FALSE);
+ maria_multi_threaded= TRUE;
+ return res;
+}
+
+
+struct st_mysql_storage_engine maria_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION };
+
+mysql_declare_plugin(maria)
+{
+ MYSQL_STORAGE_ENGINE_PLUGIN,
+ &maria_storage_engine,
+ "MARIA",
+ "MySQL AB",
+ "Traditional transactional MySQL tables",
+ PLUGIN_LICENSE_GPL,
+ ha_maria_init, /* Plugin Init */
+ NULL, /* Plugin Deinit */
+ 0x0100, /* 1.0 */
+ NULL, /* status variables */
+ NULL, /* system variables */
+ NULL /* config options */
+}
+mysql_declare_plugin_end;
diff --git a/storage/maria/ha_maria.h b/storage/maria/ha_maria.h
new file mode 100644
index 00000000000..7675778ab5b
--- /dev/null
+++ b/storage/maria/ha_maria.h
@@ -0,0 +1,151 @@
+/* Copyright (C) 2006,2004 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+
+#ifdef USE_PRAGMA_INTERFACE
+#pragma interface /* gcc class implementation */
+#endif
+
+/* class for the the maria handler */
+
+#include <maria.h>
+
+#define HA_RECOVER_NONE 0 /* No automatic recover */
+#define HA_RECOVER_DEFAULT 1 /* Automatic recover active */
+#define HA_RECOVER_BACKUP 2 /* Make a backupfile on recover */
+#define HA_RECOVER_FORCE 4 /* Recover even if we loose rows */
+#define HA_RECOVER_QUICK 8 /* Don't check rows in data file */
+
+extern ulong maria_sort_buffer_size;
+extern TYPELIB maria_recover_typelib;
+extern ulong maria_recover_options;
+
+class ha_maria :public handler
+{
+ MARIA_HA *file;
+ ulonglong int_table_flags;
+ char *data_file_name, *index_file_name;
+ enum data_file_type data_file_type;
+ bool can_enable_indexes;
+ int repair(THD * thd, HA_CHECK &param, bool optimize);
+
+public:
+ ha_maria(handlerton *hton, TABLE_SHARE * table_arg);
+ ~ha_maria() {}
+ handler *clone(MEM_ROOT *mem_root);
+ const char *table_type() const
+ { return "MARIA"; }
+ const char *index_type(uint key_number);
+ const char **bas_ext() const;
+ ulonglong table_flags() const
+ { return int_table_flags; }
+ ulong index_flags(uint inx, uint part, bool all_parts) const
+ {
+ return ((table_share->key_info[inx].algorithm == HA_KEY_ALG_FULLTEXT) ?
+ 0 : HA_READ_NEXT | HA_READ_PREV | HA_READ_RANGE |
+ HA_READ_ORDER | HA_KEYREAD_ONLY);
+ }
+ uint max_supported_keys() const
+ { return MARIA_MAX_KEY; }
+ uint max_supported_key_length() const;
+ uint max_supported_key_part_length() const
+ { return max_supported_key_length(); }
+ enum row_type get_row_type() const;
+ uint checksum() const;
+ virtual double scan_time();
+
+ virtual bool check_if_locking_is_allowed(uint sql_command,
+ ulong type, TABLE * table,
+ uint count, uint current,
+ uint *system_count,
+ bool called_by_logger_thread);
+ int open(const char *name, int mode, uint test_if_locked);
+ int close(void);
+ int write_row(uchar * buf);
+ int update_row(const uchar * old_data, uchar * new_data);
+ int delete_row(const uchar * buf);
+ int index_read(uchar * buf, const uchar * key, key_part_map keypart_map,
+ enum ha_rkey_function find_flag);
+ int index_read_idx(uchar * buf, uint idx, const uchar * key,
+ key_part_map keypart_map,
+ enum ha_rkey_function find_flag);
+ int index_read_last(uchar * buf, const uchar * key,
+ key_part_map keypart_map);
+ int index_next(uchar * buf);
+ int index_prev(uchar * buf);
+ int index_first(uchar * buf);
+ int index_last(uchar * buf);
+ int index_next_same(uchar * buf, const uchar * key, uint keylen);
+ int ft_init()
+ {
+ if (!ft_handler)
+ return 1;
+ ft_handler->please->reinit_search(ft_handler);
+ return 0;
+ }
+ FT_INFO *ft_init_ext(uint flags, uint inx, String * key)
+ {
+ return maria_ft_init_search(flags, file, inx,
+ (uchar *) key->ptr(), key->length(),
+ key->charset(), table->record[0]);
+ }
+ int ft_read(uchar * buf);
+ int rnd_init(bool scan);
+ int rnd_end(void);
+ int rnd_next(uchar * buf);
+ int rnd_pos(uchar * buf, uchar * pos);
+ int restart_rnd_next(uchar * buf, uchar * pos);
+ void position(const uchar * record);
+ int info(uint);
+ int extra(enum ha_extra_function operation);
+ int extra_opt(enum ha_extra_function operation, ulong cache_size);
+ int reset(void);
+ int external_lock(THD * thd, int lock_type);
+ int start_stmt(THD *thd, thr_lock_type lock_type);
+ int delete_all_rows(void);
+ int disable_indexes(uint mode);
+ int enable_indexes(uint mode);
+ int indexes_are_disabled(void);
+ void start_bulk_insert(ha_rows rows);
+ int end_bulk_insert();
+ ha_rows records_in_range(uint inx, key_range * min_key, key_range * max_key);
+ void update_create_info(HA_CREATE_INFO * create_info);
+ int create(const char *name, TABLE * form, HA_CREATE_INFO * create_info);
+ THR_LOCK_DATA **store_lock(THD * thd, THR_LOCK_DATA ** to,
+ enum thr_lock_type lock_type);
+ virtual void get_auto_increment(ulonglong offset, ulonglong increment,
+ ulonglong nb_desired_values,
+ ulonglong *first_value,
+ ulonglong *nb_reserved_values);
+ int rename_table(const char *from, const char *to);
+ int delete_table(const char *name);
+ int check(THD * thd, HA_CHECK_OPT * check_opt);
+ int analyze(THD * thd, HA_CHECK_OPT * check_opt);
+ int repair(THD * thd, HA_CHECK_OPT * check_opt);
+ bool check_and_repair(THD * thd);
+ bool is_crashed() const;
+ bool auto_repair() const
+ { return maria_recover_options != 0; }
+ int optimize(THD * thd, HA_CHECK_OPT * check_opt);
+ int restore(THD * thd, HA_CHECK_OPT * check_opt);
+ int backup(THD * thd, HA_CHECK_OPT * check_opt);
+ int assign_to_keycache(THD * thd, HA_CHECK_OPT * check_opt);
+ int preload_keys(THD * thd, HA_CHECK_OPT * check_opt);
+ bool check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes);
+#ifdef HAVE_REPLICATION
+ int dump(THD * thd, int fd);
+ int net_read_dump(NET * net);
+#endif
+};
diff --git a/storage/maria/lockman.c b/storage/maria/lockman.c
new file mode 100644
index 00000000000..8316d70bb29
--- /dev/null
+++ b/storage/maria/lockman.c
@@ -0,0 +1,786 @@
+/* QQ: TODO - allocate everything from dynarrays !!! (benchmark) */
+/* QQ: TODO instant duration locks */
+/* QQ: #warning automatically place S instead of LS if possible */
+
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Generic Lock Manager
+
+ Lock manager handles locks on "resources", a resource must be uniquely
+ identified by a 64-bit number. Lock manager itself does not imply
+ anything about the nature of a resource - it can be a row, a table, a
+ database, or just anything.
+
+ Locks belong to "lock owners". A Lock owner is uniquely identified by a
+ 16-bit number. A function loid2lo must be provided by the application
+ that takes such a number as an argument and returns a LOCK_OWNER
+ structure.
+
+ Lock levels are completely defined by three tables. Lock compatibility
+ matrix specifies which locks can be held at the same time on a resource.
+ Lock combining matrix specifies what lock level has the same behaviour as
+ a pair of two locks of given levels. getlock_result matrix simplifies
+ intention locking and lock escalation for an application, basically it
+ defines which locks are intention locks and which locks are "loose"
+ locks. It is only used to provide better diagnostics for the
+ application, lock manager itself does not differentiate between normal,
+ intention, and loose locks.
+
+ Internally lock manager is based on a lock-free hash, see lf_hash.c for
+ details. All locks are stored in a hash, with a resource id as a search
+ key, so all locks for the same resource will be considered collisions and
+ will be put in a one (lock-free) linked list. The main lock-handling
+ logic is in the inner loop that searches for a lock in such a linked
+ list - lockfind().
+
+ This works as follows. Locks generally are added to the end of the list
+ (with one exception, see below). When scanning the list it is always
+ possible to determine what locks are granted (active) and what locks are
+ waiting - first lock is obviously active, the second is active if it's
+ compatible with the first, and so on, a lock is active if it's compatible
+ with all previous locks and all locks before it are also active.
+ To calculate the "compatible with all previous locks" all locks are
+ accumulated in prev_lock variable using lock_combining_matrix.
+
+ Lock upgrades: when a thread that has a lock on a given resource,
+ requests a new lock on the same resource and the old lock is not enough
+ to satisfy new lock requirements (which is defined by
+ lock_combining_matrix[old_lock][new_lock] != old_lock), a new lock is
+ placed in the list. Depending on other locks it is immediately active or
+ it will wait for other locks. Here's an exception to "locks are added
+ to the end" rule - upgraded locks are added after the last active lock
+ but before all waiting locks. Old lock (the one we upgraded from) is
+ not removed from the list, indeed it may be needed if the new lock was
+ in a savepoint that gets rolled back. So old lock is marked as "ignored"
+ (IGNORE_ME flag). New lock gets an UPGRADED flag.
+
+ Loose locks add an important exception to the above. Loose locks do not
+ always commute with other locks. In the list IX-LS both locks are active,
+ while in the LS-IX list only the first lock is active. This creates a
+ problem in lock upgrades. If the list was IX-LS and the owner of the
+ first lock wants to place LS lock (which can be immediately granted), the
+ IX lock is upgraded to LSIX and the list becomes IX-LS-LSIX, which,
+ according to the lock compatibility matrix means that the last lock is
+ waiting - of course it all happened because IX and LS were swapped and
+ they don't commute. To work around this there's ACTIVE flag which is set
+ in every lock that never waited (was placed active), and this flag
+ overrides "compatible with all previous locks" rule.
+
+ When a lock is placed to the end of the list it's either compatible with
+ all locks and all locks are active - new lock becomes active at once, or
+ it conflicts with some of the locks, in this case in the 'blocker'
+ variable a conflicting lock is returned and the calling thread waits on a
+ pthread condition in the LOCK_OWNER structure of the owner of the
+ conflicting lock. Or a new lock is compatible with all locks, but some
+ existing locks are not compatible with each other (example: request IS,
+ when the list is S-IX) - that is not all locks are active. In this case a
+ first waiting lock is returned in the 'blocker' variable, lockman_getlock()
+ notices that a "blocker" does not conflict with the requested lock, and
+ "dereferences" it, to find the lock that it's waiting on. The calling
+ thread than begins to wait on the same lock.
+
+ To better support table-row relations where one needs to lock the table
+ with an intention lock before locking the row, extended diagnostics is
+ provided. When an intention lock (presumably on a table) is granted,
+ lockman_getlock() returns one of GOT_THE_LOCK (no need to lock the row,
+ perhaps the thread already has a normal lock on this table),
+ GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE (need to lock the row, as usual),
+ GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE (only need to check
+ whether it's possible to lock the row, but no need to lock it - perhaps
+ the thread has a loose lock on this table). This is defined by
+ getlock_result[] table.
+*/
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <my_bit.h>
+#include <lf.h>
+#include "lockman.h"
+
+/*
+ Lock compatibility matrix.
+
+ It's asymmetric. Read it as "Somebody has the lock <value in the row
+ label>, can I set the lock <value in the column label> ?"
+
+ ') Though you can take LS lock while somebody has S lock, it makes no
+ sense - it's simpler to take S lock too.
+
+ 1 - compatible
+ 0 - incompatible
+ -1 - "impossible", so that we can assert the impossibility.
+*/
+static int lock_compatibility_matrix[10][10]=
+{ /* N S X IS IX SIX LS LX SLX LSIX */
+ { -1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */
+ { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* S */
+ { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* X */
+ { -1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, /* IS */
+ { -1, 0, 0, 1, 1, 0, 1, 1, 0, 1 }, /* IX */
+ { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 }, /* SIX */
+ { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* LS */
+ { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* LX */
+ { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* SLX */
+ { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 } /* LSIX */
+};
+
+/*
+ Lock combining matrix.
+
+ It's symmetric. Read it as "what lock level L is identical to the
+ set of two locks A and B"
+
+ One should never get N from it, we assert the impossibility
+*/
+static enum lock_type lock_combining_matrix[10][10]=
+{/* N S X IS IX SIX LS LX SLX LSIX */
+ { N, S, X, IS, IX, SIX, S, SLX, SLX, SIX}, /* N */
+ { S, S, X, S, SIX, SIX, S, SLX, SLX, SIX}, /* S */
+ { X, X, X, X, X, X, X, X, X, X}, /* X */
+ { IS, S, X, IS, IX, SIX, LS, LX, SLX, LSIX}, /* IS */
+ { IX, SIX, X, IX, IX, SIX, LSIX, LX, SLX, LSIX}, /* IX */
+ { SIX, SIX, X, SIX, SIX, SIX, SIX, SLX, SLX, SIX}, /* SIX */
+ { LS, S, X, LS, LSIX, SIX, LS, LX, SLX, LSIX}, /* LS */
+ { LX, SLX, X, LX, LX, SLX, LX, LX, SLX, LX}, /* LX */
+ { SLX, SLX, X, SLX, SLX, SLX, SLX, SLX, SLX, SLX}, /* SLX */
+ { LSIX, SIX, X, LSIX, LSIX, SIX, LSIX, LX, SLX, LSIX} /* LSIX */
+};
+
+#define REPEAT_ONCE_MORE 0
+#define OK_TO_PLACE_THE_LOCK 1
+#define OK_TO_PLACE_THE_REQUEST 2
+#define ALREADY_HAVE_THE_LOCK 4
+#define ALREADY_HAVE_THE_REQUEST 8
+#define PLACE_NEW_DISABLE_OLD 16
+#define REQUEST_NEW_DISABLE_OLD 32
+#define RESOURCE_WAS_UNLOCKED 64
+
+#define NEED_TO_WAIT (OK_TO_PLACE_THE_REQUEST | ALREADY_HAVE_THE_REQUEST |\
+ REQUEST_NEW_DISABLE_OLD)
+#define ALREADY_HAVE (ALREADY_HAVE_THE_LOCK | ALREADY_HAVE_THE_REQUEST)
+#define LOCK_UPGRADE (PLACE_NEW_DISABLE_OLD | REQUEST_NEW_DISABLE_OLD)
+
+
+/*
+ the return codes for lockman_getlock
+
+ It's asymmetric. Read it as "I have the lock <value in the row label>,
+ what value should be returned for <value in the column label> ?"
+
+ 0 means impossible combination (assert!)
+
+ Defines below help to preserve the table structure.
+ I/L/A values are self explanatory
+ x means the combination is possible (assert should not crash)
+ but it cannot happen in row locks, only in table locks (S,X),
+ or lock escalations (LS,LX)
+*/
+#define I GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE
+#define L GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE
+#define A GOT_THE_LOCK
+#define x GOT_THE_LOCK
+static enum lockman_getlock_result getlock_result[10][10]=
+{/* N S X IS IX SIX LS LX SLX LSIX */
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* N */
+ { 0, x, 0, A, 0, 0, x, 0, 0, 0}, /* S */
+ { 0, x, x, A, A, 0, x, x, 0, 0}, /* X */
+ { 0, 0, 0, I, 0, 0, 0, 0, 0, 0}, /* IS */
+ { 0, 0, 0, I, I, 0, 0, 0, 0, 0}, /* IX */
+ { 0, x, 0, A, I, 0, x, 0, 0, 0}, /* SIX */
+ { 0, 0, 0, L, 0, 0, x, 0, 0, 0}, /* LS */
+ { 0, 0, 0, L, L, 0, x, x, 0, 0}, /* LX */
+ { 0, x, 0, A, L, 0, x, x, 0, 0}, /* SLX */
+ { 0, 0, 0, L, I, 0, x, 0, 0, 0} /* LSIX */
+};
+#undef I
+#undef L
+#undef A
+#undef x
+
+LF_REQUIRE_PINS(4);
+
+typedef struct lockman_lock {
+ uint64 resource;
+ struct lockman_lock *lonext;
+ intptr volatile link;
+ uint32 hashnr;
+ /* QQ: TODO - remove hashnr from LOCK */
+ uint16 loid;
+ uchar lock; /* sizeof(uchar) <= sizeof(enum) */
+ uchar flags;
+} LOCK;
+
+#define IGNORE_ME 1
+#define UPGRADED 2
+#define ACTIVE 4
+
+typedef struct {
+ intptr volatile *prev;
+ LOCK *curr, *next;
+ LOCK *blocker, *upgrade_from;
+} CURSOR;
+
+#define PTR(V) (LOCK *)((V) & (~(intptr)1))
+#define DELETED(V) ((V) & 1)
+
+/*
+ NOTE
+ cursor is positioned in either case
+ pins[0..3] are used, they are NOT removed on return
+*/
+static int lockfind(LOCK * volatile *head, LOCK *node,
+ CURSOR *cursor, LF_PINS *pins)
+{
+ uint32 hashnr, cur_hashnr;
+ uint64 resource, cur_resource;
+ intptr link;
+ my_bool cur_active, compatible, upgrading, prev_active;
+ enum lock_type lock, prev_lock, cur_lock;
+ uint16 loid, cur_loid;
+ int cur_flags, flags;
+
+ hashnr= node->hashnr;
+ resource= node->resource;
+ lock= node->lock;
+ loid= node->loid;
+ flags= node->flags;
+
+retry:
+ cursor->prev= (intptr *)head;
+ prev_lock= N;
+ cur_active= TRUE;
+ compatible= TRUE;
+ upgrading= FALSE;
+ cursor->blocker= cursor->upgrade_from= 0;
+ _lf_unpin(pins, 3);
+ do {
+ cursor->curr= PTR(*cursor->prev);
+ _lf_pin(pins, 1, cursor->curr);
+ } while(*cursor->prev != (intptr)cursor->curr && LF_BACKOFF);
+ for (;;)
+ {
+ if (!cursor->curr)
+ break;
+ do {
+ link= cursor->curr->link;
+ cursor->next= PTR(link);
+ _lf_pin(pins, 0, cursor->next);
+ } while(link != cursor->curr->link && LF_BACKOFF);
+ cur_hashnr= cursor->curr->hashnr;
+ cur_resource= cursor->curr->resource;
+ cur_lock= cursor->curr->lock;
+ cur_loid= cursor->curr->loid;
+ cur_flags= cursor->curr->flags;
+ if (*cursor->prev != (intptr)cursor->curr)
+ {
+ (void)LF_BACKOFF;
+ goto retry;
+ }
+ if (!DELETED(link))
+ {
+ if (cur_hashnr > hashnr ||
+ (cur_hashnr == hashnr && cur_resource >= resource))
+ {
+ if (cur_hashnr > hashnr || cur_resource > resource)
+ break;
+ /* ok, we have a lock for this resource */
+ DBUG_ASSERT(lock_compatibility_matrix[prev_lock][cur_lock] >= 0);
+ DBUG_ASSERT(lock_compatibility_matrix[cur_lock][lock] >= 0);
+ if ((cur_flags & IGNORE_ME) && ! (flags & IGNORE_ME))
+ {
+ DBUG_ASSERT(cur_active);
+ if (cur_loid == loid)
+ cursor->upgrade_from= cursor->curr;
+ }
+ else
+ {
+ prev_active= cur_active;
+ if (cur_flags & ACTIVE)
+ DBUG_ASSERT(prev_active == TRUE);
+ else
+ cur_active&= lock_compatibility_matrix[prev_lock][cur_lock];
+ if (upgrading && !cur_active /*&& !(cur_flags & UPGRADED)*/)
+ break;
+ if (prev_active && !cur_active)
+ {
+ cursor->blocker= cursor->curr;
+ _lf_pin(pins, 3, cursor->curr);
+ }
+ if (cur_loid == loid)
+ {
+ /* we already have a lock on this resource */
+ DBUG_ASSERT(lock_combining_matrix[cur_lock][lock] != N);
+ DBUG_ASSERT(!upgrading || (flags & IGNORE_ME));
+ if (lock_combining_matrix[cur_lock][lock] == cur_lock)
+ {
+ /* new lock is compatible */
+ if (cur_active)
+ {
+ cursor->blocker= cursor->curr; /* loose-locks! */
+ _lf_unpin(pins, 3); /* loose-locks! */
+ return ALREADY_HAVE_THE_LOCK;
+ }
+ else
+ return ALREADY_HAVE_THE_REQUEST;
+ }
+ /* not compatible, upgrading */
+ upgrading= TRUE;
+ cursor->upgrade_from= cursor->curr;
+ }
+ else
+ {
+ if (!lock_compatibility_matrix[cur_lock][lock])
+ {
+ compatible= FALSE;
+ cursor->blocker= cursor->curr;
+ _lf_pin(pins, 3, cursor->curr);
+ }
+ }
+ prev_lock= lock_combining_matrix[prev_lock][cur_lock];
+ DBUG_ASSERT(prev_lock != N);
+ }
+ }
+ cursor->prev= &(cursor->curr->link);
+ _lf_pin(pins, 2, cursor->curr);
+ }
+ else
+ {
+ if (my_atomic_casptr((void **)cursor->prev,
+ (void **)&cursor->curr, cursor->next))
+ _lf_alloc_free(pins, cursor->curr);
+ else
+ {
+ (void)LF_BACKOFF;
+ goto retry;
+ }
+ }
+ cursor->curr= cursor->next;
+ _lf_pin(pins, 1, cursor->curr);
+ }
+ /*
+ either the end of lock list - no more locks for this resource,
+ or upgrading and the end of active lock list
+ */
+ if (upgrading)
+ {
+ if (compatible /*&& prev_active*/)
+ return PLACE_NEW_DISABLE_OLD;
+ else
+ return REQUEST_NEW_DISABLE_OLD;
+ }
+ if (cur_active && compatible)
+ {
+ /*
+ either no locks for this resource or all are compatible.
+ ok to place the lock in any case.
+ */
+ return prev_lock == N ? RESOURCE_WAS_UNLOCKED
+ : OK_TO_PLACE_THE_LOCK;
+ }
+ /* we have a lock conflict. ok to place a lock request. And wait */
+ return OK_TO_PLACE_THE_REQUEST;
+}
+
+/*
+ NOTE
+ it uses pins[0..3], on return pins 0..2 are removed, pin 3 (blocker) stays
+*/
+static int lockinsert(LOCK * volatile *head, LOCK *node, LF_PINS *pins,
+ LOCK **blocker)
+{
+ CURSOR cursor;
+ int res;
+
+ do
+ {
+ res= lockfind(head, node, &cursor, pins);
+ DBUG_ASSERT(res != ALREADY_HAVE_THE_REQUEST);
+ if (!(res & ALREADY_HAVE))
+ {
+ if (res & LOCK_UPGRADE)
+ {
+ node->flags|= UPGRADED;
+ node->lock= lock_combining_matrix[cursor.upgrade_from->lock][node->lock];
+ }
+ if (!(res & NEED_TO_WAIT))
+ node->flags|= ACTIVE;
+ node->link= (intptr)cursor.curr;
+ DBUG_ASSERT(node->link != (intptr)node);
+ DBUG_ASSERT(cursor.prev != &node->link);
+ if (!my_atomic_casptr((void **)cursor.prev, (void **)&cursor.curr, node))
+ {
+ res= REPEAT_ONCE_MORE;
+ node->flags&= ~ACTIVE;
+ }
+ if (res & LOCK_UPGRADE)
+ cursor.upgrade_from->flags|= IGNORE_ME;
+ /*
+ QQ: is this OK ? if a reader has already read upgrade_from,
+ it may find it conflicting with node :(
+ - see the last test from test_lockman_simple()
+ */
+ }
+
+ } while (res == REPEAT_ONCE_MORE);
+ _lf_unpin(pins, 0);
+ _lf_unpin(pins, 1);
+ _lf_unpin(pins, 2);
+ /*
+ note that blocker is not necessarily pinned here (when it's == curr).
+ this is ok as in such a case it's either a dummy node for
+ initialize_bucket() and dummy nodes don't need pinning,
+ or it's a lock of the same transaction for lockman_getlock,
+ and it cannot be removed by another thread
+ */
+ *blocker= cursor.blocker;
+ return res;
+}
+
+/*
+ NOTE
+ it uses pins[0..3], on return pins 0..2 are removed, pin 3 (blocker) stays
+*/
+static int lockpeek(LOCK * volatile *head, LOCK *node, LF_PINS *pins,
+ LOCK **blocker)
+{
+ CURSOR cursor;
+ int res;
+
+ res= lockfind(head, node, &cursor, pins);
+
+ _lf_unpin(pins, 0);
+ _lf_unpin(pins, 1);
+ _lf_unpin(pins, 2);
+ if (blocker)
+ *blocker= cursor.blocker;
+ return res;
+}
+
+/*
+ NOTE
+ it uses pins[0..3], on return all pins are removed.
+
+ One _must_ have the lock (or request) to call this
+*/
+static int lockdelete(LOCK * volatile *head, LOCK *node, LF_PINS *pins)
+{
+ CURSOR cursor;
+ int res;
+
+ do
+ {
+ res= lockfind(head, node, &cursor, pins);
+ DBUG_ASSERT(res & ALREADY_HAVE);
+
+ if (cursor.upgrade_from)
+ cursor.upgrade_from->flags&= ~IGNORE_ME;
+
+ /*
+ XXX this does not work with savepoints, as old lock is left ignored.
+ It cannot be unignored, as would basically mean moving the lock back
+ in the lock chain (from upgraded). And the latter is not allowed -
+ because it breaks list scanning. So old ignored lock must be deleted,
+ new - same - lock must be installed right after the lock we're deleting,
+ then we can delete. Good news is - this is only required when rolling
+ back a savepoint.
+ */
+ if (my_atomic_casptr((void **)&(cursor.curr->link),
+ (void **)&cursor.next, 1+(char *)cursor.next))
+ {
+ if (my_atomic_casptr((void **)cursor.prev,
+ (void **)&cursor.curr, cursor.next))
+ _lf_alloc_free(pins, cursor.curr);
+ else
+ lockfind(head, node, &cursor, pins);
+ }
+ else
+ {
+ res= REPEAT_ONCE_MORE;
+ if (cursor.upgrade_from)
+ cursor.upgrade_from->flags|= IGNORE_ME;
+ }
+ } while (res == REPEAT_ONCE_MORE);
+ _lf_unpin(pins, 0);
+ _lf_unpin(pins, 1);
+ _lf_unpin(pins, 2);
+ _lf_unpin(pins, 3);
+ return res;
+}
+
+void lockman_init(LOCKMAN *lm, loid_to_lo_func *func, uint timeout)
+{
+ lf_alloc_init(&lm->alloc, sizeof(LOCK), offsetof(LOCK, lonext));
+ lf_dynarray_init(&lm->array, sizeof(LOCK **));
+ lm->size= 1;
+ lm->count= 0;
+ lm->loid_to_lo= func;
+ lm->lock_timeout= timeout;
+}
+
+void lockman_destroy(LOCKMAN *lm)
+{
+ LOCK *el= *(LOCK **)_lf_dynarray_lvalue(&lm->array, 0);
+ while (el)
+ {
+ intptr next= el->link;
+ if (el->hashnr & 1)
+ lf_alloc_direct_free(&lm->alloc, el);
+ else
+ my_free((void *)el, MYF(0));
+ el= (LOCK *)next;
+ }
+ lf_alloc_destroy(&lm->alloc);
+ lf_dynarray_destroy(&lm->array);
+}
+
+/* TODO: optimize it */
+#define MAX_LOAD 1
+
+static void initialize_bucket(LOCKMAN *lm, LOCK * volatile *node,
+ uint bucket, LF_PINS *pins)
+{
+ int res;
+ uint parent= my_clear_highest_bit(bucket);
+ LOCK *dummy= (LOCK *)my_malloc(sizeof(LOCK), MYF(MY_WME));
+ LOCK **tmp= 0, *cur;
+ LOCK * volatile *el= _lf_dynarray_lvalue(&lm->array, parent);
+
+ if (*el == NULL && bucket)
+ initialize_bucket(lm, el, parent, pins);
+ dummy->hashnr= my_reverse_bits(bucket);
+ dummy->loid= 0;
+ dummy->lock= X; /* doesn't matter, in fact */
+ dummy->resource= 0;
+ dummy->flags= 0;
+ res= lockinsert(el, dummy, pins, &cur);
+ DBUG_ASSERT(res & (ALREADY_HAVE_THE_LOCK | RESOURCE_WAS_UNLOCKED));
+ if (res & ALREADY_HAVE_THE_LOCK)
+ {
+ my_free((void *)dummy, MYF(0));
+ dummy= cur;
+ }
+ my_atomic_casptr((void **)node, (void **)&tmp, dummy);
+}
+
+static inline uint calc_hash(uint64 resource)
+{
+ const uchar *pos= (uchar *)&resource;
+ ulong nr1= 1, nr2= 4, i;
+ for (i= 0; i < sizeof(resource) ; i++, pos++)
+ {
+ nr1^= (ulong) ((((uint) nr1 & 63)+nr2) * ((uint)*pos)) + (nr1 << 8);
+ nr2+= 3;
+ }
+ return nr1 & INT_MAX32;
+}
+
+/*
+ RETURN
+ see enum lockman_getlock_result
+ NOTE
+ uses pins[0..3], they're removed on return
+*/
+enum lockman_getlock_result lockman_getlock(LOCKMAN *lm, LOCK_OWNER *lo,
+ uint64 resource,
+ enum lock_type lock)
+{
+ int res;
+ uint csize, bucket, hashnr;
+ LOCK *node, * volatile *el, *blocker;
+ LF_PINS *pins= lo->pins;
+ enum lock_type old_lock;
+
+ DBUG_ASSERT(lo->loid);
+ lf_rwlock_by_pins(pins);
+ node= (LOCK *)_lf_alloc_new(pins);
+ node->flags= 0;
+ node->lock= lock;
+ node->loid= lo->loid;
+ node->resource= resource;
+ hashnr= calc_hash(resource);
+ bucket= hashnr % lm->size;
+ el= _lf_dynarray_lvalue(&lm->array, bucket);
+ if (*el == NULL)
+ initialize_bucket(lm, el, bucket, pins);
+ node->hashnr= my_reverse_bits(hashnr) | 1;
+ res= lockinsert(el, node, pins, &blocker);
+ if (res & ALREADY_HAVE)
+ {
+ int r;
+ old_lock= blocker->lock;
+ _lf_alloc_free(pins, node);
+ lf_rwunlock_by_pins(pins);
+ r= getlock_result[old_lock][lock];
+ DBUG_ASSERT(r);
+ return r;
+ }
+ /* a new value was added to the hash */
+ csize= lm->size;
+ if ((my_atomic_add32(&lm->count, 1)+1.0) / csize > MAX_LOAD)
+ my_atomic_cas32(&lm->size, &csize, csize*2);
+ node->lonext= lo->all_locks;
+ lo->all_locks= node;
+ for ( ; res & NEED_TO_WAIT; res= lockpeek(el, node, pins, &blocker))
+ {
+ LOCK_OWNER *wait_for_lo;
+ ulonglong deadline;
+ struct timespec timeout;
+
+ _lf_assert_pin(pins, 3); /* blocker must be pinned here */
+ wait_for_lo= lm->loid_to_lo(blocker->loid);
+
+ /*
+ now, this is tricky. blocker is not necessarily a LOCK
+ we're waiting for. If it's compatible with what we want,
+ then we're waiting for a lock that blocker is waiting for
+ (see two places where blocker is set in lockfind)
+ In the latter case, let's "dereference" it
+ */
+ if (lock_compatibility_matrix[blocker->lock][lock])
+ {
+ blocker= wait_for_lo->all_locks;
+ _lf_pin(pins, 3, blocker);
+ if (blocker != wait_for_lo->all_locks)
+ continue;
+ wait_for_lo= wait_for_lo->waiting_for;
+ }
+
+ /*
+ note that the blocker transaction may have ended by now,
+ its LOCK_OWNER and short id were reused, so 'wait_for_lo' may point
+ to an unrelated - albeit valid - LOCK_OWNER
+ */
+ if (!wait_for_lo)
+ continue;
+
+ lo->waiting_for= wait_for_lo;
+ lf_rwunlock_by_pins(pins);
+
+ /*
+ We lock a mutex - it may belong to a wrong LOCK_OWNER, but it must
+ belong to _some_ LOCK_OWNER. It means, we can never free() a LOCK_OWNER,
+ if there're other active LOCK_OWNERs.
+ */
+ /* QQ: race condition here */
+ pthread_mutex_lock(wait_for_lo->mutex);
+ if (DELETED(blocker->link))
+ {
+ /*
+ blocker transaction was ended, or a savepoint that owned
+ the lock was rolled back. Either way - the lock was removed
+ */
+ pthread_mutex_unlock(wait_for_lo->mutex);
+ lf_rwlock_by_pins(pins);
+ continue;
+ }
+
+ /* yuck. waiting */
+ deadline= my_getsystime() + lm->lock_timeout * 10000;
+ timeout.tv_sec= deadline/10000000;
+ timeout.tv_nsec= (deadline % 10000000) * 100;
+ do
+ {
+ pthread_cond_timedwait(wait_for_lo->cond, wait_for_lo->mutex, &timeout);
+ } while (!DELETED(blocker->link) && my_getsystime() < deadline);
+ pthread_mutex_unlock(wait_for_lo->mutex);
+ lf_rwlock_by_pins(pins);
+ if (!DELETED(blocker->link))
+ {
+ /*
+ timeout.
+ note that we _don't_ release the lock request here.
+ Instead we're relying on the caller to abort the transaction,
+ and release all locks at once - see lockman_release_locks()
+ */
+ _lf_unpin(pins, 3);
+ lf_rwunlock_by_pins(pins);
+ return DIDNT_GET_THE_LOCK;
+ }
+ }
+ lo->waiting_for= 0;
+ _lf_assert_unpin(pins, 3); /* unpin should not be needed */
+ lf_rwunlock_by_pins(pins);
+ return getlock_result[lock][lock];
+}
+
+/*
+ RETURN
+ 0 - deleted
+ 1 - didn't (not found)
+ NOTE
+ see lockdelete() for pin usage notes
+*/
+int lockman_release_locks(LOCKMAN *lm, LOCK_OWNER *lo)
+{
+ LOCK * volatile *el, *node, *next;
+ uint bucket;
+ LF_PINS *pins= lo->pins;
+
+ pthread_mutex_lock(lo->mutex);
+ lf_rwlock_by_pins(pins);
+ for (node= lo->all_locks; node; node= next)
+ {
+ next= node->lonext;
+ bucket= calc_hash(node->resource) % lm->size;
+ el= _lf_dynarray_lvalue(&lm->array, bucket);
+ if (*el == NULL)
+ initialize_bucket(lm, el, bucket, pins);
+ lockdelete(el, node, pins);
+ my_atomic_add32(&lm->count, -1);
+ }
+ lf_rwunlock_by_pins(pins);
+ lo->all_locks= 0;
+ /* now signal all waiters */
+ pthread_cond_broadcast(lo->cond);
+ pthread_mutex_unlock(lo->mutex);
+ return 0;
+}
+
+#ifdef MY_LF_EXTRA_DEBUG
+static const char *lock2str[]=
+{ "N", "S", "X", "IS", "IX", "SIX", "LS", "LX", "SLX", "LSIX" };
+/*
+ NOTE
+ the function below is NOT thread-safe !!!
+*/
+void print_lockhash(LOCKMAN *lm)
+{
+ LOCK *el= *(LOCK **)_lf_dynarray_lvalue(&lm->array, 0);
+ printf("hash: size %u count %u\n", lm->size, lm->count);
+ while (el)
+ {
+ intptr next= el->link;
+ if (el->hashnr & 1)
+ {
+ printf("0x%08lx { resource %lu, loid %u, lock %s",
+ (long) el->hashnr, (ulong) el->resource, el->loid,
+ lock2str[el->lock]);
+ if (el->flags & IGNORE_ME) printf(" IGNORE_ME");
+ if (el->flags & UPGRADED) printf(" UPGRADED");
+ if (el->flags & ACTIVE) printf(" ACTIVE");
+ if (DELETED(next)) printf(" ***DELETED***");
+ printf("}\n");
+ }
+ else
+ {
+ /*printf("0x%08x { dummy }\n", el->hashnr);*/
+ DBUG_ASSERT(el->resource == 0 && el->loid == 0 && el->lock == X);
+ }
+ el= PTR(next);
+ }
+}
+#endif
diff --git a/storage/maria/lockman.h b/storage/maria/lockman.h
new file mode 100644
index 00000000000..279a5537f76
--- /dev/null
+++ b/storage/maria/lockman.h
@@ -0,0 +1,76 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _lockman_h
+#define _lockman_h
+
+/*
+ Lock levels:
+ ^^^^^^^^^^^
+
+ N - "no lock", not a lock, used sometimes internally to simplify the code
+ S - Shared
+ X - eXclusive
+ IS - Intention Shared
+ IX - Intention eXclusive
+ SIX - Shared + Intention eXclusive
+ LS - Loose Shared
+ LX - Loose eXclusive
+ SLX - Shared + Loose eXclusive
+ LSIX - Loose Shared + Intention eXclusive
+*/
+enum lock_type { N, S, X, IS, IX, SIX, LS, LX, SLX, LSIX, LOCK_TYPE_LAST };
+
+struct lockman_lock;
+
+typedef struct st_lock_owner LOCK_OWNER;
+struct st_lock_owner {
+ LF_PINS *pins; /* must be allocated from lockman's pinbox */
+ struct lockman_lock *all_locks; /* a LIFO */
+ LOCK_OWNER *waiting_for;
+ pthread_cond_t *cond; /* transactions waiting for this, wait on 'cond' */
+ pthread_mutex_t *mutex; /* mutex is required to use 'cond' */
+ uint16 loid;
+};
+
+typedef LOCK_OWNER *loid_to_lo_func(uint16);
+typedef struct {
+ LF_DYNARRAY array; /* hash itself */
+ LF_ALLOCATOR alloc; /* allocator for elements */
+ int32 volatile size; /* size of array */
+ int32 volatile count; /* number of elements in the hash */
+ uint lock_timeout;
+ loid_to_lo_func *loid_to_lo;
+} LOCKMAN;
+#define DIDNT_GET_THE_LOCK 0
+enum lockman_getlock_result {
+ NO_MEMORY_FOR_LOCK=1, DEADLOCK, LOCK_TIMEOUT,
+ GOT_THE_LOCK,
+ GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE,
+ GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE
+};
+
+void lockman_init(LOCKMAN *, loid_to_lo_func *, uint);
+void lockman_destroy(LOCKMAN *);
+enum lockman_getlock_result lockman_getlock(LOCKMAN *lm, LOCK_OWNER *lo,
+ uint64 resource,
+ enum lock_type lock);
+int lockman_release_locks(LOCKMAN *, LOCK_OWNER *);
+
+#ifdef EXTRA_DEBUG
+void print_lockhash(LOCKMAN *lm);
+#endif
+
+#endif
diff --git a/storage/maria/ma_bitmap.c b/storage/maria/ma_bitmap.c
new file mode 100644
index 00000000000..684f5e16ffa
--- /dev/null
+++ b/storage/maria/ma_bitmap.c
@@ -0,0 +1,2077 @@
+/* Copyright (C) 2007 Michael Widenius
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Bitmap handling (for records in block)
+
+ The data file starts with a bitmap page, followed by as many data
+ pages as the bitmap can cover. After this there is a new bitmap page
+ and more data pages etc.
+
+ The bitmap code assumes there is always an active bitmap page and thus
+ that there is at least one bitmap page in the file
+
+ Structure of bitmap page:
+
+ Fixed size records (to be implemented later):
+
+ 2 bits are used to indicate:
+
+ 0 Empty
+ 1 0-75 % full (at least room for 2 records)
+ 2 75-100 % full (at least room for one record)
+ 3 100 % full (no more room for records)
+
+ Assuming 8K pages, this will allow us to map:
+ 8192 (bytes per page) * 4 (pages mapped per byte) * 8192 (page size)= 256M
+
+ (For Maria this will be 7*4 * 8192 = 224K smaller because of LSN)
+
+ Note that for fixed size rows, we can't add more columns without doing
+ a full reorganization of the table. The user can always force a dynamic
+ size row format by specifying ROW_FORMAT=dynamic.
+
+
+ Dynamic size records:
+
+ 3 bits are used to indicate
+
+ 0 Empty page
+ 1 0-30 % full (at least room for 3 records)
+ 2 30-60 % full (at least room for 2 records)
+ 3 60-90 % full (at least room for one record)
+ 4 100 % full (no more room for records)
+ 5 Tail page, 0-40 % full
+ 6 Tail page, 40-80 % full
+ 7 Full tail page or full blob page
+
+ Assuming 8K pages, this will allow us to map:
+ 8192 (bytes per page) * 8 bits/byte / 3 bits/page * 8192 (page size)= 170.7M
+
+ Note that values 1-3 may be adjust for each individual table based on
+ 'min record length'. Tail pages are for overflow data which can be of
+ any size and thus doesn't have to be adjusted for different tables.
+ If we add more columns to the table, some of the originally calculated
+ 'cut off' points may not be optimal, but they shouldn't be 'drasticly
+ wrong'.
+
+ When allocating data from the bitmap, we are trying to do it in a
+ 'best fit' manner. Blobs and varchar blocks are given out in large
+ continuous extents to allow fast access to these. Before allowing a
+ row to 'flow over' to other blocks, we will compact the page and use
+ all space on it. If there is many rows in the page, we will ensure
+ there is *LEFT_TO_GROW_ON_SPLIT* bytes left on the page to allow other
+ rows to grow.
+
+ The bitmap format allows us to extend the row file in big chunks, if needed.
+
+ When calculating the size for a packed row, we will calculate the following
+ things separately:
+ - Row header + null_bits + empty_bits fixed size segments etc.
+ - Size of all char/varchar fields
+ - Size of each blob field
+
+ The bitmap handler will get all the above information and return
+ either one page or a set of pages to put the different parts.
+
+ Bitmaps are read on demand in response to insert/delete/update operations.
+ The following bitmap pointers will be cached and stored on disk on close:
+ - Current insert_bitmap; When inserting new data we will first try to
+ fill this one.
+ - First bitmap which is not completely full. This is updated when we
+ free data with an update or delete.
+
+ While flushing out bitmaps, we will cache the status of the bitmap in memory
+ to avoid having to read a bitmap for insert of new data that will not
+ be of any use
+ - Total empty space
+ - Largest number of continuous pages
+
+ Bitmap ONLY goes to disk in the following scenarios
+ - The file is closed (and we flush all changes to disk)
+ - On checkpoint
+ (Ie: When we do a checkpoint, we have to ensure that all bitmaps are
+ put on disk even if they are not in the page cache).
+ - When explicitely requested (for example on backup or after recvoery,
+ to simplify things)
+
+ The flow of writing a row is that:
+ - Lock the bitmap
+ - Decide which data pages we will write to
+ - Mark them full in the bitmap page so that other threads do not try to
+ use the same data pages as us
+ - We unlock the bitmap
+ - Write the data pages
+ - Lock the bitmap
+ - Correct the bitmap page with the true final occupation of the data
+ pages (that is, we marked pages full but when we are done we realize
+ we didn't fill them)
+ - Unlock the bitmap.
+*/
+
+#include "maria_def.h"
+#include "ma_blockrec.h"
+
+/* Number of pages to store blob parts */
+#define BLOB_SEGMENT_MIN_SIZE 128
+
+#define FULL_HEAD_PAGE 4
+#define FULL_TAIL_PAGE 7
+
+/** all bitmap pages end with this 2-byte signature */
+uchar maria_bitmap_marker[2]= {(uchar) 'b',(uchar) 'm'};
+
+static my_bool _ma_read_bitmap_page(MARIA_SHARE *share,
+ MARIA_FILE_BITMAP *bitmap,
+ ulonglong page);
+
+
+/* Write bitmap page to key cache */
+
+static inline my_bool write_changed_bitmap(MARIA_SHARE *share,
+ MARIA_FILE_BITMAP *bitmap)
+{
+ DBUG_ASSERT(share->pagecache->block_size == bitmap->block_size);
+ return (pagecache_write(share->pagecache,
+ &bitmap->file, bitmap->page, 0,
+ (uchar*) bitmap->map, PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY, 0));
+}
+
+/*
+ Initialize bitmap variables in share
+
+ SYNOPSIS
+ _ma_bitmap_init()
+ share Share handler
+ file data file handler
+
+ NOTES
+ This is called the first time a file is opened.
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+my_bool _ma_bitmap_init(MARIA_SHARE *share, File file)
+{
+ uint aligned_bit_blocks;
+ uint max_page_size;
+ MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+ uint size= share->block_size;
+#ifndef DBUG_OFF
+ /* We want to have a copy of the bitmap to be able to print differences */
+ size*= 2;
+#endif
+
+ if (!(bitmap->map= (uchar*) my_malloc(size, MYF(MY_WME))))
+ return 1;
+
+ bitmap->file.file= file;
+ bitmap->changed= 0;
+ bitmap->block_size= share->block_size;
+ /* Size needs to be alligned on 6 */
+ aligned_bit_blocks= share->block_size / 6;
+ bitmap->total_size= aligned_bit_blocks * 6;
+ /*
+ In each 6 bytes, we have 6*8/3 = 16 pages covered
+ The +1 is to add the bitmap page, as this doesn't have to be covered
+ */
+ bitmap->pages_covered= aligned_bit_blocks * 16 + 1;
+
+ /* Update size for bits */
+ /* TODO; Make this dependent of the row size */
+ max_page_size= share->block_size - PAGE_OVERHEAD_SIZE;
+ bitmap->sizes[0]= max_page_size; /* Empty page */
+ bitmap->sizes[1]= max_page_size - max_page_size * 30 / 100;
+ bitmap->sizes[2]= max_page_size - max_page_size * 60 / 100;
+ bitmap->sizes[3]= max_page_size - max_page_size * 90 / 100;
+ bitmap->sizes[4]= 0; /* Full page */
+ bitmap->sizes[5]= max_page_size - max_page_size * 40 / 100;
+ bitmap->sizes[6]= max_page_size - max_page_size * 80 / 100;
+ bitmap->sizes[7]= 0;
+
+ pthread_mutex_init(&share->bitmap.bitmap_lock, MY_MUTEX_INIT_SLOW);
+
+ /*
+ We can't read a page yet, as in some case we don't have an active
+ page cache yet.
+ Pretend we have a dummy, full and not changed bitmap page in memory.
+ */
+
+ bitmap->page= ~(ulonglong) 0;
+ bitmap->used_size= bitmap->total_size;
+ bfill(bitmap->map, share->block_size, 255);
+ if (share->state.first_bitmap_with_space == ~(ulonglong) 0)
+ {
+ /* Start scanning for free space from start of file */
+ share->state.first_bitmap_with_space = 0;
+ }
+ return 0;
+}
+
+
+/*
+ Free data allocated by _ma_bitmap_init
+
+ SYNOPSIS
+ _ma_bitmap_end()
+ share Share handler
+*/
+
+my_bool _ma_bitmap_end(MARIA_SHARE *share)
+{
+ my_bool res= _ma_flush_bitmap(share);
+ pthread_mutex_destroy(&share->bitmap.bitmap_lock);
+ my_free((uchar*) share->bitmap.map, MYF(MY_ALLOW_ZERO_PTR));
+ share->bitmap.map= 0;
+ return res;
+}
+
+
+/*
+ Send updated bitmap to the page cache
+
+ SYNOPSIS
+ _ma_flush_bitmap()
+ share Share handler
+
+ NOTES
+ In the future, _ma_flush_bitmap() will be called to flush changes don't
+ by this thread (ie, checking the changed flag is ok). The reason we
+ check it again in the mutex is that if someone else did a flush at the
+ same time, we don't have to do the write.
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+my_bool _ma_flush_bitmap(MARIA_SHARE *share)
+{
+ my_bool res= 0;
+ DBUG_ENTER("_ma_flush_bitmap");
+ if (share->bitmap.changed)
+ {
+ pthread_mutex_lock(&share->bitmap.bitmap_lock);
+ if (share->bitmap.changed)
+ {
+ res= write_changed_bitmap(share, &share->bitmap);
+ share->bitmap.changed= 0;
+ }
+ pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+ }
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Intialize bitmap in memory to a zero bitmap
+
+ SYNOPSIS
+ _ma_bitmap_delete_all()
+ share Share handler
+
+ NOTES
+ This is called on maria_delete_all_rows (truncate data file).
+*/
+
+void _ma_bitmap_delete_all(MARIA_SHARE *share)
+{
+ MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+ if (bitmap->map) /* Not in create */
+ {
+ bzero(bitmap->map, bitmap->block_size);
+ memcpy(bitmap->map + bitmap->block_size - sizeof(maria_bitmap_marker),
+ maria_bitmap_marker, sizeof(maria_bitmap_marker));
+ bitmap->changed= 1;
+ bitmap->page= 0;
+ bitmap->used_size= bitmap->total_size;
+ }
+}
+
+
+/*
+ Return bitmap pattern for the smallest head block that can hold 'size'
+
+ SYNOPSIS
+ size_to_head_pattern()
+ bitmap Bitmap
+ size Requested size
+
+ RETURN
+ 0-3 For a description of the bitmap sizes, see the header
+*/
+
+static uint size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size)
+{
+ if (size <= bitmap->sizes[3])
+ return 3;
+ if (size <= bitmap->sizes[2])
+ return 2;
+ if (size <= bitmap->sizes[1])
+ return 1;
+ DBUG_ASSERT(size <= bitmap->sizes[0]);
+ return 0;
+}
+
+
+/*
+ Return bitmap pattern for head block where there is size bytes free
+
+ SYNOPSIS
+ _ma_free_size_to_head_pattern()
+ bitmap Bitmap
+ size Requested size
+
+ RETURN
+ 0-4 (Possible bitmap patterns for head block)
+*/
+
+uint _ma_free_size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size)
+{
+ if (size < bitmap->sizes[3])
+ return 4;
+ if (size < bitmap->sizes[2])
+ return 3;
+ if (size < bitmap->sizes[1])
+ return 2;
+ return (size < bitmap->sizes[0]) ? 1 : 0;
+}
+
+
+/*
+ Return bitmap pattern for the smallest tail block that can hold 'size'
+
+ SYNOPSIS
+ size_to_tail_pattern()
+ bitmap Bitmap
+ size Requested size
+
+ RETURN
+ 0, 5 or 6 For a description of the bitmap sizes, see the header
+*/
+
+static uint size_to_tail_pattern(MARIA_FILE_BITMAP *bitmap, uint size)
+{
+ if (size <= bitmap->sizes[6])
+ return 6;
+ if (size <= bitmap->sizes[5])
+ return 5;
+ DBUG_ASSERT(size <= bitmap->sizes[0]);
+ return 0;
+}
+
+
+/*
+ Return bitmap pattern for tail block where there is size bytes free
+
+ SYNOPSIS
+ free_size_to_tail_pattern()
+ bitmap Bitmap
+ size Requested size
+
+ RETURN
+ 0, 5, 6, 7 For a description of the bitmap sizes, see the header
+*/
+
+static uint free_size_to_tail_pattern(MARIA_FILE_BITMAP *bitmap, uint size)
+{
+ if (size >= bitmap->sizes[0])
+ return 0; /* Revert to empty page */
+ if (size < bitmap->sizes[6])
+ return 7;
+ if (size < bitmap->sizes[5])
+ return 6;
+ return 5;
+}
+
+
+/*
+ Return size guranteed to be available on a page
+
+ SYNOPSIS
+ pattern_to_head_size()
+ bitmap Bitmap
+ pattern Pattern (0-7)
+
+ RETURN
+ 0 - block_size
+*/
+
+static inline uint pattern_to_size(MARIA_FILE_BITMAP *bitmap, uint pattern)
+{
+ DBUG_ASSERT(pattern <= 7);
+ return bitmap->sizes[pattern];
+}
+
+
+/*
+ Print bitmap for debugging
+
+ SYNOPSIS
+ _ma_print_bitmap()
+ bitmap Bitmap to print
+
+ IMPLEMENTATION
+ Prints all changed bits since last call to _ma_print_bitmap().
+ This is done by having a copy of the last bitmap in
+ bitmap->map+bitmap->block_size.
+*/
+
+#ifndef DBUG_OFF
+
+const char *bits_to_txt[]=
+{
+ "empty", "00-30% full", "30-60% full", "60-90% full", "full",
+ "tail 00-40 % full", "tail 40-80 % full", "tail/blob full"
+};
+
+static void _ma_print_bitmap(MARIA_FILE_BITMAP *bitmap)
+{
+ uchar *pos, *end, *org_pos;
+ ulong page;
+
+ end= bitmap->map + bitmap->used_size;
+ DBUG_LOCK_FILE;
+ fprintf(DBUG_FILE,"\nBitmap page changes at page %lu\n",
+ (ulong) bitmap->page);
+
+ DBUG_ASSERT(memcmp(bitmap->map + bitmap->block_size -
+ sizeof(maria_bitmap_marker),
+ maria_bitmap_marker, sizeof(maria_bitmap_marker)) == 0);
+
+ page= (ulong) bitmap->page+1;
+ for (pos= bitmap->map, org_pos= bitmap->map + bitmap->block_size ;
+ pos < end ;
+ pos+= 6, org_pos+= 6)
+ {
+ ulonglong bits= uint6korr(pos); /* 6 bytes = 6*8/3= 16 patterns */
+ ulonglong org_bits= uint6korr(org_pos);
+ uint i;
+
+ /*
+ Test if there is any changes in the next 16 bitmaps (to not have to
+ loop through all bits if we know they are the same)
+ */
+ if (bits != org_bits)
+ {
+ for (i= 0; i < 16 ; i++, bits>>= 3, org_bits>>= 3)
+ {
+ if ((bits & 7) != (org_bits & 7))
+ fprintf(DBUG_FILE, "Page: %8lu %s -> %s\n", page+i,
+ bits_to_txt[org_bits & 7], bits_to_txt[bits & 7]);
+ }
+ }
+ page+= 16;
+ }
+ fputc('\n', DBUG_FILE);
+ DBUG_UNLOCK_FILE;
+ memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size);
+}
+
+#endif /* DBUG_OFF */
+
+
+/***************************************************************************
+ Reading & writing bitmap pages
+***************************************************************************/
+
+/*
+ Read a given bitmap page
+
+ SYNOPSIS
+ read_bitmap_page()
+ info Maria handler
+ bitmap Bitmap handler
+ page Page to read
+
+ TODO
+ Update 'bitmap->used_size' to real size of used bitmap
+
+ NOTE
+ We don't always have share->bitmap.bitmap_lock here
+ (when called from_ma_check_bitmap_data() for example).
+
+ RETURN
+ 0 ok
+ 1 error (Error writing old bitmap or reading bitmap page)
+*/
+
+static my_bool _ma_read_bitmap_page(MARIA_SHARE *share,
+ MARIA_FILE_BITMAP *bitmap,
+ ulonglong page)
+{
+ my_off_t end_of_page= (page + 1) * bitmap->block_size;
+ my_bool res;
+ DBUG_ENTER("_ma_read_bitmap_page");
+ DBUG_ASSERT(page % bitmap->pages_covered == 0);
+
+ bitmap->page= page;
+ if (end_of_page > share->state.state.data_file_length)
+ {
+ /*
+ Inexistent or half-created page (could be crash in the middle of
+ _ma_bitmap_create_first(), before appending maria_bitmap_marker).
+ */
+ share->state.state.data_file_length= end_of_page;
+ bzero(bitmap->map, bitmap->block_size);
+ memcpy(bitmap->map + bitmap->block_size - sizeof(maria_bitmap_marker),
+ maria_bitmap_marker, sizeof(maria_bitmap_marker));
+ bitmap->used_size= 0;
+#ifndef DBUG_OFF
+ memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size);
+#endif
+ DBUG_RETURN(0);
+ }
+ bitmap->used_size= bitmap->total_size;
+ DBUG_ASSERT(share->pagecache->block_size == bitmap->block_size);
+ res= ((pagecache_read(share->pagecache,
+ (PAGECACHE_FILE*)&bitmap->file, page, 0,
+ (uchar*) bitmap->map,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == NULL) ||
+ memcmp(bitmap->map + bitmap->block_size -
+ sizeof(maria_bitmap_marker),
+ maria_bitmap_marker, sizeof(maria_bitmap_marker)));
+#ifndef DBUG_OFF
+ if (!res)
+ memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size);
+#endif
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Change to another bitmap page
+
+ SYNOPSIS
+ _ma_change_bitmap_page()
+ info Maria handler
+ bitmap Bitmap handler
+ page Bitmap page to read
+
+ NOTES
+ If old bitmap was changed, write it out before reading new one
+ We return empty bitmap if page is outside of file size
+
+ RETURN
+ 0 ok
+ 1 error (Error writing old bitmap or reading bitmap page)
+*/
+
+static my_bool _ma_change_bitmap_page(MARIA_HA *info,
+ MARIA_FILE_BITMAP *bitmap,
+ ulonglong page)
+{
+ DBUG_ENTER("_ma_change_bitmap_page");
+
+ if (bitmap->changed)
+ {
+ if (write_changed_bitmap(info->s, bitmap))
+ DBUG_RETURN(1);
+ bitmap->changed= 0;
+ }
+ DBUG_RETURN(_ma_read_bitmap_page(info->s, bitmap, page));
+}
+
+
+/*
+ Read next suitable bitmap
+
+ SYNOPSIS
+ move_to_next_bitmap()
+ bitmap Bitmap handle
+
+ NOTES
+ The found bitmap may be full, so calling function may need to call this
+ repeatedly until it finds enough space.
+
+ TODO
+ Add cache of bitmaps to not read something that is not usable
+
+ RETURN
+ 0 ok
+ 1 error (either couldn't save old bitmap or read new one
+*/
+
+static my_bool move_to_next_bitmap(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap)
+{
+ ulonglong page= bitmap->page;
+ MARIA_STATE_INFO *state= &info->s->state;
+ DBUG_ENTER("move_to_next_bitmap");
+
+ if (state->first_bitmap_with_space != ~(ulonglong) 0 &&
+ state->first_bitmap_with_space != page)
+ {
+ page= state->first_bitmap_with_space;
+ state->first_bitmap_with_space= ~(ulonglong) 0;
+ }
+ else
+ page+= bitmap->pages_covered;
+ DBUG_RETURN(_ma_change_bitmap_page(info, bitmap, page));
+}
+
+
+/****************************************************************************
+ Allocate data in bitmaps
+****************************************************************************/
+
+/*
+ Store data in 'block' and mark the place used in the bitmap
+
+ SYNOPSIS
+ fill_block()
+ bitmap Bitmap handle
+ block Store data about what we found
+ best_data Pointer to best 6 uchar aligned area in bitmap->map
+ best_pos Which bit in *best_data the area starts
+ 0 = first bit pattern, 1 second bit pattern etc
+ best_bits The original value of the bits at best_pos
+ fill_pattern Bitmap pattern to store in best_data[best_pos]
+
+ NOTES
+ We mark all pages to be 'TAIL's, which means that
+ block->page_count is really a row position inside the page.
+*/
+
+static void fill_block(MARIA_FILE_BITMAP *bitmap,
+ MARIA_BITMAP_BLOCK *block,
+ uchar *best_data, uint best_pos, uint best_bits,
+ uint fill_pattern)
+{
+ uint page, offset, tmp;
+ uchar *data;
+
+ /* For each 6 bytes we have 6*8/3= 16 patterns */
+ page= (best_data - bitmap->map) / 6 * 16 + best_pos;
+ block->page= bitmap->page + 1 + page;
+ block->page_count= 1 + TAIL_BIT;
+ block->empty_space= pattern_to_size(bitmap, best_bits);
+ block->sub_blocks= 1;
+ block->org_bitmap_value= best_bits;
+ block->used= BLOCKUSED_TAIL; /* See _ma_bitmap_release_unused() */
+
+ /*
+ Mark place used by reading/writing 2 bytes at a time to handle
+ bitmaps in overlapping bytes
+ */
+ best_pos*= 3;
+ data= best_data+ best_pos / 8;
+ offset= best_pos & 7;
+ tmp= uint2korr(data);
+
+ /* we turn off the 3 bits and replace them with fill_pattern */
+ tmp= (tmp & ~(7 << offset)) | (fill_pattern << offset);
+ int2store(data, tmp);
+ bitmap->changed= 1;
+ DBUG_EXECUTE("bitmap", _ma_print_bitmap(bitmap););
+}
+
+
+/*
+ Allocate data for head block
+
+ SYNOPSIS
+ allocate_head()
+ bitmap bitmap
+ size Size of data region we need to store
+ block Store found information here
+
+ IMPLEMENTATION
+ Find the best-fit page to put a region of 'size'
+ This is defined as the first page of the set of pages
+ with the smallest free space that can hold 'size'.
+
+ RETURN
+ 0 ok (block is updated)
+ 1 error (no space in bitmap; block is not touched)
+*/
+
+
+static my_bool allocate_head(MARIA_FILE_BITMAP *bitmap, uint size,
+ MARIA_BITMAP_BLOCK *block)
+{
+ uint min_bits= size_to_head_pattern(bitmap, size);
+ uchar *data= bitmap->map, *end= data + bitmap->used_size;
+ uchar *best_data= 0;
+ uint best_bits= (uint) -1, best_pos;
+ DBUG_ENTER("allocate_head");
+
+ LINT_INIT(best_pos);
+ DBUG_ASSERT(size <= FULL_PAGE_SIZE(bitmap->block_size));
+
+ for (; data < end; data += 6)
+ {
+ ulonglong bits= uint6korr(data); /* 6 bytes = 6*8/3= 16 patterns */
+ uint i;
+
+ /*
+ Skip common patterns
+ We can skip empty pages (if we already found a match) or
+ anything matching the following pattern as this will be either
+ a full page or a tail page
+ */
+ if ((!bits && best_data) ||
+ ((bits & LL(04444444444444444)) == LL(04444444444444444)))
+ continue;
+ for (i= 0; i < 16 ; i++, bits >>= 3)
+ {
+ uint pattern= bits & 7;
+ if (pattern <= min_bits)
+ {
+ /* There is enough space here */
+ if (pattern == min_bits)
+ {
+ /* There is exactly enough space here, return this page */
+ best_bits= min_bits;
+ best_data= data;
+ best_pos= i;
+ goto found;
+ }
+ if ((int) pattern > (int) best_bits)
+ {
+ /*
+ There is more than enough space here and it's better than what
+ we have found so far. Remember it, as we will choose it if we
+ don't find anything in this bitmap page.
+ */
+ best_bits= pattern;
+ best_data= data;
+ best_pos= i;
+ }
+ }
+ }
+ }
+ if (!best_data) /* Found no place */
+ {
+ if (bitmap->used_size == bitmap->total_size)
+ DBUG_RETURN(1); /* No space in bitmap */
+ /* Allocate data at end of bitmap */
+ bitmap->used_size+= 6;
+ best_data= data;
+ best_pos= best_bits= 0;
+ }
+
+found:
+ fill_block(bitmap, block, best_data, best_pos, best_bits, FULL_HEAD_PAGE);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Allocate data for tail block
+
+ SYNOPSIS
+ allocate_tail()
+ bitmap bitmap
+ size Size of block we need to find
+ block Store found information here
+
+ RETURN
+ 0 ok (block is updated)
+ 1 error (no space in bitmap; block is not touched)
+*/
+
+
+static my_bool allocate_tail(MARIA_FILE_BITMAP *bitmap, uint size,
+ MARIA_BITMAP_BLOCK *block)
+{
+ uint min_bits= size_to_tail_pattern(bitmap, size);
+ uchar *data= bitmap->map, *end= data + bitmap->used_size;
+ uchar *best_data= 0;
+ uint best_bits= (uint) -1, best_pos;
+ DBUG_ENTER("allocate_tail");
+ DBUG_PRINT("enter", ("size: %u", size));
+
+ LINT_INIT(best_pos);
+ DBUG_ASSERT(size <= FULL_PAGE_SIZE(bitmap->block_size));
+
+ for (; data < end; data += 6)
+ {
+ ulonglong bits= uint6korr(data); /* 6 bytes = 6*8/3= 16 patterns */
+ uint i;
+
+ /*
+ Skip common patterns
+ We can skip empty pages (if we already found a match) or
+ the following patterns: 1-4 (head pages, not suitable for tail) or
+ 7 (full tail page). See 'Dynamic size records' comment at start of file.
+
+ At the moment we only skip full tail pages (ie, all bits are
+ set) as this is easy to detect with one simple test and is a
+ quite common case if we have blobs.
+ */
+
+ if ((!bits && best_data) || bits == LL(0xffffffffffff))
+ continue;
+ for (i= 0; i < 16; i++, bits >>= 3)
+ {
+ uint pattern= bits & 7;
+ if (pattern <= min_bits && (!pattern || pattern >= 5))
+ {
+ if (pattern == min_bits)
+ {
+ best_bits= min_bits;
+ best_data= data;
+ best_pos= i;
+ goto found;
+ }
+ if ((int) pattern > (int) best_bits)
+ {
+ best_bits= pattern;
+ best_data= data;
+ best_pos= i;
+ }
+ }
+ }
+ }
+ if (!best_data)
+ {
+ if (bitmap->used_size == bitmap->total_size)
+ DBUG_RETURN(1);
+ /* Allocate data at end of bitmap */
+ best_data= end;
+ bitmap->used_size+= 6;
+ best_pos= best_bits= 0;
+ }
+
+found:
+ fill_block(bitmap, block, best_data, best_pos, best_bits, FULL_TAIL_PAGE);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Allocate data for full blocks
+
+ SYNOPSIS
+ allocate_full_pages()
+ bitmap bitmap
+ pages_needed Total size in pages (bitmap->total_size) we would like to have
+ block Store found information here
+ full_page 1 if we are not allowed to split extent
+
+ IMPLEMENTATION
+ We will return the smallest area >= size. If there is no such
+ block, we will return the biggest area that satisfies
+ area_size >= min(BLOB_SEGMENT_MIN_SIZE*full_page_size, size)
+
+ To speed up searches, we will only consider areas that has at least 16 free
+ pages starting on an even boundary. When finding such an area, we will
+ extend it with all previous and following free pages. This will ensure
+ we don't get holes between areas
+
+ RETURN
+ # Blocks used
+ 0 error (no space in bitmap; block is not touched)
+*/
+
+static ulong allocate_full_pages(MARIA_FILE_BITMAP *bitmap,
+ ulong pages_needed,
+ MARIA_BITMAP_BLOCK *block, my_bool full_page)
+{
+ uchar *data= bitmap->map, *data_end= data + bitmap->used_size;
+ uchar *page_end= data + bitmap->total_size;
+ uchar *best_data= 0;
+ uint min_size;
+ uint best_area_size, best_prefix_area_size, best_suffix_area_size;
+ uint page, size;
+ ulonglong best_prefix_bits;
+ DBUG_ENTER("allocate_full_pages");
+ DBUG_PRINT("enter", ("pages_needed: %lu", pages_needed));
+
+ /* Following variables are only used if best_data is set */
+ LINT_INIT(best_prefix_bits);
+ LINT_INIT(best_prefix_area_size);
+ LINT_INIT(best_suffix_area_size);
+
+ min_size= pages_needed;
+ if (!full_page && min_size > BLOB_SEGMENT_MIN_SIZE)
+ min_size= BLOB_SEGMENT_MIN_SIZE;
+ best_area_size= ~(uint) 0;
+
+ for (; data < page_end; data+= 6)
+ {
+ ulonglong bits= uint6korr(data); /* 6 bytes = 6*8/3= 16 patterns */
+ uchar *data_start;
+ ulonglong prefix_bits= 0;
+ uint area_size, prefix_area_size, suffix_area_size;
+
+ /* Find area with at least 16 free pages */
+ if (bits)
+ continue;
+ data_start= data;
+ /* Find size of area */
+ for (data+=6 ; data < data_end ; data+= 6)
+ {
+ if ((bits= uint6korr(data)))
+ break;
+ }
+ area_size= (data - data_start) / 6 * 16;
+ if (area_size >= best_area_size)
+ continue;
+ prefix_area_size= suffix_area_size= 0;
+ if (!bits)
+ {
+ /*
+ End of page; All the rest of the bits on page are part of area
+ This is needed because bitmap->used_size only covers the set bits
+ in the bitmap.
+ */
+ area_size+= (page_end - data) / 6 * 16;
+ if (area_size >= best_area_size)
+ break;
+ data= page_end;
+ }
+ else
+ {
+ /* Add bits at end of page */
+ for (; !(bits & 7); bits >>= 3)
+ suffix_area_size++;
+ area_size+= suffix_area_size;
+ }
+ if (data_start != bitmap->map)
+ {
+ /* Add bits before page */
+ bits= prefix_bits= uint6korr(data_start - 6);
+ DBUG_ASSERT(bits != 0);
+ /* 111 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 */
+ if (!(bits & LL(07000000000000000)))
+ {
+ data_start-= 6;
+ do
+ {
+ prefix_area_size++;
+ bits<<= 3;
+ } while (!(bits & LL(07000000000000000)));
+ area_size+= prefix_area_size;
+ /* Calculate offset to page from data_start */
+ prefix_area_size= 16 - prefix_area_size;
+ }
+ }
+ if (area_size >= min_size && area_size <= best_area_size)
+ {
+ best_data= data_start;
+ best_area_size= area_size;
+ best_prefix_bits= prefix_bits;
+ best_prefix_area_size= prefix_area_size;
+ best_suffix_area_size= suffix_area_size;
+
+ /* Prefer to put data in biggest possible area */
+ if (area_size <= pages_needed)
+ min_size= area_size;
+ else
+ min_size= pages_needed;
+ }
+ }
+ if (!best_data)
+ DBUG_RETURN(0); /* No room on page */
+
+ /*
+ Now allocate min(pages_needed, area_size), starting from
+ best_start + best_prefix_area_size
+ */
+ if (best_area_size > pages_needed)
+ best_area_size= pages_needed;
+
+ /* For each 6 bytes we have 6*8/3= 16 patterns */
+ page= ((best_data - bitmap->map) * 8) / 3 + best_prefix_area_size;
+ block->page= bitmap->page + 1 + page;
+ block->page_count= best_area_size;
+ block->empty_space= 0;
+ block->sub_blocks= 1;
+ block->org_bitmap_value= 0;
+ block->used= 0;
+ DBUG_PRINT("info", ("page: %lu page_count: %u",
+ (ulong) block->page, block->page_count));
+
+ if (best_prefix_area_size)
+ {
+ ulonglong tmp;
+ /* Convert offset back to bits */
+ best_prefix_area_size= 16 - best_prefix_area_size;
+ if (best_area_size < best_prefix_area_size)
+ {
+ tmp= (LL(1) << best_area_size*3) - 1;
+ best_area_size= best_prefix_area_size; /* for easy end test */
+ }
+ else
+ tmp= (LL(1) << best_prefix_area_size*3) - 1;
+ tmp<<= (16 - best_prefix_area_size) * 3;
+ DBUG_ASSERT((best_prefix_bits & tmp) == 0);
+ best_prefix_bits|= tmp;
+ int6store(best_data, best_prefix_bits);
+ if (!(best_area_size-= best_prefix_area_size))
+ {
+ DBUG_EXECUTE("bitmap", _ma_print_bitmap(bitmap););
+ DBUG_RETURN(block->page_count);
+ }
+ best_data+= 6;
+ }
+ best_area_size*= 3; /* Bits to set */
+ size= best_area_size/8; /* Bytes to set */
+ bfill(best_data, size, 255);
+ best_data+= size;
+ if ((best_area_size-= size * 8))
+ {
+ /* fill last uchar */
+ *best_data|= (uchar) ((1 << best_area_size) -1);
+ best_data++;
+ }
+ if (data_end < best_data)
+ bitmap->used_size= (uint) (best_data - bitmap->map);
+ bitmap->changed= 1;
+ DBUG_EXECUTE("bitmap", _ma_print_bitmap(bitmap););
+ DBUG_RETURN(block->page_count);
+}
+
+
+/****************************************************************************
+ Find right bitmaps where to store data
+****************************************************************************/
+
+/*
+ Find right bitmap and position for head block
+
+ SYNOPSIS
+ find_head()
+ info Maria handler
+ length Size of data region we need store
+ position Position in bitmap_blocks where to store the
+ information for the head block.
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool find_head(MARIA_HA *info, uint length, uint position)
+{
+ MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+ MARIA_BITMAP_BLOCK *block;
+ /*
+ There is always place for the head block in bitmap_blocks as these are
+ preallocated at _ma_init_block_record().
+ */
+ block= dynamic_element(&info->bitmap_blocks, position, MARIA_BITMAP_BLOCK *);
+
+ while (allocate_head(bitmap, length, block))
+ if (move_to_next_bitmap(info, bitmap))
+ return 1;
+ return 0;
+}
+
+
+/*
+ Find right bitmap and position for tail
+
+ SYNOPSIS
+ find_tail()
+ info Maria handler
+ length Size of data region we need store
+ position Position in bitmap_blocks where to store the
+ information for the head block.
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool find_tail(MARIA_HA *info, uint length, uint position)
+{
+ MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+ MARIA_BITMAP_BLOCK *block;
+ DBUG_ENTER("find_tail");
+
+ /* Needed, as there is no error checking in dynamic_element */
+ if (allocate_dynamic(&info->bitmap_blocks, position))
+ DBUG_RETURN(1);
+ block= dynamic_element(&info->bitmap_blocks, position, MARIA_BITMAP_BLOCK *);
+
+ while (allocate_tail(bitmap, length, block))
+ if (move_to_next_bitmap(info, bitmap))
+ DBUG_RETURN(1);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Find right bitmap and position for full blocks in one extent
+
+ SYNOPSIS
+ find_mid()
+ info Maria handler.
+ pages How many pages to allocate.
+ position Position in bitmap_blocks where to store the
+ information for the head block.
+ NOTES
+ This is used to allocate the main extent after the 'head' block
+ (Ie, the middle part of the head-middle-tail entry)
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool find_mid(MARIA_HA *info, ulong pages, uint position)
+{
+ MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+ MARIA_BITMAP_BLOCK *block;
+ block= dynamic_element(&info->bitmap_blocks, position, MARIA_BITMAP_BLOCK *);
+
+ while (!allocate_full_pages(bitmap, pages, block, 1))
+ {
+ if (move_to_next_bitmap(info, bitmap))
+ return 1;
+ }
+ return 0;
+}
+
+
+/*
+ Find right bitmap and position for putting a blob
+
+ SYNOPSIS
+ find_blob()
+ info Maria handler.
+ length Length of the blob
+
+ NOTES
+ The extents are stored last in info->bitmap_blocks
+
+ IMPLEMENTATION
+ Allocate all full pages for the block + optionally one tail
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool find_blob(MARIA_HA *info, ulong length)
+{
+ MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+ uint full_page_size= FULL_PAGE_SIZE(info->s->block_size);
+ ulong pages;
+ uint rest_length, used;
+ uint first_block_pos;
+ MARIA_BITMAP_BLOCK *first_block= 0;
+ DBUG_ENTER("find_blob");
+ DBUG_PRINT("enter", ("length: %lu", length));
+
+ pages= length / full_page_size;
+ rest_length= (uint) (length - pages * full_page_size);
+ if (rest_length >= MAX_TAIL_SIZE(info->s->block_size))
+ {
+ pages++;
+ rest_length= 0;
+ }
+
+ if (pages)
+ {
+ MARIA_BITMAP_BLOCK *block;
+ if (allocate_dynamic(&info->bitmap_blocks,
+ info->bitmap_blocks.elements +
+ pages / BLOB_SEGMENT_MIN_SIZE + 2))
+ DBUG_RETURN(1);
+ first_block_pos= info->bitmap_blocks.elements;
+ block= dynamic_element(&info->bitmap_blocks, info->bitmap_blocks.elements,
+ MARIA_BITMAP_BLOCK*);
+ first_block= block;
+ do
+ {
+ used= allocate_full_pages(bitmap,
+ (pages >= 65535 ? 65535 : (uint) pages), block,
+ 0);
+ if (!used)
+ {
+ if (move_to_next_bitmap(info, bitmap))
+ DBUG_RETURN(1);
+ }
+ else
+ {
+ pages-= used;
+ info->bitmap_blocks.elements++;
+ block++;
+ }
+ } while (pages != 0);
+ }
+ if (rest_length && find_tail(info, rest_length,
+ info->bitmap_blocks.elements++))
+ DBUG_RETURN(1);
+ if (first_block)
+ first_block->sub_blocks= info->bitmap_blocks.elements - first_block_pos;
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Find pages to put ALL blobs
+
+ SYNOPSIS
+ allocate_blobs()
+ info Maria handler
+ row Information of what is in the row (from calc_record_size())
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool allocate_blobs(MARIA_HA *info, MARIA_ROW *row)
+{
+ ulong *length, *end;
+ uint elements;
+ /*
+ Reserve size for:
+ head block
+ one extent
+ tail block
+ */
+ elements= info->bitmap_blocks.elements;
+ for (length= row->blob_lengths, end= length + info->s->base.blobs;
+ length < end; length++)
+ {
+ if (*length && find_blob(info, *length))
+ return 1;
+ }
+ row->extents_count= (info->bitmap_blocks.elements - elements);
+ return 0;
+}
+
+
+/*
+ Store in the bitmap the new size for a head page
+
+ SYNOPSIS
+ use_head()
+ info Maria handler
+ page Page number to update
+ (Note that caller guarantees this is in the active
+ bitmap)
+ size How much free space is left on the page
+ block_position In which info->bitmap_block we have the
+ information about the head block.
+
+ NOTES
+ This is used on update where we are updating an existing head page
+*/
+
+static void use_head(MARIA_HA *info, ulonglong page, uint size,
+ uint block_position)
+{
+ MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+ MARIA_BITMAP_BLOCK *block;
+ uchar *data;
+ uint offset, tmp, offset_page;
+
+ block= dynamic_element(&info->bitmap_blocks, block_position,
+ MARIA_BITMAP_BLOCK*);
+ block->page= page;
+ block->page_count= 1 + TAIL_BIT;
+ block->empty_space= size;
+ block->sub_blocks= 1;
+ block->used= BLOCKUSED_TAIL;
+
+ /*
+ Mark place used by reading/writing 2 bytes at a time to handle
+ bitmaps in overlapping bytes
+ */
+ offset_page= (uint) (page - bitmap->page - 1) * 3;
+ offset= offset_page & 7;
+ data= bitmap->map + offset_page / 8;
+ tmp= uint2korr(data);
+ block->org_bitmap_value= (tmp >> offset) & 7;
+ tmp= (tmp & ~(7 << offset)) | (FULL_HEAD_PAGE << offset);
+ int2store(data, tmp);
+ bitmap->changed= 1;
+ DBUG_EXECUTE("bitmap", _ma_print_bitmap(bitmap););
+}
+
+
+/*
+ Find out where to split the row (ie, what goes in head, middle, tail etc)
+
+ SYNOPSIS
+ find_where_to_split_row()
+ share Maria share
+ row Information of what is in the row (from calc_record_size())
+ extents_length Number of bytes needed to store all extents
+ split_size Free size on the page (The head length must be less
+ than this)
+
+ RETURN
+ row_length for the head block.
+*/
+
+static uint find_where_to_split_row(MARIA_SHARE *share, MARIA_ROW *row,
+ uint extents_length, uint split_size)
+{
+ uint row_length= row->base_length;
+ uint *lengths, *lengths_end;
+
+ DBUG_ASSERT(row_length < split_size);
+ /*
+ Store first in all_field_lengths the different parts that are written
+ to the row. This needs to be in same order as in
+ ma_block_rec.c::write_block_record()
+ */
+ row->null_field_lengths[-3]= extents_length;
+ row->null_field_lengths[-2]= share->base.fixed_not_null_fields_length;
+ row->null_field_lengths[-1]= row->field_lengths_length;
+ for (lengths= row->null_field_lengths - EXTRA_LENGTH_FIELDS,
+ lengths_end= (lengths + share->base.pack_fields - share->base.blobs +
+ EXTRA_LENGTH_FIELDS); lengths < lengths_end; lengths++)
+ {
+ if (row_length + *lengths > split_size)
+ break;
+ row_length+= *lengths;
+ }
+ return row_length;
+}
+
+
+/*
+ Find where to write the middle parts of the row and the tail
+
+ SYNOPSIS
+ write_rest_of_head()
+ info Maria handler
+ position Position in bitmap_blocks. Is 0 for rows that needs
+ full blocks (ie, has a head, middle part and optional tail)
+ rest_length How much left of the head block to write.
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool write_rest_of_head(MARIA_HA *info, uint position,
+ ulong rest_length)
+{
+ MARIA_SHARE *share= info->s;
+ uint full_page_size= FULL_PAGE_SIZE(share->block_size);
+ MARIA_BITMAP_BLOCK *block;
+ DBUG_ENTER("write_rest_of_head");
+ DBUG_PRINT("enter", ("position: %u rest_length: %lu", position,
+ rest_length));
+
+ if (position == 0)
+ {
+ /* Write out full pages */
+ uint pages= rest_length / full_page_size;
+
+ rest_length%= full_page_size;
+ if (rest_length >= MAX_TAIL_SIZE(share->block_size))
+ {
+ /* Put tail on a full page */
+ pages++;
+ rest_length= 0;
+ }
+ if (find_mid(info, pages, 1))
+ DBUG_RETURN(1);
+ /*
+ Insert empty block after full pages, to allow write_block_record() to
+ split segment into used + free page
+ */
+ block= dynamic_element(&info->bitmap_blocks, 2, MARIA_BITMAP_BLOCK*);
+ block->page_count= 0;
+ block->used= 0;
+ }
+ if (rest_length)
+ {
+ if (find_tail(info, rest_length, ELEMENTS_RESERVED_FOR_MAIN_PART - 1))
+ DBUG_RETURN(1);
+ }
+ else
+ {
+ /* Empty tail block */
+ block= dynamic_element(&info->bitmap_blocks,
+ ELEMENTS_RESERVED_FOR_MAIN_PART - 1,
+ MARIA_BITMAP_BLOCK *);
+ block->page_count= 0;
+ block->used= 0;
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Find where to store one row
+
+ SYNPOSIS
+ _ma_bitmap_find_place()
+ info Maria handler
+ row Information about row to write
+ blocks Store data about allocated places here
+
+ RETURN
+ 0 ok
+ row->space_on_head_page contains minimum number of bytes we
+ expect to put on the head page.
+ 1 error
+*/
+
+my_bool _ma_bitmap_find_place(MARIA_HA *info, MARIA_ROW *row,
+ MARIA_BITMAP_BLOCKS *blocks)
+{
+ MARIA_SHARE *share= info->s;
+ my_bool res= 1;
+ uint full_page_size, position, max_page_size;
+ uint head_length, row_length, rest_length, extents_length;
+ DBUG_ENTER("_ma_bitmap_find_place");
+
+ blocks->count= 0;
+ blocks->tail_page_skipped= blocks->page_skipped= 0;
+ row->extents_count= 0;
+
+ /*
+ Reserve place for the following blocks:
+ - Head block
+ - Full page block
+ - Marker block to allow write_block_record() to split full page blocks
+ into full and free part
+ - Tail block
+ */
+
+ info->bitmap_blocks.elements= ELEMENTS_RESERVED_FOR_MAIN_PART;
+ max_page_size= (share->block_size - PAGE_OVERHEAD_SIZE);
+
+ pthread_mutex_lock(&share->bitmap.bitmap_lock);
+
+ if (row->total_length <= max_page_size)
+ {
+ /* Row fits in one page */
+ position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1;
+ if (find_head(info, (uint) row->total_length, position))
+ goto abort;
+ row->space_on_head_page= row->total_length;
+ goto end;
+ }
+
+ /*
+ First allocate all blobs (so that we can find out the needed size for
+ the main block.
+ */
+ if (row->blob_length && allocate_blobs(info, row))
+ goto abort;
+
+ extents_length= row->extents_count * ROW_EXTENT_SIZE;
+ if ((head_length= (row->head_length + extents_length)) <= max_page_size)
+ {
+ /* Main row part fits into one page */
+ position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1;
+ if (find_head(info, head_length, position))
+ goto abort;
+ row->space_on_head_page= head_length;
+ goto end;
+ }
+
+ /* Allocate enough space */
+ head_length+= ELEMENTS_RESERVED_FOR_MAIN_PART * ROW_EXTENT_SIZE;
+
+ /* The first segment size is stored in 'row_length' */
+ row_length= find_where_to_split_row(share, row, extents_length,
+ max_page_size);
+
+ full_page_size= FULL_PAGE_SIZE(share->block_size);
+ position= 0;
+ if (head_length - row_length <= full_page_size)
+ position= ELEMENTS_RESERVED_FOR_MAIN_PART -2; /* Only head and tail */
+ if (find_head(info, row_length, position))
+ goto abort;
+ row->space_on_head_page= row_length;
+ rest_length= head_length - row_length;
+ if (write_rest_of_head(info, position, rest_length))
+ goto abort;
+
+end:
+ blocks->block= dynamic_element(&info->bitmap_blocks, position,
+ MARIA_BITMAP_BLOCK*);
+ blocks->block->sub_blocks= ELEMENTS_RESERVED_FOR_MAIN_PART - position;
+ /* First block's page_count is for all blocks */
+ blocks->count= info->bitmap_blocks.elements - position;
+ res= 0;
+
+abort:
+ pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Find where to put row on update (when head page is already defined)
+
+ SYNPOSIS
+ _ma_bitmap_find_new_place()
+ info Maria handler
+ row Information about row to write
+ page On which page original row was stored
+ free_size Free size on head page
+ blocks Store data about allocated places here
+
+ NOTES
+ This function is only called when the new row can't fit in the space of
+ the old row in the head page.
+
+ This is essently same as _ma_bitmap_find_place() except that
+ we don't call find_head() to search in bitmaps where to put the page.
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+my_bool _ma_bitmap_find_new_place(MARIA_HA *info, MARIA_ROW *row,
+ ulonglong page, uint free_size,
+ MARIA_BITMAP_BLOCKS *blocks)
+{
+ MARIA_SHARE *share= info->s;
+ my_bool res= 1;
+ uint full_page_size, position;
+ uint head_length, row_length, rest_length, extents_length;
+ DBUG_ENTER("_ma_bitmap_find_new_place");
+
+ blocks->count= 0;
+ blocks->tail_page_skipped= blocks->page_skipped= 0;
+ row->extents_count= 0;
+ info->bitmap_blocks.elements= ELEMENTS_RESERVED_FOR_MAIN_PART;
+
+ pthread_mutex_lock(&share->bitmap.bitmap_lock);
+ if (share->bitmap.page != page / share->bitmap.pages_covered &&
+ _ma_change_bitmap_page(info, &share->bitmap,
+ page / share->bitmap.pages_covered))
+ goto abort;
+
+ /*
+ First allocate all blobs (so that we can find out the needed size for
+ the main block.
+ */
+ if (row->blob_length && allocate_blobs(info, row))
+ goto abort;
+
+ extents_length= row->extents_count * ROW_EXTENT_SIZE;
+ if ((head_length= (row->head_length + extents_length)) <= free_size)
+ {
+ /* Main row part fits into one page */
+ position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1;
+ use_head(info, page, head_length, position);
+ goto end;
+ }
+
+ /* Allocate enough space */
+ head_length+= ELEMENTS_RESERVED_FOR_MAIN_PART * ROW_EXTENT_SIZE;
+
+ /* The first segment size is stored in 'row_length' */
+ row_length= find_where_to_split_row(share, row, extents_length, free_size);
+
+ full_page_size= FULL_PAGE_SIZE(share->block_size);
+ position= 0;
+ if (head_length - row_length <= full_page_size)
+ position= ELEMENTS_RESERVED_FOR_MAIN_PART -2; /* Only head and tail */
+ use_head(info, page, row_length, position);
+ rest_length= head_length - row_length;
+
+ if (write_rest_of_head(info, position, rest_length))
+ goto abort;
+
+end:
+ blocks->block= dynamic_element(&info->bitmap_blocks, position,
+ MARIA_BITMAP_BLOCK*);
+ blocks->block->sub_blocks= ELEMENTS_RESERVED_FOR_MAIN_PART - position;
+ /* First block's page_count is for all blocks */
+ blocks->count= info->bitmap_blocks.elements - position;
+ res= 0;
+
+abort:
+ pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+ DBUG_RETURN(res);
+}
+
+
+/****************************************************************************
+ Clear and reset bits
+****************************************************************************/
+
+/*
+ Set fill pattern for a page
+
+ set_page_bits()
+ info Maria handler
+ bitmap Bitmap handler
+ page Adress to page
+ fill_pattern Pattern (not size) for page
+
+ NOTES
+ Page may not be part of active bitmap
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool set_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap,
+ ulonglong page, uint fill_pattern)
+{
+ ulonglong bitmap_page;
+ uint offset_page, offset, tmp, org_tmp;
+ uchar *data;
+ DBUG_ENTER("set_page_bits");
+
+ bitmap_page= page - page % bitmap->pages_covered;
+ if (bitmap_page != bitmap->page &&
+ _ma_change_bitmap_page(info, bitmap, bitmap_page))
+ DBUG_RETURN(1);
+
+ /* Find page number from start of bitmap */
+ offset_page= page - bitmap->page - 1;
+ /*
+ Mark place used by reading/writing 2 bytes at a time to handle
+ bitmaps in overlapping bytes
+ */
+ offset_page*= 3;
+ offset= offset_page & 7;
+ data= bitmap->map + offset_page / 8;
+ org_tmp= tmp= uint2korr(data);
+ tmp= (tmp & ~(7 << offset)) | (fill_pattern << offset);
+ if (tmp == org_tmp)
+ DBUG_RETURN(0); /* No changes */
+ int2store(data, tmp);
+
+ bitmap->changed= 1;
+ DBUG_EXECUTE("bitmap", _ma_print_bitmap(bitmap););
+ if (fill_pattern != 3 && fill_pattern != 7)
+ set_if_smaller(info->s->state.first_bitmap_with_space, bitmap_page);
+ /*
+ Note that if the condition above is false (page is full), and all pages of
+ this bitmap are now full, and that bitmap page was
+ first_bitmap_with_space, we don't modify first_bitmap_with_space, indeed
+ its value still tells us where to start our search for a bitmap with space
+ (which is for sure after this full one).
+ That does mean that first_bitmap_with_space is only a lower bound.
+ */
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Get bitmap pattern for a given page
+
+ SYNOPSIS
+ get_page_bits()
+ info Maria handler
+ bitmap Bitmap handler
+ page Page number
+
+ RETURN
+ 0-7 Bitmap pattern
+ ~0 Error (couldn't read page)
+*/
+
+static uint get_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap,
+ ulonglong page)
+{
+ ulonglong bitmap_page;
+ uint offset_page, offset, tmp;
+ uchar *data;
+ DBUG_ENTER("get_page_bits");
+
+ bitmap_page= page - page % bitmap->pages_covered;
+ if (bitmap_page != bitmap->page &&
+ _ma_change_bitmap_page(info, bitmap, bitmap_page))
+ DBUG_RETURN(~ (uint) 0);
+
+ /* Find page number from start of bitmap */
+ offset_page= page - bitmap->page - 1;
+ /*
+ Mark place used by reading/writing 2 bytes at a time to handle
+ bitmaps in overlapping bytes
+ */
+ offset_page*= 3;
+ offset= offset_page & 7;
+ data= bitmap->map + offset_page / 8;
+ tmp= uint2korr(data);
+ DBUG_RETURN((tmp >> offset) & 7);
+}
+
+
+/*
+ Mark all pages in a region as free
+
+ SYNOPSIS
+ _ma_reset_full_page_bits()
+ info Maria handler
+ bitmap Bitmap handler
+ page Start page
+ page_count Number of pages
+
+ NOTES
+ We assume that all pages in region is covered by same bitmap
+ One must have a lock on info->s->bitmap.bitmap_lock
+
+ RETURN
+ 0 ok
+ 1 Error (when reading bitmap)
+*/
+
+my_bool _ma_reset_full_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap,
+ ulonglong page, uint page_count)
+{
+ ulonglong bitmap_page;
+ uint offset, bit_start, bit_count, tmp;
+ uchar *data;
+ DBUG_ENTER("_ma_reset_full_page_bits");
+ DBUG_PRINT("enter", ("page: %lu page_count: %u", (ulong) page, page_count));
+ safe_mutex_assert_owner(&info->s->bitmap.bitmap_lock);
+
+ bitmap_page= page - page % bitmap->pages_covered;
+ if (bitmap_page != bitmap->page &&
+ _ma_change_bitmap_page(info, bitmap, bitmap_page))
+ DBUG_RETURN(1);
+
+ /* Find page number from start of bitmap */
+ page= page - bitmap->page - 1;
+
+ /* Clear bits from 'page * 3' -> '(page + page_count) * 3' */
+ bit_start= page * 3;
+ bit_count= page_count * 3;
+
+ data= bitmap->map + bit_start / 8;
+ offset= bit_start & 7;
+
+ tmp= (255 << offset); /* Bits to keep */
+ if (bit_count + offset < 8)
+ {
+ /* Only clear bits between 'offset' and 'offset+bit_count-1' */
+ tmp^= (255 << (offset + bit_count));
+ }
+ *data&= ~tmp;
+
+ if ((int) (bit_count-= (8 - offset)) > 0)
+ {
+ uint fill;
+ data++;
+ /*
+ -1 is here to avoid one 'if' statement and to let the following code
+ handle the last byte
+ */
+ if ((fill= (bit_count - 1) / 8))
+ {
+ bzero(data, fill);
+ data+= fill;
+ }
+ bit_count-= fill * 8; /* Bits left to clear */
+ tmp= (1 << bit_count) - 1;
+ *data&= ~tmp;
+ }
+ set_if_smaller(info->s->state.first_bitmap_with_space, bitmap_page);
+ bitmap->changed= 1;
+ DBUG_EXECUTE("bitmap", _ma_print_bitmap(bitmap););
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Correct bitmap pages to reflect the true allocation
+
+ SYNOPSIS
+ _ma_bitmap_release_unused()
+ info Maria handle
+ blocks Bitmap blocks
+
+ IMPLEMENTATION
+ If block->used & BLOCKUSED_TAIL is set:
+ If block->used & BLOCKUSED_USED is set, then the bits for the
+ corresponding page is set according to block->empty_space
+ If block->used & BLOCKUSED_USED is not set, then the bits for
+ the corresponding page is set to org_bitmap_value;
+
+ If block->used & BLOCKUSED_TAIL is not set:
+ if block->used is not set, the bits for the corresponding page are
+ cleared
+
+ For the first block (head block) the logic is same as for a tail block
+
+ Note that we may have 'filler blocks' that are used to split a block
+ in half; These can be recognized by that they have page_count == 0.
+
+ RETURN
+ 0 ok
+ 1 error (Couldn't write or read bitmap page)
+*/
+
+my_bool _ma_bitmap_release_unused(MARIA_HA *info, MARIA_BITMAP_BLOCKS *blocks)
+{
+ MARIA_BITMAP_BLOCK *block= blocks->block, *end= block + blocks->count;
+ MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+ uint bits, current_bitmap_value;
+ DBUG_ENTER("_ma_bitmap_release_unused");
+
+ /*
+ We can skip FULL_HEAD_PAGE (4) as the page was marked as 'full'
+ when we allocated space in the page
+ */
+ current_bitmap_value= FULL_HEAD_PAGE;
+
+ pthread_mutex_lock(&info->s->bitmap.bitmap_lock);
+
+ /* First handle head block */
+ if (block->used & BLOCKUSED_USED)
+ {
+ DBUG_PRINT("info", ("head empty_space: %u", block->empty_space));
+ bits= _ma_free_size_to_head_pattern(bitmap, block->empty_space);
+ if (block->used & BLOCKUSED_USE_ORG_BITMAP)
+ current_bitmap_value= block->org_bitmap_value;
+ }
+ else
+ bits= block->org_bitmap_value;
+ if (bits != current_bitmap_value &&
+ set_page_bits(info, bitmap, block->page, bits))
+ goto err;
+
+
+ /* Handle all full pages and tail pages (for head page and blob) */
+ for (block++; block < end; block++)
+ {
+ uint page_count;
+ if (!block->page_count)
+ continue; /* Skip 'filler blocks' */
+
+ page_count= block->page_count;
+ if (block->used & BLOCKUSED_TAIL)
+ {
+ /* The bitmap page is only one page */
+ page_count= 1;
+ if (block->used & BLOCKUSED_USED)
+ {
+ DBUG_PRINT("info", ("tail empty_space: %u", block->empty_space));
+ bits= free_size_to_tail_pattern(bitmap, block->empty_space);
+ }
+ else
+ bits= block->org_bitmap_value;
+
+ /*
+ The page has all bits set; The following test is an optimization
+ to not set the bits to the same value as before.
+ */
+ if (bits != FULL_TAIL_PAGE &&
+ set_page_bits(info, bitmap, block->page, bits))
+ goto err;
+ }
+ if (!(block->used & BLOCKUSED_USED) &&
+ _ma_reset_full_page_bits(info, bitmap,
+ block->page, page_count))
+ goto err;
+ }
+ pthread_mutex_unlock(&info->s->bitmap.bitmap_lock);
+ DBUG_RETURN(0);
+
+err:
+ pthread_mutex_unlock(&info->s->bitmap.bitmap_lock);
+ DBUG_RETURN(1);
+}
+
+
+/*
+ Free full pages from bitmap and pagecache
+
+ SYNOPSIS
+ _ma_bitmap_free_full_pages()
+ info Maria handle
+ extents Extents (as stored on disk)
+ count Number of extents
+
+ IMPLEMENTATION
+ Mark all full pages (not tails) from extents as free, both in bitmap
+ and page cache.
+
+ RETURN
+ 0 ok
+ 1 error (Couldn't write or read bitmap page)
+*/
+
+my_bool _ma_bitmap_free_full_pages(MARIA_HA *info, const uchar *extents,
+ uint count)
+{
+ DBUG_ENTER("_ma_bitmap_free_full_pages");
+
+ pthread_mutex_lock(&info->s->bitmap.bitmap_lock);
+ for (; count--; extents += ROW_EXTENT_SIZE)
+ {
+ ulonglong page= uint5korr(extents);
+ uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
+ if (!(page_count & TAIL_BIT))
+ {
+ if (pagecache_delete_pages(info->s->pagecache, &info->dfile, page,
+ page_count, PAGECACHE_LOCK_WRITE, 1))
+ DBUG_RETURN(1);
+ if (_ma_reset_full_page_bits(info, &info->s->bitmap, page, page_count))
+ {
+ pthread_mutex_unlock(&info->s->bitmap.bitmap_lock);
+ DBUG_RETURN(1);
+ }
+ }
+ }
+ pthread_mutex_unlock(&info->s->bitmap.bitmap_lock);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Mark in the bitmap how much free space there is on a page
+
+ SYNOPSIS
+ _ma_bitmap_set()
+ info Mari handler
+ page Adress to page
+ head 1 if page is a head page, 0 if tail page
+ empty_space How much empty space there is on page
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+my_bool _ma_bitmap_set(MARIA_HA *info, ulonglong page, my_bool head,
+ uint empty_space)
+{
+ MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+ uint bits;
+ my_bool res;
+ DBUG_ENTER("_ma_bitmap_set");
+
+ pthread_mutex_lock(&info->s->bitmap.bitmap_lock);
+ bits= (head ?
+ _ma_free_size_to_head_pattern(bitmap, empty_space) :
+ free_size_to_tail_pattern(bitmap, empty_space));
+ res= set_page_bits(info, bitmap, page, bits);
+ pthread_mutex_unlock(&info->s->bitmap.bitmap_lock);
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Check that bitmap pattern is correct for a page
+
+ NOTES
+ Used in maria_chk
+
+ SYNOPSIS
+ _ma_check_bitmap_data()
+ info Maria handler
+ page_type What kind of page this is
+ page Adress to page
+ empty_space Empty space on page
+ bitmap_pattern Store here the pattern that was in the bitmap for the
+ page. This is always updated.
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+my_bool _ma_check_bitmap_data(MARIA_HA *info,
+ enum en_page_type page_type, ulonglong page,
+ uint empty_space, uint *bitmap_pattern)
+{
+ uint bits;
+ switch (page_type) {
+ case UNALLOCATED_PAGE:
+ case MAX_PAGE_TYPE:
+ bits= 0;
+ break;
+ case HEAD_PAGE:
+ bits= _ma_free_size_to_head_pattern(&info->s->bitmap, empty_space);
+ break;
+ case TAIL_PAGE:
+ bits= free_size_to_tail_pattern(&info->s->bitmap, empty_space);
+ break;
+ case BLOB_PAGE:
+ bits= FULL_TAIL_PAGE;
+ break;
+ }
+ return (*bitmap_pattern= get_page_bits(info, &info->s->bitmap, page)) !=
+ bits;
+}
+
+
+/*
+ Check if the page type matches the one that we have in the bitmap
+
+ SYNOPSIS
+ _ma_check_if_right_bitmap_type()
+ info Maria handler
+ page_type What kind of page this is
+ page Adress to page
+ bitmap_pattern Store here the pattern that was in the bitmap for the
+ page. This is always updated.
+
+ NOTES
+ Used in maria_chk
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+my_bool _ma_check_if_right_bitmap_type(MARIA_HA *info,
+ enum en_page_type page_type,
+ ulonglong page,
+ uint *bitmap_pattern)
+{
+ if ((*bitmap_pattern= get_page_bits(info, &info->s->bitmap, page)) > 7)
+ return 1; /* Couldn't read page */
+ switch (page_type) {
+ case HEAD_PAGE:
+ return *bitmap_pattern < 1 || *bitmap_pattern > 4;
+ case TAIL_PAGE:
+ return *bitmap_pattern < 5;
+ case BLOB_PAGE:
+ return *bitmap_pattern != 7;
+ default:
+ break;
+ }
+ DBUG_ASSERT(0);
+ return 1;
+}
+
+
+/**
+ @brief create the first bitmap page of a freshly created data file
+
+ @param share table's share
+
+ @return Operation status
+ @retval 0 OK
+ @retval !=0 Error
+*/
+
+int _ma_bitmap_create_first(MARIA_SHARE *share)
+{
+ uint block_size= share->bitmap.block_size;
+ File file= share->bitmap.file.file;
+ if (my_chsize(file, block_size - sizeof(maria_bitmap_marker),
+ 0, MYF(MY_WME)) ||
+ my_pwrite(file, maria_bitmap_marker, sizeof(maria_bitmap_marker),
+ block_size - sizeof(maria_bitmap_marker),
+ MYF(MY_NABP | MY_WME)))
+ return 1;
+ share->state.state.data_file_length= block_size;
+ _ma_bitmap_delete_all(share);
+ return 0;
+}
diff --git a/storage/maria/ma_blockrec.c b/storage/maria/ma_blockrec.c
new file mode 100644
index 00000000000..b12035c9cfa
--- /dev/null
+++ b/storage/maria/ma_blockrec.c
@@ -0,0 +1,5279 @@
+/* Copyright (C) 2007 Michael Widenius
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Storage of records in block
+
+ Some clarifications about the abbrev used:
+
+ NULL fields -> Fields that may have contain a NULL value.
+ Not null fields -> Fields that may not contain a NULL value.
+ Critical fields -> Fields that can't be null and can't be dropped without
+ causing a table reorganization.
+
+
+ Maria will have a LSN at start of each page (excluding the bitmap pages)
+
+ The different page types that are in a data file are:
+
+ Bitmap pages Map of free pages in the next extent (8192 page size
+ gives us 256M of mapped pages / bitmap)
+ Head page Start of rows are stored on this page.
+ A rowid always points to a head page
+ Blob page This page is totally filled with data from one blob or by
+ a set of long VARCHAR/CHAR fields
+ Tail page This contains the last part from different rows, blobs
+ or varchar fields.
+
+ The data file starts with a bitmap page, followed by as many data
+ pages as the bitmap can cover. After this there is a new bitmap page
+ and more data pages etc.
+
+ For information about the bitmap page, see ma_bitmap.c
+
+ Structure of data and tail page:
+
+ The page has a row directory at end of page to allow us to do deletes
+ without having to reorganize the page. It also allows us to later store
+ some more bytes after each row to allow them to grow without having to move
+ around other rows.
+
+ Page header:
+
+ LSN 7 bytes Log position for last page change
+ PAGE_TYPE 1 uchar 1 for head / 2 for tail / 3 for blob
+ NO 1 uchar Number of row/tail entries on page
+ empty space 2 bytes Empty space on page
+
+ The most significant bit in PAGE_TYPE is set to 1 if the data on the page
+ can be compacted to get more space. (PAGE_CAN_BE_COMPACTED)
+
+ Row data
+
+ Row directory of NO entries, that consist of the following for each row
+ (in reverse order; i.e., first record is stored last):
+
+ Position 2 bytes Position of row on page
+ Length 2 bytes Length of entry
+
+ For Position and Length, the 1 most significant bit of the position and
+ the 1 most significant bit of the length could be used for some states of
+ the row (in other words, we should try to keep these reserved)
+
+ eof flag 1 uchar Reserved for full page read testing. (Ie, did the
+ previous write get the whole block on disk.
+
+ ----------------
+
+ Structure of blob pages:
+
+ LSN 7 bytes Log position for last page change
+ PAGE_TYPE 1 uchar 3
+
+ data
+
+ -----------------
+
+ Row data structure:
+
+ Flag 1 uchar Marker of which header field exists
+ TRANSID 6 bytes TRANSID of changing transaction
+ (optional, added on insert and first
+ update/delete)
+ VER_PTR 7 bytes Pointer to older version in log
+ (undo record)
+ (optional, added after first
+ update/delete)
+ DELETE_TRANSID 6 bytes (optional). TRANSID of original row.
+ Added on delete.
+ Nulls_extended 1 uchar To allow us to add new DEFAULT NULL
+ fields (optional, added after first
+ change of row after alter table)
+ Number of ROW_EXTENT's 1-3 uchar Length encoded, optional
+ This is the number of extents the
+ row is split into
+ First row_extent 7 uchar Pointer to first row extent (optional)
+
+ Total length of length array 1-3 uchar Only used if we have
+ char/varchar/blob fields.
+ Row checksum 1 uchar Only if table created with checksums
+ Null_bits .. One bit for each NULL field (a field that may
+ have the value NULL)
+ Empty_bits .. One bit for each field that may be 'empty'.
+ (Both for null and not null fields).
+ This bit is 1 if the value for the field is
+ 0 or empty string.
+
+ field_offsets 2 byte/offset
+ For each 32'th field, there is one offset
+ that points to where the field information
+ starts in the block. This is to provide
+ fast access to later field in the row
+ when we only need to return a small
+ set of fields.
+ TODO: Implement this.
+
+ Things marked above as 'optional' will only be present if the
+ corresponding bit is set in 'Flag' field. Flag gives us a way to
+ get more space on a page when doing page compaction as we don't need
+ to store TRANSID that have committed before the smallest running
+ transaction we have in memory.
+
+ Data in the following order:
+ (Field order is precalculated when table is created)
+
+ Critical fixed length, not null, fields. (Note, these can't be dropped)
+ Fixed length, null fields
+
+ Length array, 1-4 uchar per field for all CHAR/VARCHAR/BLOB fields.
+ Number of bytes used in length array per entry is depending on max length
+ for field.
+
+ ROW_EXTENT's
+ CHAR data (space stripped)
+ VARCHAR data
+ BLOB data
+
+ Fields marked in null_bits or empty_bits are not stored in data part or
+ length array.
+
+ If row doesn't fit into the given block, then the first EXTENT will be
+ stored last on the row. This is done so that we don't break any field
+ data in the middle.
+
+ We first try to store the full row into one block. If that's not possible
+ we move out each big blob into their own extents. If this is not enough we
+ move out a concatenation of all varchars to their own extent.
+
+ Each blob and the concatenated char/varchar fields are stored the following
+ way:
+ - Store the parts in as many full-contiguous pages as possible.
+ - The last part, that doesn't fill a full page, is stored in tail page.
+
+ When doing an insert of a new row, we don't have to have
+ VER_PTR in the row. This will make rows that are not changed stored
+ efficiently. On update and delete we would add TRANSID (if it was an old
+ committed row) and VER_PTR to
+ the row. On row page compaction we can easily detect rows where
+ TRANSID was committed before the longest running transaction
+ started and we can then delete TRANSID and VER_PTR from the row to
+ gain more space.
+
+ If a row is deleted in Maria, we change TRANSID to the deleting
+ transaction's id, change VER_PTR to point to the undo record for the delete,
+ and add DELETE_TRANSID (the id of the transaction which last
+ inserted/updated the row before its deletion). DELETE_TRANSID allows an old
+ transaction to avoid reading the log to know if it can see the last version
+ before delete (in other words it reduces the probability of having to follow
+ VER_PTR). TODO: depending on a compilation option, evaluate the performance
+ impact of not storing DELETE_TRANSID (which would make the row smaller).
+
+ Description of the different parts:
+
+ Flag is coded as:
+
+ Description bit
+ TRANS_ID_exists 0
+ VER_PTR_exists 1
+ Row is deleted 2 (Means that DELETE_TRANSID exists)
+ Nulls_extended_exists 3
+ Row is split 7 This means that 'Number_of_row_extents' exists
+
+ Nulls_extended is the number of new DEFAULT NULL fields in the row
+ compared to the number of DEFAULT NULL fields when the first version
+ of the table was created. If Nulls_extended doesn't exist in the row,
+ we know it's 0 as this must be one of the original rows from when the
+ table was created first time. This coding allows us to add 255*8 =
+ 2048 new fields without requiring a full alter table.
+
+ Empty_bits is used to allow us to store 0, 0.0, empty string, empty
+ varstring and empty blob efficiently. (This is very good for data
+ warehousing where NULL's are often regarded as evil). Having this
+ bitmap also allows us to drop information of a field during a future
+ delete if field was deleted with ALTER TABLE DROP COLUMN. To be able
+ to handle DROP COLUMN, we must store in the index header the fields
+ that has been dropped. When unpacking a row we will ignore dropped
+ fields. When storing a row, we will mark a dropped field either with a
+ null in the null bit map or in the empty_bits and not store any data
+ for it.
+ TODO: Add code for handling dropped fields.
+
+
+ A ROW EXTENT is range of pages. One ROW_EXTENT is coded as:
+
+ START_PAGE 5 bytes
+ PAGE_COUNT 2 bytes. High bit is used to indicate tail page/
+ end of blob
+ With 8K pages, we can cover 256M in one extent. This coding gives us a
+ maximum file size of 2^40*8192 = 8192 tera
+
+ As an example of ROW_EXTENT handling, assume a row with one integer
+ field (value 5), two big VARCHAR fields (size 250 and 8192*3), and 2
+ big BLOB fields that we have updated.
+
+ The record format for storing this into an empty file would be:
+
+ Page 1:
+
+ 00 00 00 00 00 00 00 LSN
+ 01 Only one row in page
+ xx xx Empty space on page
+
+ 10 Flag: row split, VER_PTR exists
+ 01 00 00 00 00 00 TRANSID 1
+ 00 00 00 00 00 01 00 VER_PTR to first block in LOG file 1
+ 5 Number of row extents
+ 02 00 00 00 00 03 00 VARCHAR's are stored in full pages 2,3,4
+ 0 No null fields
+ 0 No empty fields
+ 05 00 00 00 00 00 80 Tail page for VARCHAR, rowid 0
+ 06 00 00 00 00 80 00 First blob, stored at page 6-133
+ 05 00 00 00 00 01 80 Tail of first blob (896 bytes) at page 5
+ 86 00 00 00 00 80 00 Second blob, stored at page 134-262
+ 05 00 00 00 00 02 80 Tail of second blob (896 bytes) at page 5
+ 05 00 5 integer
+ FA Length of first varchar field (size 250)
+ 00 60 Length of second varchar field (size 8192*3)
+ 00 60 10 First medium BLOB, 1M
+ 01 00 10 00 Second BLOB, 1M
+ xx xx xx xx xx xx Varchars are stored here until end of page
+
+ ..... until end of page
+
+ 09 00 F4 1F 00 (Start position 9, length 8180, end byte)
+*/
+
+#define SANITY_CHECKS
+
+#include "maria_def.h"
+#include "ma_blockrec.h"
+#include <lf.h>
+#include "trnman.h"
+
+/*
+ Struct for having a cursor over a set of extent.
+ This is used to loop over all extents for a row when reading
+ the row data. It's also used to store the tail positions for
+ a read row to be used by a later update/delete command.
+*/
+
+typedef struct st_maria_extent_cursor
+{
+ /*
+ Pointer to packed uchar array of extents for the row.
+ Format is described above in the header
+ */
+ uchar *extent;
+ /* Where data starts on page; Only for debugging */
+ uchar *data_start;
+ /* Position to all tails in the row. Updated when reading a row */
+ MARIA_RECORD_POS *tail_positions;
+ /* Current page */
+ ulonglong page;
+ /* How many pages in the page region */
+ uint page_count;
+ /* What kind of lock to use for tail pages */
+ enum pagecache_page_lock lock_for_tail_pages;
+ /* Total number of extents (i.e., entries in the 'extent' slot) */
+ uint extent_count;
+ /* <> 0 if current extent is a tail page; Set while using cursor */
+ uint tail;
+ /* Position for tail on tail page */
+ uint tail_row_nr;
+ /*
+ == 1 if we are working on the first extent (i.e., the one that is stored in
+ the row header, not an extent that is stored as part of the row data).
+ */
+ my_bool first_extent;
+} MARIA_EXTENT_CURSOR;
+
+
+static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails);
+static my_bool delete_head_or_tail(MARIA_HA *info,
+ ulonglong page, uint record_number,
+ my_bool head, my_bool from_update);
+#ifndef DBUG_OFF
+static void _ma_print_directory(uchar *buff, uint block_size);
+#endif
+static void compact_page(uchar *buff, uint block_size, uint rownr,
+ my_bool extend_block);
+static uchar *store_page_range(uchar *to, MARIA_BITMAP_BLOCK *block,
+ uint block_size, ulong length);
+static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
+ LEX_STRING *log_parts,
+ uint *log_parts_count);
+static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
+ const uchar *newrec,
+ LEX_STRING *log_parts,
+ uint *log_parts_count);
+
+/****************************************************************************
+ Initialization
+****************************************************************************/
+
+/*
+ Initialize data needed for block structures
+*/
+
+
+/* Size of the different header elements for a row */
+
+static uchar header_sizes[]=
+{
+ TRANSID_SIZE,
+ VERPTR_SIZE,
+ TRANSID_SIZE, /* Delete transid */
+ 1 /* Null extends */
+};
+
+/*
+ Calculate array of all used headers
+
+ Used to speed up:
+
+ size= 1;
+ if (flag & 1)
+ size+= TRANSID_SIZE;
+ if (flag & 2)
+ size+= VERPTR_SIZE;
+ if (flag & 4)
+ size+= TRANSID_SIZE
+ if (flag & 8)
+ size+= 1;
+
+ NOTES
+ This is called only once at startup of Maria
+*/
+
+static uchar total_header_size[1 << array_elements(header_sizes)];
+#define PRECALC_HEADER_BITMASK (array_elements(total_header_size) -1)
+
+void _ma_init_block_record_data(void)
+{
+ uint i;
+ bzero(total_header_size, sizeof(total_header_size));
+ total_header_size[0]= FLAG_SIZE; /* Flag uchar */
+ for (i= 1; i < array_elements(total_header_size); i++)
+ {
+ uint size= FLAG_SIZE, j, bit;
+ for (j= 0; (bit= (1 << j)) <= i; j++)
+ {
+ if (i & bit)
+ size+= header_sizes[j];
+ }
+ total_header_size[i]= size;
+ }
+}
+
+
+my_bool _ma_once_init_block_record(MARIA_SHARE *share, File data_file)
+{
+
+ share->base.max_data_file_length=
+ (((ulonglong) 1 << ((share->base.rec_reflength-1)*8))-1) *
+ share->block_size;
+#if SIZEOF_OFF_T == 4
+ set_if_smaller(share->base.max_data_file_length, INT_MAX32);
+#endif
+ return _ma_bitmap_init(share, data_file);
+}
+
+
+my_bool _ma_once_end_block_record(MARIA_SHARE *share)
+{
+ int res= _ma_bitmap_end(share);
+ if (share->bitmap.file.file >= 0)
+ {
+ if (flush_pagecache_blocks(share->pagecache, &share->bitmap.file,
+ share->temporary ? FLUSH_IGNORE_CHANGED :
+ FLUSH_RELEASE))
+ res= 1;
+ /*
+ File must be synced as it is going out of the maria_open_list and so
+ becoming unknown to Checkpoint.
+ */
+ if (share->now_transactional &&
+ my_sync(share->bitmap.file.file, MYF(MY_WME)))
+ res= 1;
+ if (my_close(share->bitmap.file.file, MYF(MY_WME)))
+ res= 1;
+ /*
+ Trivial assignment to guard against multiple invocations
+ (May happen if file are closed but we want to keep the maria object
+ around a bit longer)
+ */
+ share->bitmap.file.file= -1;
+ }
+ if (share->id != 0)
+ translog_deassign_id_from_share(share);
+ return res;
+}
+
+
+/* Init info->cur_row structure */
+
+my_bool _ma_init_block_record(MARIA_HA *info)
+{
+ MARIA_ROW *row= &info->cur_row, *new_row= &info->new_row;
+ DBUG_ENTER("_ma_init_block_record");
+
+ if (!my_multi_malloc(MY_WME,
+ &row->empty_bits, info->s->base.pack_bytes,
+ &row->field_lengths,
+ info->s->base.max_field_lengths + 2,
+ &row->blob_lengths, sizeof(ulong) * info->s->base.blobs,
+ &row->null_field_lengths, (sizeof(uint) *
+ (info->s->base.fields -
+ info->s->base.blobs +
+ EXTRA_LENGTH_FIELDS)),
+ &row->tail_positions, (sizeof(MARIA_RECORD_POS) *
+ (info->s->base.blobs + 2)),
+ &new_row->empty_bits, info->s->base.pack_bytes,
+ &new_row->field_lengths,
+ info->s->base.max_field_lengths + 2,
+ &new_row->blob_lengths,
+ sizeof(ulong) * info->s->base.blobs,
+ &new_row->null_field_lengths, (sizeof(uint) *
+ (info->s->base.fields -
+ info->s->base.blobs +
+ EXTRA_LENGTH_FIELDS)),
+ &info->log_row_parts,
+ sizeof(*info->log_row_parts) *
+ (TRANSLOG_INTERNAL_PARTS + 2 +
+ info->s->base.fields + 3),
+ &info->update_field_data,
+ (info->s->base.fields * 4 +
+ info->s->base.max_field_lengths + 1 + 4),
+ NullS, 0))
+ DBUG_RETURN(1);
+ /* Skip over bytes used to store length of field length for logging */
+ row->field_lengths+= 2;
+ new_row->field_lengths+= 2;
+ if (my_init_dynamic_array(&info->bitmap_blocks,
+ sizeof(MARIA_BITMAP_BLOCK),
+ ELEMENTS_RESERVED_FOR_MAIN_PART, 16))
+ goto err;
+ /* The following should be big enough for all purposes */
+ if (my_init_dynamic_array(&info->pinned_pages,
+ sizeof(MARIA_PINNED_PAGE),
+ max(info->s->base.blobs*2 + 4,
+ MARIA_MAX_TREE_LEVELS*2), 16))
+ goto err;
+ row->base_length= new_row->base_length= info->s->base_length;
+
+ /*
+ We need to reserve 'EXTRA_LENGTH_FIELDS' number of parts in
+ null_field_lengths to allow splitting of rows in 'find_where_to_split_row'
+ */
+
+ row->null_field_lengths+= EXTRA_LENGTH_FIELDS;
+ new_row->null_field_lengths+= EXTRA_LENGTH_FIELDS;
+
+ DBUG_RETURN(0);
+
+err:
+ _ma_end_block_record(info);
+ DBUG_RETURN(1);
+}
+
+
+void _ma_end_block_record(MARIA_HA *info)
+{
+ DBUG_ENTER("_ma_end_block_record");
+ my_free((uchar*) info->cur_row.empty_bits, MYF(MY_ALLOW_ZERO_PTR));
+ delete_dynamic(&info->bitmap_blocks);
+ delete_dynamic(&info->pinned_pages);
+ my_free((uchar*) info->cur_row.extents, MYF(MY_ALLOW_ZERO_PTR));
+ /*
+ The data file is closed, when needed, in ma_once_end_block_record().
+ The following protects us from doing an extra, not allowed, close
+ in maria_close()
+ */
+ info->dfile.file= -1;
+ DBUG_VOID_RETURN;
+}
+
+
+/****************************************************************************
+ Helper functions
+****************************************************************************/
+
+/*
+ Return the next unused postion on the page after a directory entry.
+
+ SYNOPSIS
+ start_of_next_entry()
+ dir Directory entry to be used. This can not be the
+ the last entry on the page!
+
+ RETURN
+ # Position in page where next entry starts.
+ Everything between the '*dir' and this are free to be used.
+*/
+
+static inline uint start_of_next_entry(uchar *dir)
+{
+ uchar *prev;
+ /*
+ Find previous used entry. (There is always a previous entry as
+ the directory never starts with a deleted entry)
+ */
+ for (prev= dir - DIR_ENTRY_SIZE ;
+ prev[0] == 0 && prev[1] == 0 ;
+ prev-= DIR_ENTRY_SIZE)
+ {}
+ return (uint) uint2korr(prev);
+}
+
+
+/*
+ Return the offset where the previous entry ends (before on page)
+
+ SYNOPSIS
+ end_of_previous_entry()
+ dir Address for current directory entry
+ end Address to last directory entry
+
+ RETURN
+ # Position where previous entry ends (smallest address on page)
+ Everything between # and current entry are free to be used.
+*/
+
+
+static inline uint end_of_previous_entry(uchar *dir, uchar *end)
+{
+ uchar *pos;
+ for (pos= dir + DIR_ENTRY_SIZE ; pos < end ; pos+= DIR_ENTRY_SIZE)
+ {
+ uint offset;
+ if ((offset= uint2korr(pos)))
+ return offset + uint2korr(pos+2);
+ }
+ return PAGE_HEADER_SIZE;
+}
+
+
+/**
+ @brief Extend a record area to fit a given size block
+
+ @fn extend_area_on_page()
+ @param buff Page buffer
+ @param dir Pointer to dir entry in buffer
+ @param rownr Row number we working on
+ @param block_size Block size of buffer
+ @param request_length How much data we want to put at [dir]
+ @param empty_space Total empty space in buffer
+
+ IMPLEMENTATION
+ The logic is as follows (same as in _ma_update_block_record())
+ - If new data fits in old block, use old block.
+ - Extend block with empty space before block. If enough, use it.
+ - Extend block with empty space after block. If enough, use it.
+ - Use compact_page() to get all empty space at dir.
+
+ RETURN
+ @retval 0 ok
+ @retval ret_offset Pointer to store offset to found area
+ @retval ret_length Pointer to store length of found area
+ @retval [dir] rec_offset is store here too
+
+ @retval 1 error (wrong info in block)
+*/
+
+static my_bool extend_area_on_page(uchar *buff, uchar *dir,
+ uint rownr, uint block_size,
+ uint request_length,
+ uint *empty_space, uint *ret_offset,
+ uint *ret_length)
+{
+ uint rec_offset, length;
+ DBUG_ENTER("extend_area_on_page");
+
+ rec_offset= uint2korr(dir);
+ length= uint2korr(dir + 2);
+ DBUG_PRINT("enter", ("rec_offset: %u length: %u request_length: %u",
+ rec_offset, length, request_length));
+
+ *empty_space+= length;
+ if (length < request_length)
+ {
+ uint max_entry= (uint) ((uchar*) buff)[DIR_COUNT_OFFSET];
+ uint old_rec_offset;
+ /*
+ New data did not fit in old position.
+ Find first possible position where to put new data.
+ */
+ old_rec_offset= rec_offset;
+ rec_offset= end_of_previous_entry(dir, buff + block_size -
+ PAGE_SUFFIX_SIZE);
+ length+= (uint) (old_rec_offset - rec_offset);
+ /*
+ old_rec_offset is 0 if we are doing an insert into a not allocated block.
+ This can only happen during REDO of INSERT
+ */
+ if (!old_rec_offset || length < request_length)
+ {
+ /*
+ Did not fit in current block + empty space. Extend with
+ empty space after block.
+ */
+ if (rownr == max_entry - 1)
+ {
+ /* Last entry; Everything is free between this and directory */
+ length= ((block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE * max_entry) -
+ rec_offset);
+ }
+ else
+ length= start_of_next_entry(dir) - rec_offset;
+ DBUG_ASSERT((int) length > 0);
+ if (length < request_length)
+ {
+ /* Not enough continues space, compact page to get more */
+ int2store(dir, rec_offset);
+ compact_page(buff, block_size, rownr, 1);
+ rec_offset= uint2korr(dir);
+ length= uint2korr(dir+2);
+ if (length < request_length)
+ DBUG_RETURN(1); /* Error in block */
+ *empty_space= length; /* All space is here */
+ }
+ }
+ }
+ int2store(dir, rec_offset);
+ *ret_offset= rec_offset;
+ *ret_length= length;
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Check that a region is all zero
+
+ SYNOPSIS
+ check_if_zero()
+ pos Start of memory to check
+ length length of memory region
+
+ NOTES
+ Used mainly to detect rows with wrong extent information
+*/
+
+static my_bool check_if_zero(uchar *pos, uint length)
+{
+ uchar *end;
+ for (end= pos+ length; pos != end ; pos++)
+ if (pos[0] != 0)
+ return 1;
+ return 0;
+}
+
+
+/*
+ @brief Copy not changed fields from 'from' to 'to'
+
+ @notes
+ Assumption is that most fields are not changed!
+ (Which is why we don't test if all bits are set for some bytes in bitmap)
+*/
+
+void copy_not_changed_fields(MARIA_HA *info, MY_BITMAP *changed_fields,
+ uchar *to, uchar *from)
+{
+ MARIA_COLUMNDEF *column, *end_column;
+ uchar *bitmap= (uchar*) changed_fields->bitmap;
+ MARIA_SHARE *share= info->s;
+ uint bit= 1;
+
+ for (column= share->columndef, end_column= column+ share->base.fields;
+ column < end_column; column++)
+ {
+ if (!(*bitmap & bit))
+ {
+ uint field_length= column->length;
+ if (column->type == FIELD_VARCHAR)
+ {
+ if (column->fill_length == 1)
+ field_length= (uint) from[column->offset] + 1;
+ else
+ field_length= uint2korr(from + column->offset) + 2;
+ }
+ memcpy(to + column->offset, from + column->offset, field_length);
+ }
+ if ((bit= (bit << 1)) == 256)
+ {
+ bitmap++;
+ bit= 1;
+ }
+ }
+}
+
+
+/*
+ Unpin all pinned pages
+
+ SYNOPSIS
+ _ma_unpin_all_pages()
+ info Maria handler
+ undo_lsn LSN for undo pages. LSN_IMPOSSIBLE if we shouldn't write undo
+ (error)
+
+ NOTE
+ We unpin pages in the reverse order as they where pinned; This may not
+ be strictly necessary but may simplify things in the future.
+
+ RETURN
+ 0 ok
+ 1 error (fatal disk error)
+
+*/
+
+void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn)
+{
+ MARIA_PINNED_PAGE *page_link= ((MARIA_PINNED_PAGE*)
+ dynamic_array_ptr(&info->pinned_pages, 0));
+ MARIA_PINNED_PAGE *pinned_page= page_link + info->pinned_pages.elements;
+ DBUG_ENTER("_ma_unpin_all_pages");
+ DBUG_PRINT("info", ("undo_lsn: %lu", (ulong) undo_lsn));
+
+ /* True if not disk error */
+ DBUG_ASSERT((undo_lsn != LSN_IMPOSSIBLE) || !info->s->now_transactional);
+
+ if (!info->s->now_transactional)
+ undo_lsn= LSN_IMPOSSIBLE; /* don't try to set a LSN on pages */
+
+ while (pinned_page-- != page_link)
+ pagecache_unlock_by_link(info->s->pagecache, pinned_page->link,
+ pinned_page->unlock, PAGECACHE_UNPIN,
+ info->trn->rec_lsn, undo_lsn);
+
+ info->pinned_pages.elements= 0;
+ DBUG_VOID_RETURN;
+}
+
+
+#ifdef NOT_YET_NEEDED
+/* Calculate empty space on a page */
+
+static uint empty_space_on_page(uchar *buff, uint block_size)
+{
+ enum en_page_type;
+ page_type= (enum en_page_type) (buff[PAGE_TYPE_OFFSET] &
+ ~(uchar) PAGE_CAN_BE_COMPACTED);
+ if (page_type == UNALLOCATED_PAGE)
+ return block_size;
+ if ((uint) page_type <= TAIL_PAGE)
+ return uint2korr(buff+EMPTY_SPACE_OFFSET);
+ return 0; /* Blob page */
+}
+#endif
+
+/**
+ When we have finished the write/update/delete of a row, we have cleanups to
+ do. For now it is signalling to Checkpoint that all dirtied pages have
+ their rec_lsn set and page LSN set (_ma_unpin_all_pages() has been called),
+ and that bitmap pages are correct (_ma_bitmap_release_unused() has been
+ called).
+*/
+#define _ma_finalize_row(info) \
+ do { info->trn->rec_lsn= LSN_IMPOSSIBLE; } while(0)
+/** unpinning is often the last operation before finalizing: */
+#define _ma_unpin_all_pages_and_finalize_row(info,undo_lsn) do \
+ { \
+ _ma_unpin_all_pages(info, undo_lsn); \
+ _ma_finalize_row(info); \
+ } while(0)
+
+
+/*
+ Find free position in directory
+
+ SYNOPSIS
+ find_free_position()
+ buff Page
+ block_size Size of page
+ res_rownr Store index to free position here
+ res_length Store length of found segment here
+ empty_space Store length of empty space on disk here. This is
+ all empty space, including the found block.
+
+ NOTES
+ If there is a free directory entry (entry with position == 0),
+ then use it and change it to be the size of the empty block
+ after the previous entry. This guarantees that all row entries
+ are stored on disk in inverse directory order, which makes life easier for
+ 'compact_page()' and to know if there is free space after any block.
+
+ If there is no free entry (entry with position == 0), then we create
+ a new one. If there is not space for the directory entry (because
+ the last block overlapps with the directory), we compact the page.
+
+ We will update the offset and the length of the found dir entry to
+ match the position and empty space found.
+
+ buff[EMPTY_SPACE_OFFSET] is NOT updated but left up to the caller
+
+ RETURN
+ 0 Error (directory full or last block goes over directory)
+ # Pointer to directory entry on page
+*/
+
+static uchar *find_free_position(uchar *buff, uint block_size, uint *res_rownr,
+ uint *res_length, uint *empty_space)
+{
+ uint max_entry= (uint) ((uchar*) buff)[DIR_COUNT_OFFSET];
+ uint entry, length, first_pos;
+ uchar *dir, *end;
+ DBUG_ENTER("find_free_position");
+ DBUG_PRINT("info", ("max_entry: %u", max_entry));
+
+ dir= (buff + block_size - DIR_ENTRY_SIZE * max_entry - PAGE_SUFFIX_SIZE);
+ end= buff + block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
+
+ *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+
+ /* Search after first empty position */
+ first_pos= PAGE_HEADER_SIZE;
+ for (entry= 0 ; dir <= end ; end-= DIR_ENTRY_SIZE, entry++)
+ {
+ uint tmp= uint2korr(end);
+ if (!tmp) /* Found not used entry */
+ {
+ length= start_of_next_entry(end) - first_pos;
+ int2store(end, first_pos); /* Update dir entry */
+ int2store(end + 2, length);
+ *res_rownr= entry;
+ *res_length= length;
+ DBUG_RETURN(end);
+ }
+ first_pos= tmp + uint2korr(end + 2);
+ }
+ /* No empty places in dir; create a new one */
+ dir= end;
+ /* Check if there is place for the directory entry */
+ if (max_entry == MAX_ROWS_PER_PAGE)
+ DBUG_RETURN(0);
+ /* Check if there is place for the directory entry */
+ if ((uint) (dir - buff) < first_pos)
+ {
+ /* Create place for directory */
+ compact_page(buff, block_size, max_entry-1, 0);
+ first_pos= (uint2korr(end + DIR_ENTRY_SIZE) +
+ uint2korr(end + DIR_ENTRY_SIZE+ 2));
+ *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+ }
+ buff[DIR_COUNT_OFFSET]= (uchar) (uchar) max_entry+1;
+ length= (uint) (dir - buff - first_pos);
+ DBUG_ASSERT(length <= *empty_space - DIR_ENTRY_SIZE);
+ int2store(dir, first_pos);
+ int2store(dir+2, length); /* Current max length */
+ *res_rownr= max_entry;
+ *res_length= length;
+
+ /* Reduce directory entry size from free space size */
+ (*empty_space)-= DIR_ENTRY_SIZE;
+ DBUG_RETURN(dir);
+}
+
+
+/****************************************************************************
+ Updating records
+****************************************************************************/
+
+/*
+ Calculate length of all the different field parts
+
+ SYNOPSIS
+ calc_record_size()
+ info Maria handler
+ record Row to store
+ row Store statistics about row here
+
+ NOTES
+ The statistics is used to find out how much space a row will need
+ and also where we can split a row when we need to split it into several
+ extents.
+*/
+
+static void calc_record_size(MARIA_HA *info, const uchar *record,
+ MARIA_ROW *row)
+{
+ MARIA_SHARE *share= info->s;
+ uchar *field_length_data;
+ MARIA_COLUMNDEF *column, *end_column;
+ uint *null_field_lengths= row->null_field_lengths;
+ ulong *blob_lengths= row->blob_lengths;
+ DBUG_ENTER("calc_record_size");
+
+ row->normal_length= row->char_length= row->varchar_length=
+ row->blob_length= row->extents_count= 0;
+
+ /* Create empty bitmap and calculate length of each varlength/char field */
+ bzero(row->empty_bits, share->base.pack_bytes);
+ field_length_data= row->field_lengths;
+ for (column= share->columndef + share->base.fixed_not_null_fields,
+ end_column= share->columndef + share->base.fields;
+ column < end_column; column++, null_field_lengths++)
+ {
+ if ((record[column->null_pos] & column->null_bit))
+ {
+ if (column->type != FIELD_BLOB)
+ *null_field_lengths= 0;
+ else
+ *blob_lengths++= 0;
+ continue;
+ }
+ switch (column->type) {
+ case FIELD_CHECK:
+ case FIELD_NORMAL: /* Fixed length field */
+ case FIELD_ZERO:
+ DBUG_ASSERT(column->empty_bit == 0);
+ /* fall through */
+ case FIELD_SKIP_PRESPACE: /* Not packed */
+ row->normal_length+= column->length;
+ *null_field_lengths= column->length;
+ break;
+ case FIELD_SKIP_ZERO: /* Fixed length field */
+ if (memcmp(record+ column->offset, maria_zero_string,
+ column->length) == 0)
+ {
+ row->empty_bits[column->empty_pos] |= column->empty_bit;
+ *null_field_lengths= 0;
+ }
+ else
+ {
+ row->normal_length+= column->length;
+ *null_field_lengths= column->length;
+ }
+ break;
+ case FIELD_SKIP_ENDSPACE: /* CHAR */
+ {
+ const char *pos, *end;
+ for (pos= record + column->offset, end= pos + column->length;
+ end > pos && end[-1] == ' '; end--)
+ ;
+ if (pos == end) /* If empty string */
+ {
+ row->empty_bits[column->empty_pos]|= column->empty_bit;
+ *null_field_lengths= 0;
+ }
+ else
+ {
+ uint length= (end - pos);
+ if (column->length <= 255)
+ *field_length_data++= (uchar) length;
+ else
+ {
+ int2store(field_length_data, length);
+ field_length_data+= 2;
+ }
+ row->char_length+= length;
+ *null_field_lengths= length;
+ }
+ break;
+ }
+ case FIELD_VARCHAR:
+ {
+ uint length, field_length_data_length;
+ const uchar *field_pos= record + column->offset;
+
+ /* 256 is correct as this includes the length uchar */
+ field_length_data[0]= field_pos[0];
+ if (column->length <= 256)
+ {
+ length= (uint) (uchar) *field_pos;
+ field_length_data_length= 1;
+ }
+ else
+ {
+ length= uint2korr(field_pos);
+ field_length_data[1]= field_pos[1];
+ field_length_data_length= 2;
+ }
+ *null_field_lengths= length;
+ if (!length)
+ {
+ row->empty_bits[column->empty_pos]|= column->empty_bit;
+ break;
+ }
+ row->varchar_length+= length;
+ *null_field_lengths= length;
+ field_length_data+= field_length_data_length;
+ break;
+ }
+ case FIELD_BLOB:
+ {
+ const uchar *field_pos= record + column->offset;
+ uint size_length= column->length - portable_sizeof_char_ptr;
+ ulong blob_length= _ma_calc_blob_length(size_length, field_pos);
+
+ *blob_lengths++= blob_length;
+ if (!blob_length)
+ row->empty_bits[column->empty_pos]|= column->empty_bit;
+ else
+ {
+ row->blob_length+= blob_length;
+ memcpy(field_length_data, field_pos, size_length);
+ field_length_data+= size_length;
+ }
+ break;
+ }
+ default:
+ DBUG_ASSERT(0);
+ }
+ }
+ row->field_lengths_length= (uint) (field_length_data - row->field_lengths);
+ row->head_length= (row->base_length +
+ share->base.fixed_not_null_fields_length +
+ row->field_lengths_length +
+ size_to_store_key_length(row->field_lengths_length) +
+ row->normal_length +
+ row->char_length + row->varchar_length);
+ row->total_length= (row->head_length + row->blob_length);
+ if (row->total_length < share->base.min_row_length)
+ row->total_length= share->base.min_row_length;
+ DBUG_PRINT("exit", ("head_length: %lu total_length: %lu",
+ (ulong) row->head_length, (ulong) row->total_length));
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Compact page by removing all space between rows
+
+ IMPLEMENTATION
+ Move up all rows to start of page.
+ Move blocks that are directly after each other with one memmove.
+
+ TODO LATER
+ Remove TRANSID from rows that are visible to all transactions
+
+ SYNOPSIS
+ compact_page()
+ buff Page to compact
+ block_size Size of page
+ rownr Put empty data after this row
+ extend_block If 1, extend the block at 'rownr' to cover the
+ whole block.
+*/
+
+
+static void compact_page(uchar *buff, uint block_size, uint rownr,
+ my_bool extend_block)
+{
+ uint max_entry= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET];
+ uint page_pos, next_free_pos, start_of_found_block, diff, end_of_found_block;
+ uchar *dir, *end;
+ DBUG_ENTER("compact_page");
+ DBUG_PRINT("enter", ("rownr: %u", rownr));
+ DBUG_ASSERT(max_entry > 0 &&
+ max_entry < (block_size - PAGE_HEADER_SIZE -
+ PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE);
+
+ /* Move all entries before and including rownr up to start of page */
+ dir= buff + block_size - DIR_ENTRY_SIZE * (rownr+1) - PAGE_SUFFIX_SIZE;
+ end= buff + block_size - DIR_ENTRY_SIZE - PAGE_SUFFIX_SIZE;
+ page_pos= next_free_pos= start_of_found_block= PAGE_HEADER_SIZE;
+ diff= 0;
+ for (; dir <= end ; end-= DIR_ENTRY_SIZE)
+ {
+ uint offset= uint2korr(end);
+
+ if (offset)
+ {
+ uint row_length= uint2korr(end + 2);
+ DBUG_ASSERT(offset >= page_pos);
+ DBUG_ASSERT(buff + offset + row_length <= dir);
+
+ if (offset != next_free_pos)
+ {
+ uint length= (next_free_pos - start_of_found_block);
+ /*
+ There was empty space before this and prev block
+ Check if we have to move previous block up to page start
+ */
+ if (page_pos != start_of_found_block)
+ {
+ /* move up previous block */
+ memmove(buff + page_pos, buff + start_of_found_block, length);
+ }
+ page_pos+= length;
+ /* next continuous block starts here */
+ start_of_found_block= offset;
+ diff= offset - page_pos;
+ }
+ int2store(end, offset - diff); /* correct current pos */
+ next_free_pos= offset + row_length;
+ }
+ }
+ if (page_pos != start_of_found_block)
+ {
+ uint length= (next_free_pos - start_of_found_block);
+ memmove(buff + page_pos, buff + start_of_found_block, length);
+ }
+ start_of_found_block= uint2korr(dir);
+
+ if (rownr != max_entry - 1)
+ {
+ /* Move all entries after rownr to end of page */
+ uint rownr_length;
+ next_free_pos= end_of_found_block= page_pos=
+ block_size - DIR_ENTRY_SIZE * max_entry - PAGE_SUFFIX_SIZE;
+ diff= 0;
+ /* End points to entry before 'rownr' */
+ for (dir= buff + end_of_found_block ; dir <= end ; dir+= DIR_ENTRY_SIZE)
+ {
+ uint offset= uint2korr(dir);
+ uint row_length= uint2korr(dir + 2);
+ uint row_end= offset + row_length;
+ if (!offset)
+ continue;
+ DBUG_ASSERT(offset >= start_of_found_block && row_end <= next_free_pos);
+
+ if (row_end != next_free_pos)
+ {
+ uint length= (end_of_found_block - next_free_pos);
+ if (page_pos != end_of_found_block)
+ {
+ /* move next block down */
+ memmove(buff + page_pos - length, buff + next_free_pos, length);
+ }
+ page_pos-= length;
+ /* next continuous block starts here */
+ end_of_found_block= row_end;
+ diff= page_pos - row_end;
+ }
+ int2store(dir, offset + diff); /* correct current pos */
+ next_free_pos= offset;
+ }
+ if (page_pos != end_of_found_block)
+ {
+ uint length= (end_of_found_block - next_free_pos);
+ memmove(buff + page_pos - length, buff + next_free_pos, length);
+ next_free_pos= page_pos- length;
+ }
+ /* Extend rownr block to cover hole */
+ rownr_length= next_free_pos - start_of_found_block;
+ int2store(dir+2, rownr_length);
+ }
+ else
+ {
+ if (extend_block)
+ {
+ /* Extend last block cover whole page */
+ uint length= (uint) (dir - buff) - start_of_found_block;
+ int2store(dir+2, length);
+ }
+ else
+ {
+ /*
+ TODO:
+ Update (buff + EMPTY_SPACE_OFFSET) if we remove transid from rows
+ */
+ }
+ buff[PAGE_TYPE_OFFSET]&= ~(uchar) PAGE_CAN_BE_COMPACTED;
+ }
+ DBUG_EXECUTE("directory", _ma_print_directory(buff, block_size););
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Create an empty tail or head page
+
+ SYNOPSIS
+ make_empty_page()
+ buff Page buffer
+ block_size Block size
+ page_type HEAD_PAGE or TAIL_PAGE
+
+ NOTES
+ EMPTY_SPACE is not updated
+*/
+
+static void make_empty_page(uchar *buff, uint block_size, uint page_type)
+{
+
+ bzero(buff, PAGE_HEADER_SIZE);
+ /*
+ We zero the rest of the block to avoid getting old memory information
+ to disk and to allow the file to be compressed better if archived.
+ The rest of the code does not assume the block is zeroed above
+ PAGE_OVERHEAD_SIZE
+ */
+ bzero(buff+ PAGE_HEADER_SIZE, block_size - PAGE_HEADER_SIZE);
+ buff[PAGE_TYPE_OFFSET]= (uchar) page_type;
+ buff[DIR_COUNT_OFFSET]= 1;
+ /* Store position to the first row */
+ int2store(buff + block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE,
+ PAGE_HEADER_SIZE);
+}
+
+
+/*
+ Read or initialize new head or tail page
+
+ SYNOPSIS
+ get_head_or_tail_page()
+ info Maria handler
+ block Block to read
+ buff Suggest this buffer to key cache
+ length Minimum space needed
+ page_type HEAD_PAGE || TAIL_PAGE
+ res Store result position here
+
+ NOTES
+ We don't decremented buff[EMPTY_SPACE_OFFSET] with the allocated data
+ as we don't know how much data the caller will actually use.
+
+ RETURN
+ 0 ok All slots in 'res' are updated
+ 1 error my_errno is set
+*/
+
+struct st_row_pos_info
+{
+ uchar *buff; /* page buffer */
+ uchar *data; /* Place for data */
+ uchar *dir; /* Directory */
+ uint length; /* Length for data */
+ uint rownr; /* Offset in directory */
+ uint empty_space; /* Space left on page */
+};
+
+
+static my_bool get_head_or_tail_page(MARIA_HA *info,
+ MARIA_BITMAP_BLOCK *block,
+ uchar *buff, uint length, uint page_type,
+ enum pagecache_page_lock lock,
+ struct st_row_pos_info *res)
+{
+ uint block_size;
+ MARIA_PINNED_PAGE page_link;
+ MARIA_SHARE *share= info->s;
+ DBUG_ENTER("get_head_or_tail_page");
+ DBUG_PRINT("enter", ("length: %u", length));
+
+ block_size= share->block_size;
+ if (block->org_bitmap_value == 0) /* Empty block */
+ {
+ /* New page */
+ make_empty_page(buff, block_size, page_type);
+ res->buff= buff;
+ res->empty_space= res->length= (block_size - PAGE_OVERHEAD_SIZE);
+ res->data= (buff + PAGE_HEADER_SIZE);
+ res->dir= res->data + res->length;
+ res->rownr= 0;
+ DBUG_ASSERT(length <= res->length);
+ }
+ else
+ {
+ uchar *dir;
+ /* Read old page */
+ DBUG_ASSERT(share->pagecache->block_size == block_size);
+ if (!(res->buff= pagecache_read(share->pagecache,
+ &info->dfile,
+ (my_off_t) block->page, 0,
+ buff, share->page_type,
+ lock, &page_link.link)))
+ DBUG_RETURN(1);
+ page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+ push_dynamic(&info->pinned_pages, (void*) &page_link);
+
+ DBUG_ASSERT((res->buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == page_type);
+ if (!(dir= find_free_position(res->buff, block_size, &res->rownr,
+ &res->length, &res->empty_space)))
+ goto crashed;
+
+ if (res->length < length)
+ {
+ if (res->empty_space + res->length >= length)
+ {
+ compact_page(res->buff, block_size, res->rownr, 1);
+ /* All empty space are now after current position */
+ dir= (res->buff + block_size - DIR_ENTRY_SIZE * res->rownr -
+ DIR_ENTRY_SIZE - PAGE_SUFFIX_SIZE);
+ res->length= res->empty_space= uint2korr(dir+2);
+ }
+ if (res->length < length)
+ {
+ DBUG_PRINT("error", ("length: %u res->length: %u empty_space: %u",
+ length, res->length, res->empty_space));
+ goto crashed; /* Wrong bitmap information */
+ }
+ }
+ res->dir= dir;
+ res->data= res->buff + uint2korr(dir);
+ }
+ DBUG_RETURN(0);
+
+crashed:
+ my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */
+ DBUG_RETURN(1);
+}
+
+
+/*
+ Write tail for head data or blob
+
+ SYNOPSIS
+ write_tail()
+ info Maria handler
+ block Block to tail page
+ row_part Data to write to page
+ length Length of data
+
+ NOTES
+ block->page_count is updated to the directory offset for the tail
+ so that we can store the position in the row extent information
+
+ RETURN
+ 0 ok
+ block->page_count is set to point (dir entry + TAIL_BIT)
+
+ 1 error; In this case my_errno is set to the error
+*/
+
+static my_bool write_tail(MARIA_HA *info,
+ MARIA_BITMAP_BLOCK *block,
+ uchar *row_part, uint length)
+{
+ MARIA_SHARE *share= info->s;
+ MARIA_PINNED_PAGE page_link;
+ uint block_size= share->block_size, empty_space;
+ struct st_row_pos_info row_pos;
+ my_off_t position;
+ my_bool res, block_is_read;
+ DBUG_ENTER("write_tail");
+ DBUG_PRINT("enter", ("page: %lu length: %u",
+ (ulong) block->page, length));
+
+ info->keyread_buff_used= 1;
+
+ /* page will be pinned & locked by get_head_or_tail_page */
+ if (get_head_or_tail_page(info, block, info->keyread_buff, length,
+ TAIL_PAGE, PAGECACHE_LOCK_WRITE,
+ &row_pos))
+ DBUG_RETURN(1);
+ block_is_read= block->org_bitmap_value != 0;
+
+ memcpy(row_pos.data, row_part, length);
+
+ {
+ /* Log changes in tail block */
+ uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+ LSN lsn;
+
+ /* Log REDO changes of tail page */
+ page_store(log_data + FILEID_STORE_SIZE, block->page);
+ dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
+ row_pos.rownr);
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (char*) row_pos.data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length;
+ if (translog_write_record(&lsn, LOGREC_REDO_INSERT_ROW_TAIL,
+ info->trn, info, sizeof(log_data) + length,
+ TRANSLOG_INTERNAL_PARTS + 2, log_array,
+ log_data))
+ DBUG_RETURN(1);
+ }
+
+ /*
+ Don't allocate smaller block than MIN_TAIL_SIZE (we want to give rows
+ some place to grow in the future)
+ */
+ if (length < MIN_TAIL_SIZE)
+ length= MIN_TAIL_SIZE;
+ int2store(row_pos.dir + 2, length);
+ empty_space= row_pos.empty_space - length;
+ int2store(row_pos.buff + EMPTY_SPACE_OFFSET, empty_space);
+ block->page_count= row_pos.rownr + TAIL_BIT;
+ /*
+ If there is less directory entries free than number of possible tails
+ we can write for a row, we mark the page full to ensure that we don't
+ during _ma_bitmap_find_place() allocate more entries on the tail page
+ than it can hold
+ */
+ block->empty_space= ((uint) ((uchar*) row_pos.buff)[DIR_COUNT_OFFSET] <=
+ MAX_ROWS_PER_PAGE - 1 - share->base.blobs ?
+ empty_space : 0);
+ block->used= BLOCKUSED_USED | BLOCKUSED_TAIL;
+
+ /* Increase data file size, if extended */
+ position= (my_off_t) block->page * block_size;
+ if (info->state->data_file_length <= position)
+ info->state->data_file_length= position + block_size;
+
+ DBUG_ASSERT(share->pagecache->block_size == block_size);
+ if (!(res= pagecache_write(share->pagecache,
+ &info->dfile, block->page, 0,
+ row_pos.buff,share->page_type,
+ block_is_read ? PAGECACHE_LOCK_WRITE_TO_READ :
+ PAGECACHE_LOCK_READ,
+ block_is_read ? PAGECACHE_PIN_LEFT_PINNED :
+ PAGECACHE_PIN,
+ PAGECACHE_WRITE_DELAY, &page_link.link)))
+ {
+ page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
+ if (block_is_read)
+ {
+ /* Change the lock used when we read the page */
+ set_dynamic(&info->pinned_pages, (void*) &page_link,
+ info->pinned_pages.elements-1);
+ }
+ else
+ push_dynamic(&info->pinned_pages, (void*) &page_link);
+ }
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Write full pages
+
+ SYNOPSIS
+ write_full_pages()
+ info Maria handler
+ lsn LSN for the undo record
+ block Where to write data
+ data Data to write
+ length Length of data
+
+ NOTES
+ Logging of the changes to the full pages are done in the caller
+ write_block_record().
+
+ RETURN
+ 0 ok
+ 1 error on write
+*/
+
+static my_bool write_full_pages(MARIA_HA *info,
+ LSN lsn,
+ MARIA_BITMAP_BLOCK *block,
+ uchar *data, ulong length)
+{
+ my_off_t page;
+ MARIA_SHARE *share= info->s;
+ uint block_size= share->block_size;
+ uint data_size= FULL_PAGE_SIZE(block_size);
+ uchar *buff= info->keyread_buff;
+ uint page_count;
+ my_off_t position;
+ DBUG_ENTER("write_full_pages");
+ DBUG_PRINT("enter", ("length: %lu page: %lu page_count: %lu",
+ (ulong) length, (ulong) block->page,
+ (ulong) block->page_count));
+ DBUG_ASSERT((block->page_count & TAIL_BIT) == 0);
+
+ info->keyread_buff_used= 1;
+ page= block->page;
+ page_count= block->page_count;
+
+ position= (my_off_t) (page + page_count) * block_size;
+ if (info->state->data_file_length < position)
+ info->state->data_file_length= position;
+
+ /* Increase data file size, if extended */
+
+ for (; length; data+= data_size)
+ {
+ uint copy_length;
+ if (!page_count--)
+ {
+ block++;
+ page= block->page;
+ page_count= block->page_count - 1;
+ DBUG_PRINT("info", ("page: %lu page_count: %lu",
+ (ulong) block->page, (ulong) block->page_count));
+
+ position= (page + page_count + 1) * block_size;
+ if (info->state->data_file_length < position)
+ info->state->data_file_length= position;
+ }
+ lsn_store(buff, lsn);
+ buff[PAGE_TYPE_OFFSET]= (uchar) BLOB_PAGE;
+ copy_length= min(data_size, length);
+ memcpy(buff + LSN_SIZE + PAGE_TYPE_SIZE, data, copy_length);
+ length-= copy_length;
+
+ DBUG_ASSERT(share->pagecache->block_size == block_size);
+ if (pagecache_write(share->pagecache,
+ &info->dfile, page, 0,
+ buff, share->page_type,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0))
+ DBUG_RETURN(1);
+ page++;
+ block->used= BLOCKUSED_USED;
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Store ranges of full pages in compact format for logging
+
+ SYNOPSIS
+ store_page_range()
+ to Store data here
+ block Where pages are to be written
+ block_size block size
+ length Length of data to be written
+ Normally this is full pages, except for the last
+ tail block that may only partly fit the last page.
+
+ RETURN
+ # end position for 'to'
+*/
+
+static uchar *store_page_range(uchar *to, MARIA_BITMAP_BLOCK *block,
+ uint block_size, ulong length)
+{
+ uint data_size= FULL_PAGE_SIZE(block_size);
+ ulong pages_left= (length + data_size -1) / data_size;
+ uint page_count;
+ DBUG_ENTER("store_page_range");
+
+ do
+ {
+ ulonglong page;
+ page= block->page;
+ page_count= block->page_count;
+ block++;
+ if (page_count > pages_left)
+ page_count= pages_left;
+
+ page_store(to, page);
+ to+= PAGE_STORE_SIZE;
+ pagerange_store(to, page_count);
+ to+= PAGERANGE_STORE_SIZE;
+ } while ((pages_left-= page_count));
+ DBUG_RETURN(to);
+}
+
+
+/*
+ Store packed extent data
+
+ SYNOPSIS
+ store_extent_info()
+ to Store first packed data here
+ row_extents_second_part Store rest here
+ first_block First block to store
+ count Number of blocks
+
+ NOTES
+ We don't have to store the position for the head block
+*/
+
+static void store_extent_info(uchar *to,
+ uchar *row_extents_second_part,
+ MARIA_BITMAP_BLOCK *first_block,
+ uint count)
+{
+ MARIA_BITMAP_BLOCK *block, *end_block;
+ uint copy_length;
+ my_bool first_found= 0;
+
+ for (block= first_block, end_block= first_block+count ;
+ block < end_block; block++)
+ {
+ /* The following is only false for marker blocks */
+ if (likely(block->used & BLOCKUSED_USED))
+ {
+ DBUG_ASSERT(block->page_count != 0);
+ page_store(to, block->page);
+ pagerange_store(to + PAGE_STORE_SIZE, block->page_count);
+ to+= ROW_EXTENT_SIZE;
+ if (!first_found)
+ {
+ first_found= 1;
+ to= row_extents_second_part;
+ }
+ }
+ }
+ copy_length= (count - 1) * ROW_EXTENT_SIZE;
+ /*
+ In some unlikely cases we have allocated to many blocks. Clear this
+ data.
+ */
+ bzero(to, (size_t) (row_extents_second_part + copy_length - to));
+}
+
+
+/*
+ Free regions of pages with logging
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool free_full_pages(MARIA_HA *info, MARIA_ROW *row)
+{
+ uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE];
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+ LSN lsn;
+ size_t extents_length= row->extents_count * ROW_EXTENT_SIZE;
+ DBUG_ENTER("free_full_pages");
+
+ pagerange_store(log_data + FILEID_STORE_SIZE,
+ row->extents_count);
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= row->extents;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= extents_length;
+ if (translog_write_record(&lsn, LOGREC_REDO_PURGE_BLOCKS, info->trn,
+ info, sizeof(log_data) + extents_length,
+ TRANSLOG_INTERNAL_PARTS + 2, log_array,
+ log_data))
+ DBUG_RETURN(1);
+
+ DBUG_RETURN(_ma_bitmap_free_full_pages(info, row->extents,
+ row->extents_count));
+}
+
+
+/*
+ Free one page range
+
+ NOTES
+ This is very similar to free_full_pages()
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool free_full_page_range(MARIA_HA *info, ulonglong page, uint count)
+{
+ my_bool res= 0;
+ DBUG_ENTER("free_full_page_range");
+
+ if (pagecache_delete_pages(info->s->pagecache, &info->dfile,
+ page, count, PAGECACHE_LOCK_WRITE, 0))
+ res= 1;
+
+ if (info->s->now_transactional)
+ {
+ LSN lsn;
+ /** @todo unify log_data's shape with delete_head_or_tail() */
+ uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
+ ROW_EXTENT_SIZE];
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+ DBUG_ASSERT(info->trn->rec_lsn);
+ pagerange_store(log_data + FILEID_STORE_SIZE, 1);
+ page_store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
+ page);
+ int2store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
+ PAGE_STORE_SIZE, count);
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+
+ if (translog_write_record(&lsn, LOGREC_REDO_PURGE_BLOCKS,
+ info->trn, info, sizeof(log_data),
+ TRANSLOG_INTERNAL_PARTS + 1, log_array,
+ log_data))
+ res= 1;
+
+ }
+ pthread_mutex_lock(&info->s->bitmap.bitmap_lock);
+ if (_ma_reset_full_page_bits(info, &info->s->bitmap, page,
+ count))
+ res= 1;
+ pthread_mutex_unlock(&info->s->bitmap.bitmap_lock);
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Write a record to a (set of) pages
+
+ @fn write_block_record()
+ @param info Maria handler
+ @param old_record Original record in case of update; NULL in case of
+ insert
+ @param record Record we should write
+ @param row Statistics about record (calculated by
+ calc_record_size())
+ @param map_blocks On which pages the record should be stored
+ @param row_pos Position on head page where to put head part of
+ record
+ @param undo_lsn <> LSN_ERROR if we are executing an UNDO
+
+ @note
+ On return all pinned pages are released.
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+static my_bool write_block_record(MARIA_HA *info,
+ const uchar *old_record, const uchar *record,
+ MARIA_ROW *row,
+ MARIA_BITMAP_BLOCKS *bitmap_blocks,
+ my_bool head_block_is_read,
+ struct st_row_pos_info *row_pos,
+ LSN undo_lsn)
+{
+ uchar *data, *end_of_data, *tmp_data_used, *tmp_data;
+ uchar *row_extents_first_part, *row_extents_second_part;
+ uchar *field_length_data;
+ uchar *page_buff;
+ MARIA_BITMAP_BLOCK *block, *head_block;
+ MARIA_SHARE *share= info->s;
+ MARIA_COLUMNDEF *column, *end_column;
+ MARIA_PINNED_PAGE page_link;
+ uint block_size, flag;
+ ulong *blob_lengths;
+ my_bool row_extents_in_use, blob_full_pages_exists;
+ LSN lsn;
+ my_off_t position;
+ DBUG_ENTER("write_block_record");
+
+ LINT_INIT(row_extents_first_part);
+ LINT_INIT(row_extents_second_part);
+
+ head_block= bitmap_blocks->block;
+ block_size= share->block_size;
+
+ page_buff= row_pos->buff;
+ /* Position on head page where we should store the head part */
+ data= row_pos->data;
+ end_of_data= data + row_pos->length;
+
+ /* Write header */
+ flag= share->base.default_row_flag;
+ row_extents_in_use= 0;
+ if (unlikely(row->total_length > row_pos->length))
+ {
+ /* Need extent */
+ if (bitmap_blocks->count <= 1)
+ goto crashed; /* Wrong in bitmap */
+ flag|= ROW_FLAG_EXTENTS;
+ row_extents_in_use= 1;
+ }
+ /* For now we have only a minimum header */
+ *data++= (uchar) flag;
+ if (unlikely(flag & ROW_FLAG_NULLS_EXTENDED))
+ *data++= (uchar) (share->base.null_bytes -
+ share->base.original_null_bytes);
+ if (row_extents_in_use)
+ {
+ /* Store first extent in header */
+ store_key_length_inc(data, bitmap_blocks->count - 1);
+ row_extents_first_part= data;
+ data+= ROW_EXTENT_SIZE;
+ }
+ if (share->base.pack_fields)
+ store_key_length_inc(data, row->field_lengths_length);
+ if (share->calc_checksum)
+ *(data++)= (uchar) (row->checksum); /* store least significant byte */
+ memcpy(data, record, share->base.null_bytes);
+ data+= share->base.null_bytes;
+ memcpy(data, row->empty_bits, share->base.pack_bytes);
+ data+= share->base.pack_bytes;
+
+ /*
+ Allocate a buffer of rest of data (except blobs)
+
+ To avoid double copying of data, we copy as many columns that fits into
+ the page. The rest goes into info->packed_row.
+
+ Using an extra buffer, instead of doing continuous writes to different
+ pages, uses less code and we don't need to have to do a complex call
+ for every data segment we want to store.
+ */
+ if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
+ row->head_length))
+ DBUG_RETURN(1);
+
+ tmp_data_used= 0; /* Either 0 or last used uchar in 'data' */
+ tmp_data= data;
+
+ if (row_extents_in_use)
+ {
+ uint copy_length= (bitmap_blocks->count - 2) * ROW_EXTENT_SIZE;
+ if (!tmp_data_used && tmp_data + copy_length > end_of_data)
+ {
+ tmp_data_used= tmp_data;
+ tmp_data= info->rec_buff;
+ }
+ row_extents_second_part= tmp_data;
+ /*
+ We will copy the extents here when we have figured out the tail
+ positions.
+ */
+ tmp_data+= copy_length;
+ }
+
+ /* Copy fields that has fixed lengths (primary key etc) */
+ for (column= share->columndef,
+ end_column= column + share->base.fixed_not_null_fields;
+ column < end_column; column++)
+ {
+ if (!tmp_data_used && tmp_data + column->length > end_of_data)
+ {
+ tmp_data_used= tmp_data;
+ tmp_data= info->rec_buff;
+ }
+ memcpy(tmp_data, record + column->offset, column->length);
+ tmp_data+= column->length;
+ }
+
+ /* Copy length of data for variable length fields */
+ if (!tmp_data_used && tmp_data + row->field_lengths_length > end_of_data)
+ {
+ tmp_data_used= tmp_data;
+ tmp_data= info->rec_buff;
+ }
+ field_length_data= row->field_lengths;
+ memcpy(tmp_data, field_length_data, row->field_lengths_length);
+ tmp_data+= row->field_lengths_length;
+
+ /* Copy variable length fields and fields with null/zero */
+ for (end_column= share->columndef + share->base.fields - share->base.blobs;
+ column < end_column ;
+ column++)
+ {
+ const uchar *field_pos;
+ ulong length;
+ if ((record[column->null_pos] & column->null_bit) ||
+ (row->empty_bits[column->empty_pos] & column->empty_bit))
+ continue;
+
+ field_pos= record + column->offset;
+ switch (column->type) {
+ case FIELD_NORMAL: /* Fixed length field */
+ case FIELD_SKIP_PRESPACE:
+ case FIELD_SKIP_ZERO: /* Fixed length field */
+ length= column->length;
+ break;
+ case FIELD_SKIP_ENDSPACE: /* CHAR */
+ /* Char that is space filled */
+ if (column->length <= 255)
+ length= (uint) (uchar) *field_length_data++;
+ else
+ {
+ length= uint2korr(field_length_data);
+ field_length_data+= 2;
+ }
+ break;
+ case FIELD_VARCHAR:
+ if (column->length <= 256)
+ {
+ length= (uint) (uchar) *field_length_data++;
+ field_pos++; /* Skip length uchar */
+ }
+ else
+ {
+ length= uint2korr(field_length_data);
+ field_length_data+= 2;
+ field_pos+= 2;
+ }
+ break;
+ default: /* Wrong data */
+ DBUG_ASSERT(0);
+ break;
+ }
+ if (!tmp_data_used && tmp_data + length > end_of_data)
+ {
+ /* Data didn't fit in page; Change to use tmp buffer */
+ tmp_data_used= tmp_data;
+ tmp_data= info->rec_buff;
+ }
+ memcpy((char*) tmp_data, (char*) field_pos, length);
+ tmp_data+= length;
+ }
+
+ block= head_block + head_block->sub_blocks; /* Point to first blob data */
+
+ end_column= column + share->base.blobs;
+ blob_lengths= row->blob_lengths;
+ if (!tmp_data_used)
+ {
+ /* Still room on page; Copy as many blobs we can into this page */
+ data= tmp_data;
+ for (; column < end_column &&
+ *blob_lengths <= (ulong)(end_of_data - data);
+ column++, blob_lengths++)
+ {
+ uchar *tmp_pos;
+ uint length;
+ if (!*blob_lengths) /* Null or "" */
+ continue;
+ length= column->length - portable_sizeof_char_ptr;
+ memcpy_fixed((uchar*) &tmp_pos, record + column->offset + length,
+ sizeof(char*));
+ memcpy(data, tmp_pos, *blob_lengths);
+ data+= *blob_lengths;
+ /* Skip over tail page that was to be used to store blob */
+ block++;
+ bitmap_blocks->tail_page_skipped= 1;
+ }
+ if (head_block->sub_blocks > 1)
+ {
+ /* We have allocated pages that where not used */
+ bitmap_blocks->page_skipped= 1;
+ }
+ }
+ else
+ data= tmp_data_used; /* Get last used on page */
+
+ {
+ /* Update page directory */
+ uint length= (uint) (data - row_pos->data);
+ DBUG_PRINT("info", ("Used head length on page: %u", length));
+ DBUG_ASSERT(data <= end_of_data);
+ if (length < info->s->base.min_row_length)
+ {
+ uint diff_length= info->s->base.min_row_length - length;
+ bzero(data, diff_length);
+ data+= diff_length;
+ length= info->s->base.min_row_length;
+ }
+ int2store(row_pos->dir + 2, length);
+ /* update empty space at start of block */
+ row_pos->empty_space-= length;
+ int2store(page_buff + EMPTY_SPACE_OFFSET, row_pos->empty_space);
+ /* Mark in bitmaps how the current page was actually used */
+ head_block->empty_space= row_pos->empty_space;
+ if (page_buff[DIR_COUNT_OFFSET] == MAX_ROWS_PER_PAGE)
+ head_block->empty_space= 0; /* Page is full */
+ head_block->used= BLOCKUSED_USED;
+ }
+
+ /*
+ Now we have to write tail pages, as we need to store the position
+ to them in the row extent header.
+
+ We first write out all blob tails, to be able to store them in
+ the current page or 'tmp_data'.
+
+ Then we write the tail of the non-blob fields (The position to the
+ tail page is stored either in row header, the extents in the head
+ page or in the first full page of the non-blob data. It's never in
+ the tail page of the non-blob data)
+ */
+
+ blob_full_pages_exists= 0;
+ if (row_extents_in_use)
+ {
+ if (column != end_column) /* If blob fields */
+ {
+ MARIA_COLUMNDEF *save_column= column;
+ MARIA_BITMAP_BLOCK *save_block= block;
+ MARIA_BITMAP_BLOCK *end_block;
+ ulong *save_blob_lengths= blob_lengths;
+
+ for (; column < end_column; column++, blob_lengths++)
+ {
+ uchar *blob_pos;
+ if (!*blob_lengths) /* Null or "" */
+ continue;
+ if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
+ {
+ uint length;
+ length= column->length - portable_sizeof_char_ptr;
+ memcpy_fixed((uchar *) &blob_pos, record + column->offset + length,
+ sizeof(char*));
+ length= *blob_lengths % FULL_PAGE_SIZE(block_size); /* tail size */
+ if (length != *blob_lengths)
+ blob_full_pages_exists= 1;
+ if (write_tail(info, block + block->sub_blocks-1,
+ blob_pos + *blob_lengths - length,
+ length))
+ goto disk_err;
+ }
+ else
+ blob_full_pages_exists= 1;
+
+ for (end_block= block + block->sub_blocks; block < end_block; block++)
+ {
+ /*
+ Set only a bit, to not cause bitmap code to believe a block is full
+ when there is still a lot of entries in it
+ */
+ block->used|= BLOCKUSED_USED;
+ }
+ }
+ column= save_column;
+ block= save_block;
+ blob_lengths= save_blob_lengths;
+ }
+
+ if (tmp_data_used) /* non blob data overflows */
+ {
+ MARIA_BITMAP_BLOCK *cur_block, *end_block, *last_head_block;
+ MARIA_BITMAP_BLOCK *head_tail_block= 0;
+ ulong length;
+ ulong data_length= (tmp_data - info->rec_buff);
+
+#ifdef SANITY_CHECKS
+ if (head_block->sub_blocks == 1)
+ goto crashed; /* no reserved full or tails */
+#endif
+ /*
+ Find out where to write tail for non-blob fields.
+
+ Problem here is that the bitmap code may have allocated more
+ space than we need. We have to handle the following cases:
+
+ - Bitmap code allocated a tail page we don't need.
+ - The last full page allocated needs to be changed to a tail page
+ (Because we where able to put more data on the head page than
+ the bitmap allocation assumed)
+
+ The reserved pages in bitmap_blocks for the main page has one of
+ the following allocations:
+ - Full pages, with following blocks:
+ # * full pages
+ empty page ; To be used if we change last full to tail page. This
+ has 'count' = 0.
+ tail page (optional, if last full page was part full)
+ - One tail page
+ */
+
+ cur_block= head_block + 1;
+ end_block= head_block + head_block->sub_blocks;
+ /*
+ Loop until we have find a block bigger than we need or
+ we find the empty page block.
+ */
+ while (data_length >= (length= (cur_block->page_count *
+ FULL_PAGE_SIZE(block_size))) &&
+ cur_block->page_count)
+ {
+#ifdef SANITY_CHECKS
+ if ((cur_block == end_block) || (cur_block->used & BLOCKUSED_USED))
+ goto crashed;
+#endif
+ data_length-= length;
+ (cur_block++)->used= BLOCKUSED_USED;
+ }
+ last_head_block= cur_block;
+ if (data_length)
+ {
+ if (cur_block->page_count == 0)
+ {
+ /* Skip empty filler block */
+ cur_block++;
+ }
+#ifdef SANITY_CHECKS
+ if ((cur_block >= end_block))
+ goto crashed;
+#endif
+ if (cur_block->used & BLOCKUSED_TAIL)
+ {
+ DBUG_ASSERT(data_length < MAX_TAIL_SIZE(block_size));
+ /* tail written to full tail page */
+ cur_block->used= BLOCKUSED_USED;
+ head_tail_block= cur_block;
+ }
+ else if (data_length > length - MAX_TAIL_SIZE(block_size))
+ {
+ /* tail written to full page */
+ cur_block->used= BLOCKUSED_USED;
+ if ((cur_block != end_block - 1) &&
+ (end_block[-1].used & BLOCKUSED_TAIL))
+ bitmap_blocks->tail_page_skipped= 1;
+ }
+ else
+ {
+ /*
+ cur_block is a full block, followed by an empty and optional
+ tail block. Change cur_block to a tail block or split it
+ into full blocks and tail blocks.
+
+ TODO:
+ If there is enough space on the following tail block, use
+ this instead of creating a new tail block.
+ */
+ DBUG_ASSERT(cur_block[1].page_count == 0);
+ if (cur_block->page_count == 1)
+ {
+ /* convert full block to tail block */
+ cur_block->used= BLOCKUSED_USED | BLOCKUSED_TAIL;
+ head_tail_block= cur_block;
+ }
+ else
+ {
+ DBUG_ASSERT(data_length < length - FULL_PAGE_SIZE(block_size));
+ DBUG_PRINT("info", ("Splitting blocks into full and tail"));
+ cur_block[1].page= (cur_block->page + cur_block->page_count - 1);
+ cur_block[1].page_count= 1; /* Avoid DBUG_ASSERT */
+ cur_block[1].used= BLOCKUSED_USED | BLOCKUSED_TAIL;
+ cur_block->page_count--;
+ cur_block->used= BLOCKUSED_USED;
+ last_head_block= head_tail_block= cur_block+1;
+ }
+ if (end_block[-1].used & BLOCKUSED_TAIL)
+ bitmap_blocks->tail_page_skipped= 1;
+ }
+ }
+ else
+ {
+ /* Must be an empty or tail page */
+ DBUG_ASSERT(cur_block->page_count == 0 ||
+ cur_block->used & BLOCKUSED_TAIL);
+ if (end_block[-1].used & BLOCKUSED_TAIL)
+ bitmap_blocks->tail_page_skipped= 1;
+ }
+
+ /*
+ Write all extents into page or tmp_data
+
+ Note that we still don't have a correct position for the tail
+ of the non-blob fields.
+ */
+ store_extent_info(row_extents_first_part,
+ row_extents_second_part,
+ head_block+1, bitmap_blocks->count - 1);
+ if (head_tail_block)
+ {
+ ulong data_length= (tmp_data - info->rec_buff);
+ uint length;
+ uchar *extent_data;
+
+ length= (uint) (data_length % FULL_PAGE_SIZE(block_size));
+ if (write_tail(info, head_tail_block,
+ info->rec_buff + data_length - length,
+ length))
+ goto disk_err;
+ tmp_data-= length; /* Remove the tail */
+ if (tmp_data == info->rec_buff)
+ {
+ /* We have no full blocks to write for the head part */
+ tmp_data_used= 0;
+ }
+
+ /* Store the tail position for the non-blob fields */
+ if (head_tail_block == head_block + 1)
+ {
+ /*
+ We had a head block + tail block, which means that the
+ tail block is the first extent
+ */
+ extent_data= row_extents_first_part;
+ }
+ else
+ {
+ /*
+ We have a head block + some full blocks + tail block
+ last_head_block is pointing after the last used extent
+ for the head block.
+ */
+ extent_data= row_extents_second_part +
+ ((last_head_block - head_block) - 2) * ROW_EXTENT_SIZE;
+ }
+ DBUG_ASSERT(uint2korr(extent_data+5) & TAIL_BIT);
+ page_store(extent_data, head_tail_block->page);
+ int2store(extent_data + PAGE_STORE_SIZE, head_tail_block->page_count);
+ }
+ }
+ else
+ store_extent_info(row_extents_first_part,
+ row_extents_second_part,
+ head_block+1, bitmap_blocks->count - 1);
+ }
+
+ if (share->now_transactional)
+ {
+ uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+ size_t data_length= (size_t) (data - row_pos->data);
+
+ /* Log REDO changes of head page */
+ page_store(log_data + FILEID_STORE_SIZE, head_block->page);
+ dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
+ row_pos->rownr);
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (char*) row_pos->data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= data_length;
+ if (translog_write_record(&lsn, LOGREC_REDO_INSERT_ROW_HEAD, info->trn,
+ info, sizeof(log_data) + data_length,
+ TRANSLOG_INTERNAL_PARTS + 2, log_array,
+ log_data))
+ goto disk_err;
+ }
+
+ /* Increase data file size, if extended */
+ position= (my_off_t) head_block->page * block_size;
+ if (info->state->data_file_length <= position)
+ info->state->data_file_length= position + block_size;
+
+ DBUG_ASSERT(share->pagecache->block_size == block_size);
+ if (pagecache_write(share->pagecache,
+ &info->dfile, head_block->page, 0,
+ page_buff, share->page_type,
+ head_block_is_read ? PAGECACHE_LOCK_WRITE_TO_READ :
+ PAGECACHE_LOCK_READ,
+ head_block_is_read ? PAGECACHE_PIN_LEFT_PINNED :
+ PAGECACHE_PIN,
+ PAGECACHE_WRITE_DELAY, &page_link.link))
+ goto disk_err;
+ page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
+ if (head_block_is_read)
+ {
+ /* Head page is always the first pinned page */
+ set_dynamic(&info->pinned_pages, (void*) &page_link, 0);
+ }
+ else
+ push_dynamic(&info->pinned_pages, (void*) &page_link);
+
+ if (share->now_transactional && (tmp_data_used || blob_full_pages_exists))
+ {
+ /*
+ Log REDO writes for all full pages (head part and all blobs)
+ We write all here to be able to generate the UNDO record early
+ so that we can write the LSN for the UNDO record to all full pages.
+ */
+ uchar tmp_log_data[FILEID_STORE_SIZE + LSN_STORE_SIZE + PAGE_STORE_SIZE +
+ ROW_EXTENT_SIZE * ROW_EXTENTS_ON_STACK];
+ uchar *log_data, *log_pos;
+ LEX_STRING tmp_log_array[TRANSLOG_INTERNAL_PARTS + 2 +
+ ROW_EXTENTS_ON_STACK];
+ LEX_STRING *log_array_pos, *log_array;
+ int error;
+ ulong log_entry_length= 0;
+
+ /* If few extents, then allocate things on stack to avoid a malloc call */
+ if (bitmap_blocks->count < ROW_EXTENTS_ON_STACK)
+ {
+ log_array= tmp_log_array;
+ log_data= tmp_log_data;
+ }
+ else
+ {
+ if (my_multi_malloc(MY_WME, &log_array,
+ (uint) ((bitmap_blocks->count +
+ TRANSLOG_INTERNAL_PARTS + 2) *
+ sizeof(*log_array)),
+ &log_data, bitmap_blocks->count * ROW_EXTENT_SIZE,
+ NullS))
+ goto disk_err;
+ }
+ log_pos= log_data + FILEID_STORE_SIZE;
+ log_array_pos= log_array+ TRANSLOG_INTERNAL_PARTS+1;
+
+ if (tmp_data_used)
+ {
+ /* Full head pages */
+ size_t data_length= (ulong) (tmp_data - info->rec_buff);
+ log_pos= store_page_range(log_pos, head_block+1, block_size,
+ data_length);
+ log_array_pos->str= (char*) info->rec_buff;
+ log_array_pos->length= data_length;
+ log_entry_length+= data_length;
+ log_array_pos++;
+ }
+ if (blob_full_pages_exists)
+ {
+ MARIA_COLUMNDEF *tmp_column= column;
+ ulong *tmp_blob_lengths= blob_lengths;
+ MARIA_BITMAP_BLOCK *tmp_block= block;
+
+ /* Full blob pages */
+ for (; tmp_column < end_column; tmp_column++, tmp_blob_lengths++)
+ {
+ ulong blob_length;
+ uint length;
+
+ if (!*tmp_blob_lengths) /* Null or "" */
+ continue;
+ length= tmp_column->length - portable_sizeof_char_ptr;
+ blob_length= *tmp_blob_lengths;
+ if (tmp_block[tmp_block->sub_blocks - 1].used & BLOCKUSED_TAIL)
+ blob_length-= (blob_length % FULL_PAGE_SIZE(block_size));
+ if (blob_length)
+ {
+ memcpy_fixed((uchar*) &log_array_pos->str,
+ record + column->offset + length,
+ sizeof(uchar*));
+ log_array_pos->length= blob_length;
+ log_entry_length+= blob_length;
+ log_array_pos++;
+
+ log_pos= store_page_range(log_pos, tmp_block, block_size,
+ blob_length);
+ tmp_block+= tmp_block->sub_blocks;
+ }
+ }
+ }
+
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (size_t) (log_pos -
+ log_data);
+ log_entry_length+= (log_pos - log_data);
+
+ /* trn->rec_lsn is already set earlier in this function */
+ error= translog_write_record(&lsn, LOGREC_REDO_INSERT_ROW_BLOBS,
+ info->trn, info, log_entry_length,
+ (uint) (log_array_pos - log_array),
+ log_array, log_data);
+ if (log_array != tmp_log_array)
+ my_free((uchar*) log_array, MYF(0));
+ if (error)
+ goto disk_err;
+ }
+
+ /* Write UNDO or CLR record */
+ lsn= 0;
+ if (share->now_transactional)
+ {
+ LEX_STRING *log_array= info->log_row_parts;
+
+ if (undo_lsn != LSN_ERROR)
+ {
+ uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + 1];
+ /* undo_lsn must be first for compression to work */
+ lsn_store(log_data, undo_lsn);
+ /*
+ Store if this CLR is about an UNDO_INSERT, UNDO_DELETE or UNDO_UPDATE;
+ in the first/second case, Recovery, when it sees the CLR_END in the
+ REDO phase, may decrement/increment the records' count.
+ */
+ log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE]= old_record ?
+ LOGREC_UNDO_ROW_UPDATE : LOGREC_UNDO_ROW_DELETE;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+
+ if (translog_write_record(&lsn, LOGREC_CLR_END,
+ info->trn, info, sizeof(log_data),
+ TRANSLOG_INTERNAL_PARTS + 1, log_array,
+ log_data + LSN_STORE_SIZE))
+ goto disk_err;
+ }
+ else
+ {
+ uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE +
+ PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
+
+ /* LOGREC_UNDO_ROW_INSERT & LOGREC_UNDO_ROW_INSERT share same header */
+ lsn_store(log_data, info->trn->undo_lsn);
+ page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE,
+ head_block->page);
+ dirpos_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
+ PAGE_STORE_SIZE,
+ row_pos->rownr);
+
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+
+ if (!old_record)
+ {
+ /* Write UNDO log record for the INSERT */
+ if (translog_write_record(&lsn, LOGREC_UNDO_ROW_INSERT,
+ info->trn, info, sizeof(log_data),
+ TRANSLOG_INTERNAL_PARTS + 1, log_array,
+ log_data + LSN_STORE_SIZE))
+ goto disk_err;
+ }
+ else
+ {
+ /* Write UNDO log record for the UPDATE */
+ size_t row_length;
+ uint row_parts_count;
+ row_length= fill_update_undo_parts(info, old_record, record,
+ log_array +
+ TRANSLOG_INTERNAL_PARTS + 1,
+ &row_parts_count);
+ if (translog_write_record(&lsn, LOGREC_UNDO_ROW_UPDATE, info->trn,
+ info, sizeof(log_data) + row_length,
+ TRANSLOG_INTERNAL_PARTS + 1 +
+ row_parts_count,
+ log_array, log_data + LSN_STORE_SIZE))
+ goto disk_err;
+ }
+ }
+ }
+ _ma_unpin_all_pages(info, lsn);
+
+ if (tmp_data_used)
+ {
+ /*
+ Write data stored in info->rec_buff to pages
+ This is the char/varchar data that didn't fit into the head page.
+ */
+ DBUG_ASSERT(bitmap_blocks->count != 0);
+ if (write_full_pages(info, info->trn->undo_lsn, head_block + 1,
+ info->rec_buff, (ulong) (tmp_data - info->rec_buff)))
+ goto disk_err;
+ }
+
+ /* Write rest of blobs (data, but no tails as they are already written) */
+ for (; column < end_column; column++, blob_lengths++)
+ {
+ uchar *blob_pos;
+ uint length;
+ ulong blob_length;
+ if (!*blob_lengths) /* Null or "" */
+ continue;
+ length= column->length - portable_sizeof_char_ptr;
+ memcpy_fixed((uchar*) &blob_pos, record + column->offset + length,
+ sizeof(char*));
+ /* remove tail part */
+ blob_length= *blob_lengths;
+ if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
+ blob_length-= (blob_length % FULL_PAGE_SIZE(block_size));
+
+ if (blob_length && write_full_pages(info, info->trn->undo_lsn, block,
+ blob_pos, blob_length))
+ goto disk_err;
+ block+= block->sub_blocks;
+ }
+ /* Release not used space in used pages */
+ if (_ma_bitmap_release_unused(info, bitmap_blocks))
+ goto disk_err;
+
+ _ma_finalize_row(info);
+ DBUG_RETURN(0);
+
+crashed:
+ /* Something was wrong with data on page */
+ my_errno= HA_ERR_WRONG_IN_RECORD;
+
+disk_err:
+ /**
+ @todo RECOVERY we are going to let dirty pages go to disk while we have
+ logged UNDO, this violates WAL. We must mark the table corrupted!
+
+ @todo RECOVERY we have written some REDOs without a closing UNDO,
+ it's possible that a next operation by this transaction succeeds and then
+ Recovery would glue the "orphan REDOs" to the succeeded operation and
+ execute the failed REDOs. We need some mark "abort this group" in the
+ log, or mark the table corrupted (then user will repair it and thus REDOs
+ will be skipped).
+
+ @todo RECOVERY to not let write errors go unnoticed, pagecache_write()
+ should take a MARIA_HA* in argument, and it it
+ fails when flushing a page to disk it should call
+ (*the_maria_ha->write_error_func)(the_maria_ha)
+ and this hook will mark the table corrupted.
+ Maybe hook should be stored in the pagecache's block structure, or in a
+ hash "file->maria_ha*".
+
+ @todo RECOVERY we should distinguish below between log write error and
+ table write error. The former should stop Maria immediately, the latter
+ should mark the table corrupted.
+ */
+ /*
+ Unpin all pinned pages to not cause problems for disk cache. This is
+ safe to call even if we already called _ma_unpin_all_pages() above.
+ */
+ _ma_unpin_all_pages_and_finalize_row(info, 0);
+
+ DBUG_RETURN(1);
+}
+
+
+/*
+ @brief Write a record
+
+ @fn allocate_and_write_block_record()
+ @param info Maria handler
+ @param record Record to write
+ @param row Information about fields in 'record'
+ @param undo_lsn <> LSN_ERROR if we are executing an UNDO
+
+ @return
+ @retval 0 ok
+ @retval 1 Error
+*/
+
+static my_bool allocate_and_write_block_record(MARIA_HA *info,
+ const uchar *record,
+ MARIA_ROW *row,
+ LSN undo_lsn)
+{
+ struct st_row_pos_info row_pos;
+ MARIA_BITMAP_BLOCKS *blocks= &row->insert_blocks;
+ DBUG_ENTER("allocate_and_write_block_record");
+
+ if (_ma_bitmap_find_place(info, row, blocks))
+ DBUG_RETURN(1); /* Error reading bitmap */
+ /* page will be pinned & locked by get_head_or_tail_page */
+ if (get_head_or_tail_page(info, blocks->block, info->buff,
+ row->space_on_head_page, HEAD_PAGE,
+ PAGECACHE_LOCK_WRITE, &row_pos))
+ DBUG_RETURN(1);
+ row->lastpos= ma_recordpos(blocks->block->page, row_pos.rownr);
+ if (info->s->calc_checksum)
+ row->checksum= (info->s->calc_checksum)(info, record);
+ if (write_block_record(info, (uchar*) 0, record, row,
+ blocks, blocks->block->org_bitmap_value != 0,
+ &row_pos, undo_lsn))
+ DBUG_RETURN(1); /* Error reading bitmap */
+ DBUG_PRINT("exit", ("Rowid: %lu (%lu:%u)", (ulong) row->lastpos,
+ (ulong) ma_recordpos_to_page(row->lastpos),
+ ma_recordpos_to_dir_entry(row->lastpos)));
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Write a record and return rowid for it
+
+ SYNOPSIS
+ _ma_write_init_block_record()
+ info Maria handler
+ record Record to write
+
+ NOTES
+ This is done BEFORE we write the keys to the row!
+
+ RETURN
+ HA_OFFSET_ERROR Something went wrong
+ # Rowid for row
+*/
+
+MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info,
+ const uchar *record)
+{
+ DBUG_ENTER("_ma_write_init_block_record");
+
+ calc_record_size(info, record, &info->cur_row);
+ if (allocate_and_write_block_record(info, record,
+ &info->cur_row, LSN_ERROR))
+ DBUG_RETURN(HA_OFFSET_ERROR);
+ DBUG_RETURN(info->cur_row.lastpos);
+}
+
+
+/*
+ Dummy function for (*info->s->write_record)()
+
+ Nothing to do here, as we already wrote the record in
+ _ma_write_init_block_record()
+*/
+
+my_bool _ma_write_block_record(MARIA_HA *info __attribute__ ((unused)),
+ const uchar *record __attribute__ ((unused)))
+{
+ return 0; /* Row already written */
+}
+
+
+/**
+ @brief Remove row written by _ma_write_block_record() and log undo
+
+ @param info Maria handler
+
+ @note
+ This is called in case we got a duplicate unique key while
+ writing keys.
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool _ma_write_abort_block_record(MARIA_HA *info)
+{
+ my_bool res= 0;
+ MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
+ MARIA_BITMAP_BLOCK *block, *end;
+ LSN lsn= LSN_IMPOSSIBLE;
+ DBUG_ENTER("_ma_write_abort_block_record");
+
+ if (delete_head_or_tail(info,
+ ma_recordpos_to_page(info->cur_row.lastpos),
+ ma_recordpos_to_dir_entry(info->cur_row.lastpos), 1,
+ 0))
+ res= 1;
+ for (block= blocks->block + 1, end= block + blocks->count - 1; block < end;
+ block++)
+ {
+ if (block->used & BLOCKUSED_TAIL)
+ {
+ /*
+ block->page_count is set to the tail directory entry number in
+ write_block_record()
+ */
+ if (delete_head_or_tail(info, block->page, block->page_count & ~TAIL_BIT,
+ 0, 0))
+ res= 1;
+ }
+ else if (block->used & BLOCKUSED_USED)
+ {
+ if (free_full_page_range(info, block->page, block->page_count))
+ res= 1;
+ }
+ }
+
+ if (info->s->now_transactional)
+ {
+ LSN previous_undo_lsn;
+ TRANSLOG_HEADER_BUFFER rec;
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+ uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + 1];
+ int len;
+ /*
+ We do need the code above (delete_head_or_tail() etc) for
+ non-transactional tables.
+ For transactional tables we could skip this code above and just execute
+ the UNDO_INSERT, but we try to have one code path.
+ Write CLR record, because we are somehow undoing UNDO_ROW_INSERT.
+ When we have logging for keys: as maria_write() first writes the row
+ then the keys, and if failure, deletes the keys then the rows,
+ info->trn->undo_lsn below will properly point to the UNDO of the
+ UNDO_ROW_INSERT for this row.
+ */
+ if ((len= translog_read_record_header(info->trn->undo_lsn, &rec)) ==
+ RECHEADER_READ_ERROR)
+ {
+ res= 1;
+ goto end;
+ }
+ DBUG_ASSERT(rec.type == LOGREC_UNDO_ROW_INSERT);
+ previous_undo_lsn= lsn_korr(rec.header);
+ lsn_store(log_data, previous_undo_lsn);
+ log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE]= LOGREC_UNDO_ROW_INSERT;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+ if (translog_write_record(&lsn, LOGREC_CLR_END,
+ info->trn, info, sizeof(log_data),
+ TRANSLOG_INTERNAL_PARTS + 1, log_array,
+ log_data + LSN_STORE_SIZE))
+ res= 1;
+ }
+end:
+ _ma_unpin_all_pages_and_finalize_row(info, lsn);
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Update a record
+
+ NOTES
+ For the moment, we assume that info->curr_row.extents is always updated
+ when a row is read. In the future we may decide to read this on demand
+ for rows split into many extents.
+*/
+
+static my_bool _ma_update_block_record2(MARIA_HA *info,
+ MARIA_RECORD_POS record_pos,
+ const uchar *oldrec,
+ const uchar *record,
+ LSN undo_lsn)
+{
+ MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
+ uchar *buff;
+ MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row;
+ MARIA_PINNED_PAGE page_link;
+ uint rownr, org_empty_size, head_length;
+ uint block_size= info->s->block_size;
+ uchar *dir;
+ ulonglong page;
+ struct st_row_pos_info row_pos;
+ MARIA_SHARE *share= info->s;
+ DBUG_ENTER("_ma_update_block_record2");
+ DBUG_PRINT("enter", ("rowid: %lu", (long) record_pos));
+
+#ifdef ENABLE_IF_PROBLEM_WITH_UPDATE
+ DBUG_DUMP("oldrec", oldrec, share->base.reclength);
+ DBUG_DUMP("newrec", record, share->base.reclength);
+#endif
+
+ /* checksum was computed by maria_update() already and put into cur_row */
+ new_row->checksum= cur_row->checksum;
+ calc_record_size(info, record, new_row);
+ page= ma_recordpos_to_page(record_pos);
+
+ DBUG_ASSERT(share->pagecache->block_size == block_size);
+ if (!(buff= pagecache_read(share->pagecache,
+ &info->dfile, (my_off_t) page, 0,
+ info->buff, share->page_type,
+ PAGECACHE_LOCK_WRITE, &page_link.link)))
+ DBUG_RETURN(1);
+ page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+ push_dynamic(&info->pinned_pages, (void*) &page_link);
+
+ org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
+ rownr= ma_recordpos_to_dir_entry(record_pos);
+ dir= (buff + block_size - DIR_ENTRY_SIZE * rownr -
+ DIR_ENTRY_SIZE - PAGE_SUFFIX_SIZE);
+
+ if ((org_empty_size + cur_row->head_length) >= new_row->total_length)
+ {
+ uint rec_offset, length;
+ MARIA_BITMAP_BLOCK block;
+
+ /*
+ We can fit the new row in the same page as the original head part
+ of the row
+ */
+ block.org_bitmap_value= _ma_free_size_to_head_pattern(&share->bitmap,
+ org_empty_size);
+
+ if (extend_area_on_page(buff, dir, rownr, share->block_size,
+ new_row->total_length, &org_empty_size,
+ &rec_offset, &length))
+ DBUG_RETURN(1);
+
+ row_pos.buff= buff;
+ row_pos.rownr= rownr;
+ row_pos.empty_space= org_empty_size;
+ row_pos.dir= dir;
+ row_pos.data= buff + rec_offset;
+ row_pos.length= length;
+ blocks->block= &block;
+ blocks->count= 1;
+ block.page= page;
+ block.sub_blocks= 1;
+ block.used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP;
+ block.empty_space= row_pos.empty_space;
+ /* Update cur_row, if someone calls update at once again */
+ cur_row->head_length= new_row->total_length;
+
+ if (cur_row->extents_count && free_full_pages(info, cur_row))
+ goto err;
+ DBUG_RETURN(write_block_record(info, oldrec, record, new_row, blocks,
+ 1, &row_pos, undo_lsn));
+ }
+ /*
+ Allocate all size in block for record
+ TODO:
+ Need to improve this to do compact if we can fit one more blob into
+ the head page
+ */
+ head_length= uint2korr(dir + 2);
+ if (buff[PAGE_TYPE_OFFSET] & PAGE_CAN_BE_COMPACTED && org_empty_size &&
+ (head_length < new_row->head_length ||
+ (new_row->total_length <= head_length &&
+ org_empty_size + head_length >= new_row->total_length)))
+ {
+ compact_page(buff, share->block_size, rownr, 1);
+ org_empty_size= 0;
+ head_length= uint2korr(dir + 2);
+ }
+
+ /* Delete old row */
+ if (*cur_row->tail_positions && delete_tails(info, cur_row->tail_positions))
+ goto err;
+ if (cur_row->extents_count && free_full_pages(info, cur_row))
+ goto err;
+ if (_ma_bitmap_find_new_place(info, new_row, page, head_length, blocks))
+ goto err;
+
+ row_pos.buff= buff;
+ row_pos.rownr= rownr;
+ row_pos.empty_space= org_empty_size + head_length;
+ row_pos.dir= dir;
+ row_pos.data= buff + uint2korr(dir);
+ row_pos.length= head_length;
+ DBUG_RETURN(write_block_record(info, oldrec, record, new_row, blocks, 1,
+ &row_pos, undo_lsn));
+
+err:
+ _ma_unpin_all_pages_and_finalize_row(info, 0);
+ DBUG_RETURN(1);
+}
+
+
+/* Wrapper for _ma_update_block_record2() used by ma_update() */
+
+
+my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS record_pos,
+ const uchar *orig_rec, const uchar *new_rec)
+{
+ return _ma_update_block_record2(info, record_pos, orig_rec, new_rec,
+ LSN_ERROR);
+}
+
+
+/*
+ Delete a directory entry
+
+ SYNOPSIS
+ delete_dir_entry()
+ buff Page buffer
+ block_size Block size
+ record_number Record number to delete
+ empty_space Empty space on page after delete
+
+ RETURN
+ -1 Error on page
+ 0 ok
+ 1 Page is now empty
+*/
+
+static int delete_dir_entry(uchar *buff, uint block_size, uint record_number,
+ uint *empty_space_res)
+{
+ uint number_of_records= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET];
+ uint length, empty_space;
+ uchar *dir, *org_dir;
+ DBUG_ENTER("delete_dir_entry");
+
+#ifdef SANITY_CHECKS
+ if (record_number >= number_of_records ||
+ record_number > ((block_size - LSN_SIZE - PAGE_TYPE_SIZE - 1 -
+ PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE))
+ {
+ DBUG_PRINT("error", ("record_number: %u number_of_records: %u",
+ record_number, number_of_records));
+
+ DBUG_RETURN(-1);
+ }
+#endif
+
+ empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+ org_dir= dir= (buff + block_size - DIR_ENTRY_SIZE * record_number -
+ DIR_ENTRY_SIZE - PAGE_SUFFIX_SIZE);
+ length= uint2korr(dir + 2);
+
+ if (record_number == number_of_records - 1)
+ {
+ /* Delete this entry and all following empty directory entries */
+ uchar *end= buff + block_size - PAGE_SUFFIX_SIZE;
+ do
+ {
+ number_of_records--;
+ dir+= DIR_ENTRY_SIZE;
+ empty_space+= DIR_ENTRY_SIZE;
+ } while (dir < end && dir[0] == 0 && dir[1] == 0);
+
+ if (number_of_records == 0)
+ {
+ buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
+ *empty_space_res= block_size;
+ DBUG_RETURN(1);
+ }
+ buff[DIR_COUNT_OFFSET]= (uchar) number_of_records;
+ }
+ empty_space+= length;
+
+ /* Update directory */
+ org_dir[0]= org_dir[1]= 0; org_dir[2]= org_dir[3]= 0; /* Delete entry */
+ int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
+ buff[PAGE_TYPE_OFFSET]|= (uchar) PAGE_CAN_BE_COMPACTED;
+
+ *empty_space_res= empty_space;
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Delete a head a tail part
+
+ SYNOPSIS
+ delete_head_or_tail()
+ info Maria handler
+ page Page (not file offset!) on which the row is
+ head 1 if this is a head page
+ from_update 1 if we are called from update. In this case we
+ leave the page as write locked as we may put
+ the new row into the old position.
+
+ NOTES
+ Uses info->keyread_buff
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool delete_head_or_tail(MARIA_HA *info,
+ ulonglong page, uint record_number,
+ my_bool head, my_bool from_update)
+{
+ MARIA_SHARE *share= info->s;
+ uint empty_space;
+ uint block_size= share->block_size;
+ uchar *buff;
+ LSN lsn;
+ MARIA_PINNED_PAGE page_link;
+ int res;
+ enum pagecache_page_lock lock_at_write, lock_at_unpin;
+ DBUG_ENTER("delete_head_or_tail");
+
+ info->keyread_buff_used= 1;
+ DBUG_ASSERT(info->s->pagecache->block_size == block_size);
+ if (!(buff= pagecache_read(share->pagecache,
+ &info->dfile, page, 0,
+ info->keyread_buff,
+ info->s->page_type,
+ PAGECACHE_LOCK_WRITE, &page_link.link)))
+ DBUG_RETURN(1);
+ page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+ push_dynamic(&info->pinned_pages, (void*) &page_link);
+
+ if (from_update)
+ {
+ lock_at_write= PAGECACHE_LOCK_LEFT_WRITELOCKED;
+ lock_at_unpin= PAGECACHE_LOCK_WRITE_UNLOCK;
+ }
+ else
+ {
+ lock_at_write= PAGECACHE_LOCK_WRITE_TO_READ;
+ lock_at_unpin= PAGECACHE_LOCK_READ_UNLOCK;
+ }
+
+ res= delete_dir_entry(buff, block_size, record_number, &empty_space);
+ if (res < 0)
+ DBUG_RETURN(1);
+ if (res == 0) /* after our deletion, page is still not empty */
+ {
+ uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+ if (info->s->now_transactional)
+ {
+ /* Log REDO data */
+ page_store(log_data + FILEID_STORE_SIZE, page);
+ dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
+ record_number);
+
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+ if (translog_write_record(&lsn, (head ? LOGREC_REDO_PURGE_ROW_HEAD :
+ LOGREC_REDO_PURGE_ROW_TAIL),
+ info->trn, info, sizeof(log_data),
+ TRANSLOG_INTERNAL_PARTS + 1, log_array,
+ log_data))
+ DBUG_RETURN(1);
+ }
+ if (pagecache_write(share->pagecache,
+ &info->dfile, page, 0,
+ buff, share->page_type,
+ lock_at_write,
+ PAGECACHE_PIN_LEFT_PINNED,
+ PAGECACHE_WRITE_DELAY, &page_link.link))
+ DBUG_RETURN(1);
+ }
+ else /* page is now empty */
+ {
+ if (info->s->now_transactional)
+ {
+ uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
+ PAGE_STORE_SIZE + PAGERANGE_STORE_SIZE];
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+ pagerange_store(log_data + FILEID_STORE_SIZE, 1);
+ page_store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE, page);
+ pagerange_store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
+ PAGE_STORE_SIZE, 1);
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+ if (translog_write_record(&lsn, LOGREC_REDO_PURGE_BLOCKS,
+ info->trn, info, sizeof(log_data),
+ TRANSLOG_INTERNAL_PARTS + 1, log_array,
+ log_data))
+ DBUG_RETURN(1);
+ }
+ /* Write the empty page (needed only for REPAIR to work) */
+ if (pagecache_write(share->pagecache,
+ &info->dfile, page, 0,
+ buff, share->page_type,
+ lock_at_write,
+ PAGECACHE_PIN_LEFT_PINNED,
+ PAGECACHE_WRITE_DELAY, &page_link.link))
+ DBUG_RETURN(1);
+
+ DBUG_ASSERT(empty_space >= info->s->bitmap.sizes[0]);
+ }
+ /* The page is pinned with a read lock */
+ page_link.unlock= lock_at_unpin;
+ set_dynamic(&info->pinned_pages, (void*) &page_link,
+ info->pinned_pages.elements-1);
+
+ DBUG_PRINT("info", ("empty_space: %u", empty_space));
+ DBUG_RETURN(_ma_bitmap_set(info, page, head, empty_space));
+}
+
+
+/*
+ delete all tails
+
+ SYNOPSIS
+ delete_tails()
+ info Handler
+ tails Pointer to vector of tail positions, ending with 0
+
+ NOTES
+ Uses info->keyread_buff
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails)
+{
+ my_bool res= 0;
+ DBUG_ENTER("delete_tails");
+ for (; *tails; tails++)
+ {
+ if (delete_head_or_tail(info,
+ ma_recordpos_to_page(*tails),
+ ma_recordpos_to_dir_entry(*tails), 0, 1))
+ res= 1;
+ }
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Delete a record
+
+ NOTES
+ For the moment, we assume that info->cur_row.extents is always updated
+ when a row is read. In the future we may decide to read this on demand
+ for rows with many splits.
+*/
+
+my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record)
+{
+ ulonglong page;
+ uint record_number;
+ DBUG_ENTER("_ma_delete_block_record");
+
+ page= ma_recordpos_to_page(info->cur_row.lastpos);
+ record_number= ma_recordpos_to_dir_entry(info->cur_row.lastpos);
+ DBUG_PRINT("enter", ("Rowid: %lu (%lu:%u)", (ulong) info->cur_row.lastpos,
+ (ulong) page, record_number));
+
+ if (delete_head_or_tail(info, page, record_number, 1, 0) ||
+ delete_tails(info, info->cur_row.tail_positions))
+ goto err;
+
+ if (info->cur_row.extents && free_full_pages(info, &info->cur_row))
+ goto err;
+
+ if (info->s->now_transactional)
+ {
+ LSN lsn;
+ uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE +
+ DIR_COUNT_SIZE];
+ size_t row_length;
+ uint row_parts_count;
+
+ /* Write UNDO record */
+ lsn_store(log_data, info->trn->undo_lsn);
+ page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, page);
+ dirpos_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
+ PAGE_STORE_SIZE, record_number);
+
+ info->log_row_parts[TRANSLOG_INTERNAL_PARTS].str= (char*) log_data;
+ info->log_row_parts[TRANSLOG_INTERNAL_PARTS].length= sizeof(log_data);
+ row_length= fill_insert_undo_parts(info, record, info->log_row_parts +
+ TRANSLOG_INTERNAL_PARTS + 1,
+ &row_parts_count);
+
+ if (translog_write_record(&lsn, LOGREC_UNDO_ROW_DELETE, info->trn,
+ info, sizeof(log_data) + row_length,
+ TRANSLOG_INTERNAL_PARTS + 1 + row_parts_count,
+ info->log_row_parts, log_data + LSN_STORE_SIZE))
+ goto err;
+
+ }
+
+ _ma_unpin_all_pages_and_finalize_row(info, info->trn->undo_lsn);
+ DBUG_RETURN(0);
+
+err:
+ _ma_unpin_all_pages_and_finalize_row(info, 0);
+ DBUG_RETURN(1);
+}
+
+
+/****************************************************************************
+ Reading of records
+****************************************************************************/
+
+/*
+ Read position to record from record directory at end of page
+
+ SYNOPSIS
+ get_record_position()
+ buff page buffer
+ block_size block size for page
+ record_number Record number in index
+ end_of_data pointer to end of data for record
+
+ RETURN
+ 0 Error in data
+ # Pointer to start of record.
+ In this case *end_of_data is set.
+*/
+
+static uchar *get_record_position(uchar *buff, uint block_size,
+ uint record_number, uchar **end_of_data)
+{
+ uint number_of_records= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET];
+ uchar *dir;
+ uchar *data;
+ uint offset, length;
+
+#ifdef SANITY_CHECKS
+ if (record_number >= number_of_records ||
+ record_number > ((block_size - PAGE_HEADER_SIZE - PAGE_SUFFIX_SIZE) /
+ DIR_ENTRY_SIZE))
+ {
+ DBUG_PRINT("error",
+ ("Wrong row number: record_number: %u number_of_records: %u",
+ record_number, number_of_records));
+ return 0;
+ }
+#endif
+
+ dir= (buff + block_size - DIR_ENTRY_SIZE * record_number -
+ DIR_ENTRY_SIZE - PAGE_SUFFIX_SIZE);
+ offset= uint2korr(dir);
+ length= uint2korr(dir + 2);
+#ifdef SANITY_CHECKS
+ if (offset < PAGE_HEADER_SIZE ||
+ offset + length > (block_size -
+ number_of_records * DIR_ENTRY_SIZE -
+ PAGE_SUFFIX_SIZE))
+ {
+ DBUG_PRINT("error",
+ ("Wrong row position: record_number: %u offset: %u "
+ "length: %u number_of_records: %u",
+ record_number, offset, length, number_of_records));
+ return 0;
+ }
+#endif
+ data= buff + offset;
+ *end_of_data= data + length;
+ return data;
+}
+
+
+/*
+ Init extent
+
+ NOTES
+ extent is a cursor over which pages to read
+*/
+
+static void init_extent(MARIA_EXTENT_CURSOR *extent, uchar *extent_info,
+ uint extents, MARIA_RECORD_POS *tail_positions)
+{
+ uint page_count;
+ extent->extent= extent_info;
+ extent->extent_count= extents;
+ extent->page= page_korr(extent_info); /* First extent */
+ page_count= uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE);
+ extent->tail= page_count & TAIL_BIT;
+ if (extent->tail)
+ {
+ extent->page_count= 1;
+ extent->tail_row_nr= page_count & ~TAIL_BIT;
+ }
+ else
+ extent->page_count= page_count;
+ extent->tail_positions= tail_positions;
+ extent->lock_for_tail_pages= PAGECACHE_LOCK_LEFT_UNLOCKED;
+}
+
+
+/*
+ Read next extent
+
+ SYNOPSIS
+ read_next_extent()
+ info Maria handler
+ extent Pointer to current extent (this is updated to point
+ to next)
+ end_of_data Pointer to end of data in read block (out)
+
+ NOTES
+ New block is read into info->buff
+
+ RETURN
+ 0 Error; my_errno is set
+ # Pointer to start of data in read block
+ In this case end_of_data is updated to point to end of data.
+*/
+
+static uchar *read_next_extent(MARIA_HA *info, MARIA_EXTENT_CURSOR *extent,
+ uchar **end_of_data)
+{
+ MARIA_SHARE *share= info->s;
+ uchar *buff, *data;
+ MARIA_PINNED_PAGE page_link;
+ enum pagecache_page_lock lock;
+ DBUG_ENTER("read_next_extent");
+
+ if (!extent->page_count)
+ {
+ uint page_count;
+ if (!--extent->extent_count)
+ goto crashed;
+ extent->extent+= ROW_EXTENT_SIZE;
+ extent->page= page_korr(extent->extent);
+ page_count= uint2korr(extent->extent+ROW_EXTENT_PAGE_SIZE);
+ if (!page_count)
+ goto crashed;
+ extent->tail= page_count & TAIL_BIT;
+ if (extent->tail)
+ extent->tail_row_nr= page_count & ~TAIL_BIT;
+ else
+ extent->page_count= page_count;
+ DBUG_PRINT("info",("New extent. Page: %lu page_count: %u tail_flag: %d",
+ (ulong) extent->page, extent->page_count,
+ extent->tail != 0));
+ }
+ extent->first_extent= 0;
+
+ lock= PAGECACHE_LOCK_LEFT_UNLOCKED;
+ if (extent->tail)
+ lock= extent->lock_for_tail_pages;
+
+ DBUG_ASSERT(share->pagecache->block_size == share->block_size);
+ if (!(buff= pagecache_read(share->pagecache,
+ &info->dfile, extent->page, 0,
+ info->buff, share->page_type,
+ lock, &page_link.link)))
+ {
+ /* check if we tried to read over end of file (ie: bad data in record) */
+ if ((extent->page + 1) * share->block_size > info->state->data_file_length)
+ goto crashed;
+ DBUG_RETURN(0);
+ }
+
+ if (!extent->tail)
+ {
+ /* Full data page */
+ if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != BLOB_PAGE)
+ goto crashed;
+ extent->page++; /* point to next page */
+ extent->page_count--;
+ *end_of_data= buff + share->block_size;
+ info->cur_row.full_page_count++; /* For maria_chk */
+ DBUG_RETURN(extent->data_start= buff + LSN_SIZE + PAGE_TYPE_SIZE);
+ }
+
+ /* Found tail */
+ if (lock != PAGECACHE_LOCK_LEFT_UNLOCKED)
+ {
+ /* Read during redo */
+ page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+ push_dynamic(&info->pinned_pages, (void*) &page_link);
+ }
+
+ if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != TAIL_PAGE)
+ goto crashed;
+ *(extent->tail_positions++)= ma_recordpos(extent->page,
+ extent->tail_row_nr);
+ info->cur_row.tail_count++; /* For maria_chk */
+
+ if (!(data= get_record_position(buff, share->block_size,
+ extent->tail_row_nr,
+ end_of_data)))
+ goto crashed;
+ extent->data_start= data;
+ extent->page_count= 0; /* No more data in extent */
+ DBUG_RETURN(data);
+
+
+crashed:
+ my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */
+ DBUG_PRINT("error", ("wrong extent information"));
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Read data that may be split over many blocks
+
+ SYNOPSIS
+ read_long_data()
+ info Maria handler
+ to Store result string here (this is allocated)
+ extent Pointer to current extent position
+ data Current position in buffer
+ end_of_data End of data in buffer
+
+ NOTES
+ When we have to read a new buffer, it's read into info->buff
+
+ This loop is implemented by goto's instead of a for() loop as
+ the code is notable smaller and faster this way (and it's not nice
+ to jump into a for loop() or into a 'then' clause)
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool read_long_data(MARIA_HA *info, uchar *to, ulong length,
+ MARIA_EXTENT_CURSOR *extent,
+ uchar **data, uchar **end_of_data)
+{
+ DBUG_ENTER("read_long_data");
+ DBUG_PRINT("enter", ("length: %lu", length));
+ DBUG_ASSERT(*data <= *end_of_data);
+
+ /*
+ Fields are never split in middle. This means that if length > rest-of-data
+ we should start reading from the next extent. The reason we may have
+ data left on the page is that there fixed part of the row was less than
+ min_row_length and in this case the head block was extended to
+ min_row_length.
+
+ This may change in the future, which is why we have the loop written
+ the way it's written.
+ */
+ if (extent->first_extent && length > (ulong) (*end_of_data - *data))
+ *end_of_data= *data;
+
+ for(;;)
+ {
+ uint left_length;
+ left_length= (uint) (*end_of_data - *data);
+ if (likely(left_length >= length))
+ {
+ memcpy(to, *data, length);
+ (*data)+= length;
+ DBUG_RETURN(0);
+ }
+ memcpy(to, *data, left_length);
+ to+= left_length;
+ length-= left_length;
+ if (!(*data= read_next_extent(info, extent, end_of_data)))
+ break;
+ }
+ DBUG_RETURN(1);
+}
+
+
+/*
+ Read a record from page (helper function for _ma_read_block_record())
+
+ SYNOPSIS
+ _ma_read_block_record2()
+ info Maria handler
+ record Store record here
+ data Start of head data for row
+ end_of_data End of data for row
+
+ NOTES
+ The head page is already read by caller
+ Following data is update in info->cur_row:
+
+ cur_row.head_length is set to size of entry in head block
+ cur_row.tail_positions is set to point to all tail blocks
+ cur_row.extents points to extents data
+ cur_row.extents_counts contains number of extents
+ cur_row.empty_bits is set to empty bits
+ cur_row.field_lengths contains packed length of all fields
+ cur_row.blob_length contains total length of all blobs.
+
+ RETURN
+ 0 ok
+ # Error code
+*/
+
+int _ma_read_block_record2(MARIA_HA *info, uchar *record,
+ uchar *data, uchar *end_of_data)
+{
+ MARIA_SHARE *share= info->s;
+ uchar *field_length_data, *blob_buffer, *start_of_data;
+ uint flag, null_bytes, cur_null_bytes, row_extents, field_lengths;
+ my_bool found_blob= 0;
+ MARIA_EXTENT_CURSOR extent;
+ MARIA_COLUMNDEF *column, *end_column;
+ MARIA_ROW *cur_row= &info->cur_row;
+ DBUG_ENTER("_ma_read_block_record2");
+
+ LINT_INIT(field_length_data);
+ LINT_INIT(blob_buffer);
+
+ start_of_data= data;
+ flag= (uint) (uchar) data[0];
+ cur_null_bytes= share->base.original_null_bytes;
+ null_bytes= share->base.null_bytes;
+ cur_row->head_length= (uint) (end_of_data - data);
+ cur_row->full_page_count= cur_row->tail_count= 0;
+ cur_row->blob_length= 0;
+
+ /* Skip trans header (for now, until we have MVCC csupport) */
+ data+= total_header_size[(flag & PRECALC_HEADER_BITMASK)];
+ if (flag & ROW_FLAG_NULLS_EXTENDED)
+ cur_null_bytes+= data[-1];
+
+ row_extents= 0;
+ if (flag & ROW_FLAG_EXTENTS)
+ {
+ uint row_extent_size;
+ /*
+ Record is split over many data pages.
+ Get number of extents and first extent
+ */
+ get_key_length(row_extents, data);
+ cur_row->extents_count= row_extents;
+ row_extent_size= row_extents * ROW_EXTENT_SIZE;
+ if (cur_row->extents_buffer_length < row_extent_size &&
+ _ma_alloc_buffer(&cur_row->extents,
+ &cur_row->extents_buffer_length,
+ row_extent_size))
+ DBUG_RETURN(my_errno);
+ memcpy(cur_row->extents, data, ROW_EXTENT_SIZE);
+ data+= ROW_EXTENT_SIZE;
+ init_extent(&extent, cur_row->extents, row_extents,
+ cur_row->tail_positions);
+ }
+ else
+ {
+ cur_row->extents_count= 0;
+ (*cur_row->tail_positions)= 0;
+ extent.page_count= 0;
+ extent.extent_count= 1;
+ }
+ extent.first_extent= 1;
+
+ field_lengths= 0;
+ if (share->base.max_field_lengths)
+ {
+ get_key_length(field_lengths, data);
+ cur_row->field_lengths_length= field_lengths;
+#ifdef SANITY_CHECKS
+ if (field_lengths > share->base.max_field_lengths)
+ goto err;
+#endif
+ }
+
+ if (share->calc_checksum)
+ cur_row->checksum= (uint) (uchar) *data++;
+ /* data now points on null bits */
+ memcpy(record, data, cur_null_bytes);
+ if (unlikely(cur_null_bytes != null_bytes))
+ {
+ /*
+ This only happens if we have added more NULL columns with
+ ALTER TABLE and are fetching an old, not yet modified old row
+ */
+ bzero(record + cur_null_bytes, (uint) (null_bytes - cur_null_bytes));
+ }
+ data+= null_bytes;
+ /* We copy the empty bits to be able to use them for delete/update */
+ memcpy(cur_row->empty_bits, data, share->base.pack_bytes);
+ data+= share->base.pack_bytes;
+
+ /* TODO: Use field offsets, instead of just skipping them */
+ data+= share->base.field_offsets * FIELD_OFFSET_SIZE;
+
+ /*
+ Read row extents (note that first extent was already read into
+ cur_row->extents above)
+ */
+ if (row_extents > 1)
+ {
+ if (read_long_data(info, cur_row->extents + ROW_EXTENT_SIZE,
+ (row_extents - 1) * ROW_EXTENT_SIZE,
+ &extent, &data, &end_of_data))
+ DBUG_RETURN(my_errno);
+ }
+
+ /*
+ Data now points to start of fixed length field data that can't be null
+ or 'empty'. Note that these fields can't be split over blocks.
+ */
+ for (column= share->columndef,
+ end_column= column + share->base.fixed_not_null_fields;
+ column < end_column; column++)
+ {
+ uint column_length= column->length;
+ if (data >= end_of_data &&
+ !(data= read_next_extent(info, &extent, &end_of_data)))
+ goto err;
+ memcpy(record + column->offset, data, column_length);
+ data+= column_length;
+ }
+
+ /* Read array of field lengths. This may be stored in several extents */
+ if (field_lengths)
+ {
+ field_length_data= cur_row->field_lengths;
+ if (read_long_data(info, field_length_data, field_lengths, &extent,
+ &data, &end_of_data))
+ DBUG_RETURN(my_errno);
+ }
+
+ /* Read variable length data. Each of these may be split over many extents */
+ for (end_column= share->columndef + share->base.fields;
+ column < end_column; column++)
+ {
+ enum en_fieldtype type= column->type;
+ uchar *field_pos= record + column->offset;
+ /* First check if field is present in record */
+ if ((record[column->null_pos] & column->null_bit) ||
+ (cur_row->empty_bits[column->empty_pos] & column->empty_bit))
+ {
+ bfill(record + column->offset, column->fill_length,
+ type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
+ continue;
+ }
+ switch (type) {
+ case FIELD_NORMAL: /* Fixed length field */
+ case FIELD_SKIP_PRESPACE:
+ case FIELD_SKIP_ZERO: /* Fixed length field */
+ if (data >= end_of_data &&
+ !(data= read_next_extent(info, &extent, &end_of_data)))
+ goto err;
+ memcpy(field_pos, data, column->length);
+ data+= column->length;
+ break;
+ case FIELD_SKIP_ENDSPACE: /* CHAR */
+ {
+ /* Char that is space filled */
+ uint length;
+ if (column->length <= 255)
+ length= (uint) (uchar) *field_length_data++;
+ else
+ {
+ length= uint2korr(field_length_data);
+ field_length_data+= 2;
+ }
+#ifdef SANITY_CHECKS
+ if (length > column->length)
+ goto err;
+#endif
+ if (read_long_data(info, field_pos, length, &extent, &data,
+ &end_of_data))
+ DBUG_RETURN(my_errno);
+ bfill(field_pos + length, column->length - length, ' ');
+ break;
+ }
+ case FIELD_VARCHAR:
+ {
+ ulong length;
+ if (column->length <= 256)
+ {
+ length= (uint) (uchar) (*field_pos++= *field_length_data++);
+ }
+ else
+ {
+ length= uint2korr(field_length_data);
+ field_pos[0]= field_length_data[0];
+ field_pos[1]= field_length_data[1];
+ field_pos+= 2;
+ field_length_data+= 2;
+ }
+ if (read_long_data(info, field_pos, length, &extent, &data,
+ &end_of_data))
+ DBUG_RETURN(my_errno);
+ break;
+ }
+ case FIELD_BLOB:
+ {
+ uint size_length= column->length - portable_sizeof_char_ptr;
+ ulong blob_length= _ma_calc_blob_length(size_length, field_length_data);
+
+ if (!found_blob)
+ {
+ /* Calculate total length for all blobs */
+ ulong blob_lengths= 0;
+ uchar *length_data= field_length_data;
+ MARIA_COLUMNDEF *blob_field= column;
+
+ found_blob= 1;
+ for (; blob_field < end_column; blob_field++)
+ {
+ uint size_length;
+ if ((record[blob_field->null_pos] & blob_field->null_bit) ||
+ (cur_row->empty_bits[blob_field->empty_pos] &
+ blob_field->empty_bit))
+ continue;
+ size_length= blob_field->length - portable_sizeof_char_ptr;
+ blob_lengths+= _ma_calc_blob_length(size_length, length_data);
+ length_data+= size_length;
+ }
+ cur_row->blob_length= blob_lengths;
+ DBUG_PRINT("info", ("Total blob length: %lu", blob_lengths));
+ if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
+ blob_lengths))
+ DBUG_RETURN(my_errno);
+ blob_buffer= info->rec_buff;
+ }
+
+ memcpy(field_pos, field_length_data, size_length);
+ memcpy_fixed(field_pos + size_length, (uchar *) & blob_buffer,
+ sizeof(char*));
+ field_length_data+= size_length;
+
+ /*
+ After we have read one extent, then each blob is in it's own extent
+ */
+ if (extent.first_extent && (ulong) (end_of_data - data) < blob_length)
+ end_of_data= data; /* Force read of next extent */
+
+ if (read_long_data(info, blob_buffer, blob_length, &extent, &data,
+ &end_of_data))
+ DBUG_RETURN(my_errno);
+ blob_buffer+= blob_length;
+ break;
+ }
+ default:
+#ifdef EXTRA_DEBUG
+ DBUG_ASSERT(0); /* purecov: deadcode */
+#endif
+ goto err;
+ }
+ continue;
+ }
+
+ if (row_extents)
+ {
+ DBUG_PRINT("info", ("Row read: page_count: %u extent_count: %u",
+ extent.page_count, extent.extent_count));
+ *extent.tail_positions= 0; /* End marker */
+ if (extent.page_count)
+ goto err;
+ if (extent.extent_count > 1)
+ if (check_if_zero(extent.extent + ROW_EXTENT_SIZE,
+ (extent.extent_count-1) * ROW_EXTENT_SIZE))
+ goto err;
+ }
+ else
+ {
+ DBUG_PRINT("info", ("Row read"));
+ /*
+ data should normally point to end_of_date. The only exception is if
+ the row is very short in which case we allocated 'min_row_length' data
+ for allowing the row to expand.
+ */
+ if (data != end_of_data && (uint) (end_of_data - start_of_data) >
+ info->s->base.min_row_length)
+ goto err;
+ }
+
+ info->update|= HA_STATE_AKTIV; /* We have an active record */
+ DBUG_RETURN(0);
+
+err:
+ /* Something was wrong with data on record */
+ DBUG_PRINT("error", ("Found record with wrong data"));
+ DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD));
+}
+
+
+/** @brief Read positions to tail blocks and full blocks
+
+ @fn read_row_extent_info()
+ @param info Handler
+
+ @notes
+ This function is a simpler version of _ma_read_block_record2()
+ The data about the used pages is stored in info->cur_row.
+
+ @return Status
+ @retval 0 ok
+ @retval 1 Error. my_errno contains error number
+*/
+
+static my_bool read_row_extent_info(MARIA_HA *info, uchar *buff,
+ uint record_number)
+{
+ MARIA_SHARE *share= info->s;
+ uchar *data, *end_of_data;
+ uint flag, row_extents, field_lengths;
+ MARIA_EXTENT_CURSOR extent;
+ DBUG_ENTER("read_row_extent_info");
+
+ if (!(data= get_record_position(buff, share->block_size,
+ record_number, &end_of_data)))
+ DBUG_RETURN(1); /* Wrong in record */
+
+ flag= (uint) (uchar) data[0];
+ /* Skip trans header */
+ data+= total_header_size[(flag & PRECALC_HEADER_BITMASK)];
+
+ row_extents= 0;
+ if (flag & ROW_FLAG_EXTENTS)
+ {
+ uint row_extent_size;
+ /*
+ Record is split over many data pages.
+ Get number of extents and first extent
+ */
+ get_key_length(row_extents, data);
+ row_extent_size= row_extents * ROW_EXTENT_SIZE;
+ if (info->cur_row.extents_buffer_length < row_extent_size &&
+ _ma_alloc_buffer(&info->cur_row.extents,
+ &info->cur_row.extents_buffer_length,
+ row_extent_size))
+ DBUG_RETURN(1);
+ memcpy(info->cur_row.extents, data, ROW_EXTENT_SIZE);
+ data+= ROW_EXTENT_SIZE;
+ init_extent(&extent, info->cur_row.extents, row_extents,
+ info->cur_row.tail_positions);
+ extent.first_extent= 1;
+ }
+ else
+ (*info->cur_row.tail_positions)= 0;
+ info->cur_row.extents_count= row_extents;
+
+ if (share->base.max_field_lengths)
+ get_key_length(field_lengths, data);
+
+ if (share->calc_checksum)
+ info->cur_row.checksum= (uint) (uchar) *data++;
+ if (row_extents > 1)
+ {
+ MARIA_RECORD_POS *tail_pos;
+ uchar *extents, *end;
+
+ data+= share->base.null_bytes;
+ data+= share->base.pack_bytes;
+ data+= share->base.field_offsets * FIELD_OFFSET_SIZE;
+
+ /*
+ Read row extents (note that first extent was already read into
+ info->cur_row.extents above)
+ Lock tails with write lock as we will delete them later.
+ */
+ extent.lock_for_tail_pages= PAGECACHE_LOCK_LEFT_WRITELOCKED;
+ if (read_long_data(info, info->cur_row.extents + ROW_EXTENT_SIZE,
+ (row_extents - 1) * ROW_EXTENT_SIZE,
+ &extent, &data, &end_of_data))
+ DBUG_RETURN(1);
+
+ /* Update tail_positions with pointer to tails */
+ tail_pos= info->cur_row.tail_positions;
+ for (extents= info->cur_row.extents, end= extents+ row_extents;
+ extents < end;
+ extents += ROW_EXTENT_SIZE)
+ {
+ ulonglong page= uint5korr(extents);
+ uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
+ if (page_count & TAIL_BIT)
+ *(tail_pos++)= ma_recordpos(page, (page_count & ~TAIL_BIT));
+ }
+ *tail_pos= 0; /* End marker */
+ }
+ DBUG_RETURN(0);
+}
+
+
+
+/*
+ Read a record based on record position
+
+ @fn _ma_read_block_record()
+ @param info Maria handler
+ @param record Store record here
+ @param record_pos Record position
+
+ @return Status
+ @retval 0 ok
+ @retval # Error number
+*/
+
+int _ma_read_block_record(MARIA_HA *info, uchar *record,
+ MARIA_RECORD_POS record_pos)
+{
+ uchar *data, *end_of_data, *buff;
+ uint offset;
+ uint block_size= info->s->block_size;
+ DBUG_ENTER("_ma_read_block_record");
+ DBUG_PRINT("enter", ("rowid: %lu", (long) record_pos));
+
+ offset= ma_recordpos_to_dir_entry(record_pos);
+
+ DBUG_ASSERT(info->s->pagecache->block_size == block_size);
+ if (!(buff= pagecache_read(info->s->pagecache,
+ &info->dfile, ma_recordpos_to_page(record_pos), 0,
+ info->buff, info->s->page_type,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+ DBUG_RETURN(my_errno);
+ DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == HEAD_PAGE);
+ if (!(data= get_record_position(buff, block_size, offset, &end_of_data)))
+ {
+ DBUG_PRINT("error", ("Wrong directory entry in data block"));
+ my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */
+ DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
+ }
+ DBUG_RETURN(_ma_read_block_record2(info, record, data, end_of_data));
+}
+
+
+/* compare unique constraint between stored rows */
+
+my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+ const uchar *record, MARIA_RECORD_POS pos)
+{
+ uchar *org_rec_buff, *old_record;
+ size_t org_rec_buff_size;
+ int error;
+ DBUG_ENTER("_ma_cmp_block_unique");
+
+ if (!(old_record= my_alloca(info->s->base.reclength)))
+ DBUG_RETURN(1);
+
+ /* Don't let the compare destroy blobs that may be in use */
+ org_rec_buff= info->rec_buff;
+ org_rec_buff_size= info->rec_buff_size;
+ if (info->s->base.blobs)
+ {
+ /* Force realloc of record buffer*/
+ info->rec_buff= 0;
+ info->rec_buff_size= 0;
+ }
+ error= _ma_read_block_record(info, old_record, pos);
+ if (!error)
+ error= _ma_unique_comp(def, record, old_record, def->null_are_equal);
+ if (info->s->base.blobs)
+ {
+ my_free(info->rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+ info->rec_buff= org_rec_buff;
+ info->rec_buff_size= org_rec_buff_size;
+ }
+ DBUG_PRINT("exit", ("result: %d", error));
+ my_afree(old_record);
+ DBUG_RETURN(error != 0);
+}
+
+
+/****************************************************************************
+ Table scan
+****************************************************************************/
+
+/*
+ Allocate buffers for table scan
+
+ SYNOPSIS
+ _ma_scan_init_block_record(MARIA_HA *info)
+
+ IMPLEMENTATION
+ We allocate one buffer for the current bitmap and one buffer for the
+ current page
+
+ RETURN
+ 0 ok
+ 1 error (couldn't allocate memory or disk error)
+*/
+
+my_bool _ma_scan_init_block_record(MARIA_HA *info)
+{
+ DBUG_ENTER("_ma_scan_init_block_record");
+ /*
+ bitmap_buff may already be allocated if this is the second call to
+ rnd_init() without a rnd_end() in between, see sql/handler.h
+ */
+ if (!(info->scan.bitmap_buff ||
+ ((info->scan.bitmap_buff=
+ (uchar *) my_malloc(info->s->block_size * 2, MYF(MY_WME))))))
+ DBUG_RETURN(1);
+ info->scan.page_buff= info->scan.bitmap_buff + info->s->block_size;
+ info->scan.bitmap_end= info->scan.bitmap_buff + info->s->bitmap.total_size;
+
+ /* Set scan variables to get _ma_scan_block() to start with reading bitmap */
+ info->scan.number_of_rows= 0;
+ info->scan.bitmap_pos= info->scan.bitmap_end;
+ info->scan.bitmap_page= (ulong) - (long) info->s->bitmap.pages_covered;
+ /*
+ We have to flush bitmap as we will read the bitmap from the page cache
+ while scanning rows
+ */
+ DBUG_RETURN(_ma_flush_bitmap(info->s));
+}
+
+
+/* Free buffers allocated by _ma_scan_block_init() */
+
+void _ma_scan_end_block_record(MARIA_HA *info)
+{
+ DBUG_ENTER("_ma_scan_end_block_record");
+ my_free(info->scan.bitmap_buff, MYF(MY_ALLOW_ZERO_PTR));
+ info->scan.bitmap_buff= 0;
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Read next record while scanning table
+
+ SYNOPSIS
+ _ma_scan_block_record()
+ info Maria handler
+ record Store found here
+ record_pos Value stored in info->cur_row.next_pos after last call
+ skip_deleted
+
+ NOTES
+ - One must have called mi_scan() before this
+ - In this version, we don't actually need record_pos, we as easily
+ use a variable in info->scan
+
+ IMPLEMENTATION
+ Current code uses a lot of goto's to separate the different kind of
+ states we may be in. This gives us a minimum of executed if's for
+ the normal cases. I tried several different ways to code this, but
+ the current one was in the end the most readable and fastest.
+
+ RETURN
+ 0 ok
+ # Error code
+*/
+
+int _ma_scan_block_record(MARIA_HA *info, uchar *record,
+ MARIA_RECORD_POS record_pos,
+ my_bool skip_deleted __attribute__ ((unused)))
+{
+ uint block_size;
+ my_off_t filepos;
+ MARIA_SHARE *share= info->s;
+ DBUG_ENTER("_ma_scan_block_record");
+
+restart_record_read:
+ /* Find next row in current page */
+ if (likely(record_pos < info->scan.number_of_rows))
+ {
+ uint length, offset;
+ uchar *data, *end_of_data;
+
+ while (!(offset= uint2korr(info->scan.dir)))
+ {
+ info->scan.dir-= DIR_ENTRY_SIZE;
+ record_pos++;
+#ifdef SANITY_CHECKS
+ if (info->scan.dir < info->scan.dir_end)
+ goto err;
+#endif
+ }
+ /* found row */
+ info->cur_row.lastpos= info->scan.row_base_page + record_pos;
+ info->cur_row.nextpos= record_pos + 1;
+ data= info->scan.page_buff + offset;
+ length= uint2korr(info->scan.dir + 2);
+ end_of_data= data + length;
+ info->scan.dir-= DIR_ENTRY_SIZE; /* Point to previous row */
+#ifdef SANITY_CHECKS
+ if (end_of_data > info->scan.dir_end ||
+ offset < PAGE_HEADER_SIZE || length < share->base.min_block_length)
+ goto err;
+#endif
+ DBUG_PRINT("info", ("rowid: %lu", (ulong) info->cur_row.lastpos));
+ DBUG_RETURN(_ma_read_block_record2(info, record, data, end_of_data));
+ }
+
+ /* Find next head page in current bitmap */
+restart_bitmap_scan:
+ block_size= share->block_size;
+ if (likely(info->scan.bitmap_pos < info->scan.bitmap_end))
+ {
+ uchar *data= info->scan.bitmap_pos;
+ longlong bits= info->scan.bits;
+ uint bit_pos= info->scan.bit_pos;
+
+ do
+ {
+ while (likely(bits))
+ {
+ uint pattern= bits & 7;
+ bits >>= 3;
+ bit_pos++;
+ if (pattern > 0 && pattern <= 4)
+ {
+ /* Found head page; Read it */
+ ulong page;
+ info->scan.bitmap_pos= data;
+ info->scan.bits= bits;
+ info->scan.bit_pos= bit_pos;
+ page= (info->scan.bitmap_page + 1 +
+ (data - info->scan.bitmap_buff) / 6 * 16 + bit_pos - 1);
+ info->scan.row_base_page= ma_recordpos(page, 0);
+ if (!(pagecache_read(share->pagecache,
+ &info->dfile,
+ page, 0, info->scan.page_buff,
+ share->page_type,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+ DBUG_RETURN(my_errno);
+ if (((info->scan.page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) !=
+ HEAD_PAGE) ||
+ (info->scan.number_of_rows=
+ (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) == 0)
+ {
+ DBUG_PRINT("error", ("Wrong page header"));
+ DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD));
+ }
+ DBUG_PRINT("info", ("Page %lu has %u rows",
+ (ulong) page, info->scan.number_of_rows));
+ info->scan.dir= (info->scan.page_buff + block_size -
+ PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE);
+ info->scan.dir_end= (info->scan.dir -
+ (info->scan.number_of_rows - 1) *
+ DIR_ENTRY_SIZE);
+ record_pos= 0;
+ goto restart_record_read;
+ }
+ }
+ for (data+= 6; data < info->scan.bitmap_end; data+= 6)
+ {
+ bits= uint6korr(data);
+ /* Skip not allocated pages and blob / full tail pages */
+ if (bits && bits != LL(07777777777777777))
+ break;
+ }
+ bit_pos= 0;
+ } while (data < info->scan.bitmap_end);
+ }
+
+ /* Read next bitmap */
+ info->scan.bitmap_page+= share->bitmap.pages_covered;
+ filepos= (my_off_t) info->scan.bitmap_page * block_size;
+ if (unlikely(filepos >= info->state->data_file_length))
+ {
+ DBUG_PRINT("info", ("Found end of file"));
+ DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
+ }
+ DBUG_PRINT("info", ("Reading bitmap at %lu",
+ (ulong) info->scan.bitmap_page));
+ if (!(pagecache_read(share->pagecache, &info->dfile,
+ info->scan.bitmap_page,
+ 0, info->scan.bitmap_buff, PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+ DBUG_RETURN(my_errno);
+ /* Skip scanning 'bits' in bitmap scan code */
+ info->scan.bitmap_pos= info->scan.bitmap_buff - 6;
+ info->scan.bits= 0;
+ goto restart_bitmap_scan;
+
+err:
+ DBUG_PRINT("error", ("Wrong data on page"));
+ DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD));
+}
+
+
+/*
+ Compare a row against a stored one
+
+ NOTES
+ Not implemented, as block record is not supposed to be used in a shared
+ global environment
+*/
+
+my_bool _ma_compare_block_record(MARIA_HA *info __attribute__ ((unused)),
+ const uchar *record __attribute__ ((unused)))
+{
+ return 0;
+}
+
+
+#ifndef DBUG_OFF
+
+static void _ma_print_directory(uchar *buff, uint block_size)
+{
+ uint max_entry= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET], row= 0;
+ uint end_of_prev_row= PAGE_HEADER_SIZE;
+ uchar *dir, *end;
+
+ dir= buff + block_size - DIR_ENTRY_SIZE * max_entry - PAGE_SUFFIX_SIZE;
+ end= buff + block_size - DIR_ENTRY_SIZE - PAGE_SUFFIX_SIZE;
+
+ DBUG_LOCK_FILE;
+ fprintf(DBUG_FILE,"Directory dump (pos:length):\n");
+
+ for (row= 1; dir <= end ; end-= DIR_ENTRY_SIZE, row++)
+ {
+ uint offset= uint2korr(end);
+ uint length= uint2korr(end+2);
+ fprintf(DBUG_FILE, " %4u:%4u", offset, offset ? length : 0);
+ if (!(row % (80/12)))
+ fputc('\n', DBUG_FILE);
+ if (offset)
+ {
+ DBUG_ASSERT(offset >= end_of_prev_row);
+ end_of_prev_row= offset + length;
+ }
+ }
+ fputc('\n', DBUG_FILE);
+ fflush(DBUG_FILE);
+ DBUG_UNLOCK_FILE;
+}
+#endif /* DBUG_OFF */
+
+
+/*
+ Store an integer with simple packing
+
+ SYNOPSIS
+ ma_store_integer()
+ to Store the packed integer here
+ nr Integer to store
+
+ NOTES
+ This is mostly used to store field numbers and lengths of strings.
+ We have to cast the result for the LL() becasue of a bug in Forte CC
+ compiler.
+
+ Packing used is:
+ nr < 251 is stored as is (in 1 byte)
+ Numbers that require 1-4 bytes are stored as char(250+byte_length), data
+ Bigger numbers are stored as 255, data as ulonglong (not yet done).
+
+ RETURN
+ Position in 'to' after the packed length
+*/
+
+uchar *ma_store_length(uchar *to, ulong nr)
+{
+ if (nr < 251)
+ {
+ *to=(uchar) nr;
+ return to+1;
+ }
+ if (nr < 65536)
+ {
+ if (nr <= 255)
+ {
+ to[0]= (uchar) 251;
+ to[1]= (uchar) nr;
+ return to+2;
+ }
+ to[0]= (uchar) 252;
+ int2store(to+1, nr);
+ return to+3;
+ }
+ if (nr < 16777216)
+ {
+ *to++= (uchar) 253;
+ int3store(to, nr);
+ return to+3;
+ }
+ *to++= (uchar) 254;
+ int4store(to, nr);
+ return to+4;
+}
+
+
+/* Calculate how many bytes needed to store a number */
+
+uint ma_calc_length_for_store_length(ulong nr)
+{
+ if (nr < 251)
+ return 1;
+ if (nr < 65536)
+ {
+ if (nr <= 255)
+ return 2;
+ return 3;
+ }
+ if (nr < 16777216)
+ return 4;
+ return 5;
+}
+
+
+/* Retrive a stored number */
+
+static ulong ma_get_length(uchar **packet)
+{
+ reg1 uchar *pos= *packet;
+ if (*pos < 251)
+ {
+ (*packet)++;
+ return (ulong) *pos;
+ }
+ if (*pos == 251)
+ {
+ (*packet)+= 2;
+ return (ulong) pos[1];
+ }
+ if (*pos == 252)
+ {
+ (*packet)+= 3;
+ return (ulong) uint2korr(pos+1);
+ }
+ if (*pos == 253)
+ {
+ (*packet)+= 4;
+ return (ulong) uint3korr(pos+1);
+ }
+ DBUG_ASSERT(*pos == 254);
+ (*packet)+= 5;
+ return (ulong) uint4korr(pos+1);
+}
+
+
+/*
+ Fill array with pointers to field parts to be stored in log for insert
+
+ SYNOPSIS
+ fill_insert_undo_parts()
+ info Maria handler
+ record Inserted row
+ log_parts Store pointers to changed memory areas here
+ log_parts_count See RETURN
+
+ NOTES
+ We have information in info->cur_row about the read row.
+
+ RETURN
+ length of data in log_parts.
+ log_parts_count contains number of used log_parts
+*/
+
+static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
+ LEX_STRING *log_parts,
+ uint *log_parts_count)
+{
+ MARIA_SHARE *share= info->s;
+ MARIA_COLUMNDEF *column, *end_column;
+ uchar *field_lengths= info->cur_row.field_lengths;
+ size_t row_length;
+ MARIA_ROW *cur_row= &info->cur_row;
+ LEX_STRING *start_log_parts;
+ DBUG_ENTER("fill_insert_undo_parts");
+
+ start_log_parts= log_parts;
+
+ /* Store null bits */
+ log_parts->str= (char*) record;
+ log_parts->length= share->base.null_bytes;
+ row_length= log_parts->length;
+ log_parts++;
+
+ /* Stored bitmap over packed (zero length or all-zero fields) */
+ log_parts->str= info->cur_row.empty_bits;
+ log_parts->length= share->base.pack_bytes;
+ row_length+= log_parts->length;
+ log_parts++;
+
+ if (share->base.max_field_lengths)
+ {
+ /* Store length of all not empty char, varchar and blob fields */
+ log_parts->str= field_lengths-2;
+ log_parts->length= info->cur_row.field_lengths_length+2;
+ int2store(log_parts->str, info->cur_row.field_lengths_length);
+ row_length+= log_parts->length;
+ log_parts++;
+ }
+
+ if (share->base.blobs)
+ {
+ /* Store total blob length to make buffer allocation easier during undo */
+ log_parts->str= info->length_buff;
+ log_parts->length= (uint) (ma_store_length(log_parts->str,
+ info->cur_row.blob_length) -
+ (uchar*) log_parts->str);
+ row_length+= log_parts->length;
+ log_parts++;
+ }
+
+ /* Handle constant length fields that are always present */
+ for (column= share->columndef,
+ end_column= column+ share->base.fixed_not_null_fields;
+ column < end_column;
+ column++)
+ {
+ log_parts->str= (char*) record + column->offset;
+ log_parts->length= column->length;
+ row_length+= log_parts->length;
+ log_parts++;
+ }
+
+ /* Handle NULL fields and CHAR/VARCHAR fields */
+ for (end_column= share->columndef + share->base.fields - share->base.blobs;
+ column < end_column;
+ column++)
+ {
+ const uchar *column_pos;
+ size_t column_length;
+ if ((record[column->null_pos] & column->null_bit) ||
+ cur_row->empty_bits[column->empty_pos] & column->empty_bit)
+ continue;
+
+ column_pos= record+ column->offset;
+ column_length= column->length;
+
+ switch (column->type) {
+ case FIELD_CHECK:
+ case FIELD_NORMAL: /* Fixed length field */
+ case FIELD_ZERO:
+ case FIELD_SKIP_PRESPACE: /* Not packed */
+ case FIELD_SKIP_ZERO: /* Fixed length field */
+ break;
+ case FIELD_SKIP_ENDSPACE: /* CHAR */
+ {
+ if (column->length <= 255)
+ column_length= *field_lengths++;
+ else
+ {
+ column_length= uint2korr(field_lengths);
+ field_lengths+= 2;
+ }
+ break;
+ }
+ case FIELD_VARCHAR:
+ {
+ if (column->fill_length == 1)
+ column_length= *field_lengths;
+ else
+ column_length= uint2korr(field_lengths);
+ field_lengths+= column->fill_length;
+ column_pos+= column->fill_length;
+ break;
+ }
+ default:
+ DBUG_ASSERT(0);
+ }
+ log_parts->str= (char*) column_pos;
+ log_parts->length= column_length;
+ row_length+= log_parts->length;
+ log_parts++;
+ }
+
+ /* Add blobs */
+ for (end_column+= share->base.blobs; column < end_column; column++)
+ {
+ const uchar *field_pos= record + column->offset;
+ uint size_length= column->length - portable_sizeof_char_ptr;
+ ulong blob_length= _ma_calc_blob_length(size_length, field_pos);
+
+ /*
+ We don't have to check for null, as blob_length is guranteed to be 0
+ if the blob is null
+ */
+ if (blob_length)
+ {
+ char *blob_pos;
+ memcpy_fixed((uchar*) &blob_pos, record + column->offset + size_length,
+ sizeof(blob_pos));
+ log_parts->str= blob_pos;
+ log_parts->length= blob_length;
+ row_length+= log_parts->length;
+ log_parts++;
+ }
+ }
+ *log_parts_count= (log_parts - start_log_parts);
+ DBUG_RETURN(row_length);
+}
+
+
+/*
+ Fill array with pointers to field parts to be stored in log for update
+
+ SYNOPSIS
+ fill_update_undo_parts()
+ info Maria handler
+ oldrec Original row
+ newrec New row
+ log_parts Store pointers to changed memory areas here
+ log_parts_count See RETURN
+
+ IMPLEMENTATION
+ Format of undo record:
+
+ Fields are stored in same order as the field array.
+
+ Offset to changed field data (packed)
+
+ For each changed field
+ Fieldnumber (packed)
+ Length, if variable length field (packed)
+
+ For each changed field
+ Data
+
+ Packing is using ma_store_integer()
+
+ The reason we store field numbers & length separated from data (ie, not
+ after each other) is to get better cpu caching when we loop over
+ fields (as we probably don't have to access data for each field when we
+ want to read and old row through the undo log record).
+
+ As a special case, we use '255' for the field number of the null bitmap.
+
+ RETURN
+ length of data in log_parts.
+ log_parts_count contains number of used log_parts
+*/
+
+static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
+ const uchar *newrec,
+ LEX_STRING *log_parts,
+ uint *log_parts_count)
+{
+ MARIA_SHARE *share= info->s;
+ MARIA_COLUMNDEF *column, *end_column;
+ MARIA_ROW *old_row= &info->cur_row, *new_row= &info->new_row;
+ uchar *field_data, *start_field_data;
+ uchar *old_field_lengths= old_row->field_lengths;
+ uchar *new_field_lengths= new_row->field_lengths;
+ size_t row_length= 0;
+ uint field_lengths;
+ LEX_STRING *start_log_parts;
+ my_bool new_column_is_empty;
+ DBUG_ENTER("fill_update_undo_parts");
+
+ start_log_parts= log_parts;
+
+ /*
+ First log part is for number of fields, field numbers and lengths
+ The +4 is to reserve place for the number of changed fields.
+ */
+ start_field_data= field_data= info->update_field_data + 4;
+ log_parts++;
+
+ if (memcmp(oldrec, newrec, share->base.null_bytes))
+ {
+ /* Store changed null bits */
+ *field_data++= (uchar) 255; /* Special case */
+ log_parts->str= (char*) oldrec;
+ log_parts->length= share->base.null_bytes;
+ row_length= log_parts->length;
+ log_parts++;
+ }
+
+ /* Handle constant length fields */
+ for (column= share->columndef,
+ end_column= column+ share->base.fixed_not_null_fields;
+ column < end_column;
+ column++)
+ {
+ if (memcmp(oldrec + column->offset, newrec + column->offset,
+ column->length))
+ {
+ field_data= ma_store_length(field_data,
+ (uint) (column - share->columndef));
+ log_parts->str= (char*) oldrec + column->offset;
+ log_parts->length= column->length;
+ row_length+= column->length;
+ log_parts++;
+ }
+ }
+
+ /* Handle the rest: NULL fields and CHAR/VARCHAR fields and BLOB's */
+ for (end_column= share->columndef + share->base.fields;
+ column < end_column;
+ column++)
+ {
+ const uchar *new_column_pos, *old_column_pos;
+ size_t new_column_length, old_column_length;
+
+ /* First check if old column is null or empty */
+ if (oldrec[column->null_pos] & column->null_bit)
+ {
+ /*
+ It's safe to skip this one as either the new column is also null
+ (no change) or the new_column is not null, in which case the null-bit
+ maps differed and we have already stored the null bitmap.
+ */
+ continue;
+ }
+ if (old_row->empty_bits[column->empty_pos] & column->empty_bit)
+ {
+ if (new_row->empty_bits[column->empty_pos] & column->empty_bit)
+ continue; /* Both are empty; skip */
+
+ /* Store null length column */
+ field_data= ma_store_length(field_data,
+ (uint) (column - share->columndef));
+ field_data= ma_store_length(field_data, 0);
+ continue;
+ }
+ /*
+ Remember if the 'new' value is empty (as in this case we must always
+ log the original value
+ */
+ new_column_is_empty= ((newrec[column->null_pos] & column->null_bit) ||
+ (new_row->empty_bits[column->empty_pos] &
+ column->empty_bit));
+
+ old_column_pos= oldrec + column->offset;
+ new_column_pos= newrec + column->offset;
+ old_column_length= new_column_length= column->length;
+
+ switch (column->type) {
+ case FIELD_CHECK:
+ case FIELD_NORMAL: /* Fixed length field */
+ case FIELD_ZERO:
+ case FIELD_SKIP_PRESPACE: /* Not packed */
+ case FIELD_SKIP_ZERO: /* Fixed length field */
+ break;
+ case FIELD_VARCHAR:
+ new_column_length--; /* Skip length prefix */
+ old_column_pos+= column->fill_length;
+ new_column_pos+= column->fill_length;
+ /* Fall through */
+ case FIELD_SKIP_ENDSPACE: /* CHAR */
+ {
+ if (new_column_length <= 255)
+ {
+ old_column_length= *old_field_lengths++;
+ if (!new_column_is_empty)
+ new_column_length= *new_field_lengths++;
+ }
+ else
+ {
+ old_column_length= uint2korr(old_field_lengths);
+ old_field_lengths+= 2;
+ if (!new_column_is_empty)
+ {
+ new_column_length= uint2korr(new_field_lengths);
+ new_field_lengths+= 2;
+ }
+ }
+ break;
+ }
+ case FIELD_BLOB:
+ {
+ uint size_length= column->length - portable_sizeof_char_ptr;
+ old_column_length= _ma_calc_blob_length(size_length, old_column_pos);
+ memcpy_fixed((uchar*) &old_column_pos,
+ oldrec + column->offset + size_length,
+ sizeof(old_column_pos));
+ if (!new_column_is_empty)
+ {
+ new_column_length= _ma_calc_blob_length(size_length, new_column_pos);
+ memcpy_fixed((uchar*) &new_column_pos,
+ newrec + column->offset + size_length,
+ sizeof(old_column_pos));
+ }
+ break;
+ }
+ default:
+ DBUG_ASSERT(0);
+ }
+
+ if (new_column_is_empty || new_column_length != old_column_length ||
+ memcmp(old_column_pos, new_column_pos, new_column_length))
+ {
+ field_data= ma_store_length(field_data,
+ (uint) (column - share->columndef));
+ field_data= ma_store_length(field_data, old_column_length);
+
+ log_parts->str= (char*) old_column_pos;
+ log_parts->length= old_column_length;
+ row_length+= old_column_length;
+ log_parts++;
+ }
+ }
+
+ *log_parts_count= (log_parts - start_log_parts);
+
+ /* Store length of field length data before the field/field_lengths */
+ field_lengths= (field_data - start_field_data);
+ start_log_parts->str= ((char*)
+ (start_field_data -
+ ma_calc_length_for_store_length(field_lengths)));
+ ma_store_length(start_log_parts->str, field_lengths);
+ start_log_parts->length= (size_t) ((char*) field_data -
+ start_log_parts->str);
+ row_length+= start_log_parts->length;
+ DBUG_RETURN(row_length);
+}
+
+/***************************************************************************
+ Applying of REDO log records
+***************************************************************************/
+
+/*
+ Apply LOGREC_REDO_INSERT_ROW_HEAD & LOGREC_REDO_INSERT_ROW_TAIL
+
+ SYNOPSIS
+ _ma_apply_redo_insert_row_head_or_tail()
+ info Maria handler
+ lsn LSN to put on page
+ page_type HEAD_PAGE or TAIL_PAGE
+ header Header (without FILEID)
+ data Data to be put on page
+ data_length Length of data
+
+ RETURN
+ 0 ok
+ # Error number
+*/
+
+uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
+ uint page_type,
+ const uchar *header,
+ const uchar *data,
+ size_t data_length)
+{
+ MARIA_SHARE *share= info->s;
+ ulonglong page;
+ uint rownr, empty_space;
+ uint block_size= share->block_size;
+ uint rec_offset;
+ uchar *buff= info->keyread_buff, *dir;
+ DBUG_ENTER("_ma_apply_redo_insert_row_head_or_tail");
+
+ info->keyread_buff_used= 1;
+ page= page_korr(header);
+ rownr= dirpos_korr(header+PAGE_STORE_SIZE);
+
+ DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u data_length: %u",
+ (ulong) ma_recordpos(page, rownr),
+ (ulong) page, rownr, (uint) data_length));
+
+ if (((page + 1) * info->s->block_size) > info->state->data_file_length)
+ {
+ /*
+ New page at end of file. Note that the test above is also positive if
+ data_file_length is not a multiple of block_size (system crashed while
+ writing the last page): in this case we just extend the last page and
+ fill it entirely with zeroes, then the REDO will put correct data on
+ it.
+ */
+ DBUG_ASSERT(rownr == 0);
+ if (rownr != 0)
+ goto err;
+ make_empty_page(buff, block_size, page_type);
+ empty_space= (block_size - PAGE_OVERHEAD_SIZE);
+ rec_offset= PAGE_HEADER_SIZE;
+ dir= buff+ block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
+ }
+ else
+ {
+ uint max_entry;
+ if (!(buff= pagecache_read(share->pagecache,
+ &info->dfile,
+ page, 0,
+ buff, PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+ DBUG_RETURN(my_errno);
+ if (lsn_korr(buff) >= lsn)
+ {
+ /* Already applied */
+
+ /* Fix bitmap, just in case */
+ empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+ if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
+ DBUG_RETURN(my_errno);
+ DBUG_RETURN(0);
+ }
+
+ max_entry= (uint) ((uchar*) buff)[DIR_COUNT_OFFSET];
+ if (((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != page_type))
+ {
+ /*
+ This is a page that has been freed before and now should be
+ changed to new type.
+ */
+ if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != BLOB_PAGE &&
+ (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != UNALLOCATED_PAGE)
+ goto err;
+ make_empty_page(buff, block_size, page_type);
+ empty_space= (block_size - PAGE_OVERHEAD_SIZE);
+ rec_offset= PAGE_HEADER_SIZE;
+ dir= buff+ block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
+ }
+ else
+ {
+ dir= (buff + block_size - DIR_ENTRY_SIZE * (rownr + 1) -
+ PAGE_SUFFIX_SIZE);
+ empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+
+ if (max_entry <= rownr)
+ {
+ /* Add directory entry first in directory and data last on page */
+ DBUG_ASSERT(max_entry == rownr);
+ if (max_entry != rownr)
+ goto err;
+ rec_offset= (uint2korr(dir + DIR_ENTRY_SIZE) +
+ uint2korr(dir + DIR_ENTRY_SIZE +2));
+ if ((uint) (dir - buff) < rec_offset + data_length)
+ {
+ /* Create place for directory & data */
+ compact_page(buff, block_size, max_entry - 1, 0);
+ rec_offset= (uint2korr(dir + DIR_ENTRY_SIZE) +
+ uint2korr(dir + DIR_ENTRY_SIZE +2));
+ empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+ DBUG_ASSERT(!((uint) (dir - buff) < rec_offset + data_length));
+ if ((uint) (dir - buff) < rec_offset + data_length)
+ goto err;
+ }
+ buff[DIR_COUNT_OFFSET]= (uchar) max_entry+1;
+ int2store(dir, rec_offset);
+ empty_space-= DIR_ENTRY_SIZE;
+ }
+ else
+ {
+ uint length;
+ /*
+ Reuse old entry. This is empty if the command was an insert and
+ possible used if the command was an update.
+ */
+ if (extend_area_on_page(buff, dir, rownr, block_size,
+ data_length, &empty_space,
+ &rec_offset, &length))
+ goto err;
+ }
+ }
+ }
+ /* Copy data */
+ int2store(dir+2, data_length);
+ memcpy(buff + rec_offset, data, data_length);
+ empty_space-= data_length;
+ int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
+
+ /* Write modified page */
+ lsn_store(buff, lsn);
+ if (pagecache_write(share->pagecache,
+ &info->dfile, page, 0,
+ buff, PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY, 0))
+ DBUG_RETURN(my_errno);
+
+ /* Fix bitmap */
+ if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
+ DBUG_RETURN(my_errno);
+
+ /*
+ Data page and bitmap page are in place, we can update data_file_length in
+ case we extended the file. We could not do it earlier: bitmap code tests
+ data_file_length to know if it has to create a new page or not.
+ */
+ {
+ my_off_t end_of_page= (page + 1) * info->s->block_size;
+ set_if_bigger(info->state->data_file_length, end_of_page);
+ }
+
+ DBUG_RETURN(0);
+
+err:
+ DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
+}
+
+
+/*
+ Apply LOGREC_REDO_PURGE_ROW_HEAD & LOGREC_REDO_PURGE_ROW_TAIL
+
+ SYNOPSIS
+ _ma_apply_redo_purge_row_head_or_tail()
+ info Maria handler
+ lsn LSN to put on page
+ page_type HEAD_PAGE or TAIL_PAGE
+ header Header (without FILEID)
+
+ NOTES
+ This function is very similar to delete_head_or_tail()
+
+ RETURN
+ 0 ok
+ # Error number
+*/
+
+uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn,
+ uint page_type,
+ const uchar *header)
+{
+ MARIA_SHARE *share= info->s;
+ ulonglong page;
+ uint rownr, empty_space;
+ uint block_size= share->block_size;
+ uchar *buff= info->keyread_buff;
+ DBUG_ENTER("_ma_apply_redo_purge_row_head_or_tail");
+
+ page= page_korr(header);
+ rownr= dirpos_korr(header+PAGE_STORE_SIZE);
+ DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
+ (ulong) ma_recordpos(page, rownr),
+ (ulong) page, rownr));
+
+ info->keyread_buff_used= 1;
+
+ if (!(buff= pagecache_read(share->pagecache,
+ &info->dfile,
+ page, 0,
+ buff, PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+ DBUG_RETURN(my_errno);
+
+ if (lsn_korr(buff) >= lsn)
+ {
+ /*
+ Already applied
+ Note that in case the page is not anymore a head or tail page
+ a future redo will fix the bitmap.
+ */
+ if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == page_type)
+ {
+ empty_space= uint2korr(buff+EMPTY_SPACE_OFFSET);
+ if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE,
+ empty_space))
+ DBUG_RETURN(my_errno);
+ }
+ DBUG_RETURN(0);
+ }
+
+ DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == (uchar) page_type);
+
+ if (delete_dir_entry(buff, block_size, rownr, &empty_space) < 0)
+ DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
+
+ lsn_store(buff, lsn);
+ if (pagecache_write(share->pagecache,
+ &info->dfile, page, 0,
+ buff, PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY, 0))
+ DBUG_RETURN(my_errno);
+
+ /* This will work even if the page was marked as UNALLOCATED_PAGE */
+ if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
+ DBUG_RETURN(my_errno);
+
+ DBUG_RETURN(0);
+}
+
+
+/**
+ @brief Apply LOGREC_REDO_PURGE_BLOCKS
+
+ @param info Maria handler
+ @param header Header (without FILEID)
+
+ @note It marks the page free in the bitmap, and sets the directory's count
+ to 0.
+
+ @return Operation status
+ @retval 0 OK
+ @retval !=0 Error
+*/
+
+uint _ma_apply_redo_purge_blocks(MARIA_HA *info,
+ LSN lsn, const uchar *header)
+{
+ MARIA_SHARE *share= info->s;
+ ulonglong page;
+ uint page_range, ranges;
+ uint res= 0;
+ uchar *buff= info->keyread_buff;
+ DBUG_ENTER("_ma_apply_redo_purge_blocks");
+
+ info->keyread_buff_used= 1;
+ ranges= pagerange_korr(header);
+ header+= PAGERANGE_STORE_SIZE;
+
+ while (ranges--)
+ {
+ uint i;
+ page= page_korr(header);
+ header+= PAGE_STORE_SIZE;
+ page_range= pagerange_korr(header);
+ header+= PAGERANGE_STORE_SIZE;
+
+ for (i= 0; i < page_range ; i++)
+ {
+ if (!(buff= pagecache_read(share->pagecache,
+ &info->dfile,
+ page+i, 0,
+ buff, PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+ DBUG_RETURN(my_errno);
+
+ if (lsn_korr(buff) >= lsn)
+ {
+ /* Already applied */
+ continue;
+ }
+ buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
+ lsn_store(buff, lsn);
+ if (pagecache_write(share->pagecache,
+ &info->dfile, page+i, 0,
+ buff, PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY, 0))
+ DBUG_RETURN(my_errno);
+ }
+ /** @todo leave bitmap lock to the bitmap code... */
+ pthread_mutex_lock(&share->bitmap.bitmap_lock);
+ res= _ma_reset_full_page_bits(info, &share->bitmap, page, page_range);
+ pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+ if (res)
+ DBUG_RETURN(res);
+ }
+ DBUG_RETURN(0);
+}
+
+/****************************************************************************
+ Applying of UNDO entries
+****************************************************************************/
+
+my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,
+ const uchar *header)
+{
+ ulonglong page;
+ uint rownr;
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+ uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + 1], *buff;
+ my_bool res= 1;
+ MARIA_PINNED_PAGE page_link;
+ LSN lsn;
+ DBUG_ENTER("_ma_apply_undo_row_insert");
+
+ page= page_korr(header);
+ rownr= dirpos_korr(header + PAGE_STORE_SIZE);
+ DBUG_PRINT("enter", ("Page: %lu rownr: %u", (ulong) page, rownr));
+
+ if (!(buff= pagecache_read(info->s->pagecache,
+ &info->dfile, page, 0,
+ info->buff, info->s->page_type,
+ PAGECACHE_LOCK_WRITE,
+ &page_link.link)))
+ DBUG_RETURN(1);
+
+
+ page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+ push_dynamic(&info->pinned_pages, (void*) &page_link);
+
+ if (read_row_extent_info(info, buff, rownr))
+ DBUG_RETURN(1);
+
+ if (delete_head_or_tail(info, page, rownr, 1, 1) ||
+ delete_tails(info, info->cur_row.tail_positions))
+ goto err;
+
+ if (info->cur_row.extents && free_full_pages(info, &info->cur_row))
+ goto err;
+
+ /* undo_lsn must be first for compression to work */
+ lsn_store(log_data, undo_lsn);
+ log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE]= LOGREC_UNDO_ROW_INSERT;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+
+ if (translog_write_record(&lsn, LOGREC_CLR_END,
+ info->trn, info, sizeof(log_data),
+ TRANSLOG_INTERNAL_PARTS + 1, log_array,
+ log_data + LSN_STORE_SIZE))
+ goto err;
+
+ res= 0;
+err:
+ _ma_unpin_all_pages_and_finalize_row(info, lsn);
+ DBUG_RETURN(res);
+}
+
+
+/* Execute undo of a row delete (insert the row back somewhere) */
+
+my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn,
+ const uchar *header, size_t length)
+{
+ uchar *record;
+ const uchar *null_bits, *field_length_data;
+ MARIA_SHARE *share= info->s;
+ MARIA_ROW row;
+ uint *null_field_lengths;
+ ulong *blob_lengths;
+ MARIA_COLUMNDEF *column, *end_column;
+ my_bool res;
+ DBUG_ENTER("_ma_apply_undo_row_delete");
+
+ /*
+ Use cur row as a base; We need to make a copy as we will change
+ some buffers to point directly to 'header'
+ */
+ memcpy(&row, &info->cur_row, sizeof(row));
+ null_field_lengths= row.null_field_lengths;
+ blob_lengths= row.blob_lengths;
+
+ /*
+ Fill in info->cur_row with information about the row, like in
+ calc_record_size(), to be used by write_block_record()
+ */
+
+ row.normal_length= row.char_length= row.varchar_length=
+ row.blob_length= row.extents_count= row.field_lengths_length= 0;
+
+ null_bits= header;
+ header+= share->base.null_bytes;
+ row.empty_bits= (uchar*) header;
+ header+= share->base.pack_bytes;
+ if (share->base.max_field_lengths)
+ {
+ row.field_lengths_length= uint2korr(header);
+ row.field_lengths= (uchar*) header + 2 ;
+ header+= 2 + row.field_lengths_length;
+ }
+ if (share->base.blobs)
+ row.blob_length= ma_get_length((uchar**) &header);
+
+ /* We need to build up a record (without blobs) in rec_buff */
+ if (!(record= my_malloc(share->base.reclength, MYF(MY_WME))))
+ DBUG_RETURN(1);
+
+ memcpy(record, null_bits, share->base.null_bytes);
+
+ /* Copy field information from header to record */
+
+ /* Handle constant length fields that are always present */
+ for (column= share->columndef,
+ end_column= column+ share->base.fixed_not_null_fields;
+ column < end_column;
+ column++)
+ {
+ memcpy(record + column->offset, header, column->length);
+ header+= column->length;
+ }
+
+ /* Handle NULL fields and CHAR/VARCHAR fields */
+ field_length_data= row.field_lengths;
+ for (end_column= share->columndef + share->base.fields;
+ column < end_column;
+ column++, null_field_lengths++)
+ {
+ if ((record[column->null_pos] & column->null_bit) ||
+ row.empty_bits[column->empty_pos] & column->empty_bit)
+ {
+ if (column->type != FIELD_BLOB)
+ *null_field_lengths= 0;
+ else
+ *blob_lengths++= 0;
+ if (share->calc_checksum)
+ bfill(record + column->offset, column->fill_length,
+ column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
+ continue;
+ }
+ switch (column->type) {
+ case FIELD_CHECK:
+ case FIELD_NORMAL: /* Fixed length field */
+ case FIELD_ZERO:
+ case FIELD_SKIP_PRESPACE: /* Not packed */
+ case FIELD_SKIP_ZERO: /* Fixed length field */
+ row.normal_length+= column->length;
+ *null_field_lengths= column->length;
+ memcpy(record + column->offset, header, column->length);
+ header+= column->length;
+ break;
+ case FIELD_SKIP_ENDSPACE: /* CHAR */
+ if (column->length <= 255)
+ length= (uint) *field_length_data++;
+ else
+ {
+ length= uint2korr(field_length_data);
+ field_length_data+= 2;
+ }
+ row.char_length+= length;
+ *null_field_lengths= length;
+ memcpy(record + column->offset, header, length);
+ if (share->calc_checksum)
+ bfill(record + column->offset + length, (column->length - length),
+ ' ');
+ header+= length;
+ break;
+ case FIELD_VARCHAR:
+ {
+ uint length;
+ uchar *field_pos= record + column->offset;
+
+ /* 256 is correct as this includes the length uchar */
+ if (column->fill_length == 1)
+ {
+ field_pos[0]= *field_length_data;
+ length= (uint) *field_length_data;
+ }
+ else
+ {
+ field_pos[0]= field_length_data[0];
+ field_pos[1]= field_length_data[1];
+ length= uint2korr(field_length_data);
+ }
+ field_length_data+= column->fill_length;
+ field_pos+= column->fill_length;
+ row.varchar_length+= length;
+ *null_field_lengths= length;
+ memcpy(field_pos, header, length);
+ header+= length;
+ break;
+ }
+ case FIELD_BLOB:
+ {
+ /* Copy length of blob and pointer to blob data to record */
+ uchar *field_pos= record + column->offset;
+ uint size_length= column->length - portable_sizeof_char_ptr;
+ ulong blob_length= _ma_calc_blob_length(size_length, field_length_data);
+
+ memcpy(field_pos, field_length_data, size_length);
+ field_length_data+= size_length;
+ memcpy(field_pos + size_length, &header, sizeof(&header));
+ header+= blob_length;
+ *blob_lengths++= blob_length;
+ row.blob_length+= blob_length;
+ break;
+ }
+ default:
+ DBUG_ASSERT(0);
+ }
+ }
+ row.head_length= (row.base_length +
+ share->base.fixed_not_null_fields_length +
+ row.field_lengths_length +
+ size_to_store_key_length(row.field_lengths_length) +
+ row.normal_length +
+ row.char_length + row.varchar_length);
+ row.total_length= (row.head_length + row.blob_length);
+ if (row.total_length < share->base.min_row_length)
+ row.total_length= share->base.min_row_length;
+
+ /* Row is now up to date. Time to insert the record */
+
+ res= allocate_and_write_block_record(info, record, &row, undo_lsn);
+ my_free(record, MYF(0));
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Execute undo of a row update
+
+ @fn _ma_apply_undo_row_update()
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn,
+ const uchar *header,
+ size_t header_length __attribute__((unused)))
+{
+ ulonglong page;
+ uint rownr, field_length_header;
+ MARIA_SHARE *share= info->s;
+ const uchar *field_length_data, *field_length_data_end;
+ uchar *current_record, *orig_record;
+ int error= 1;
+ MARIA_RECORD_POS record_pos;
+ DBUG_ENTER("_ma_apply_undo_row_update");
+
+ page= page_korr(header);
+ rownr= dirpos_korr(header + PAGE_STORE_SIZE);
+ record_pos= ma_recordpos(page, rownr);
+ DBUG_PRINT("enter", ("Page: %lu rownr: %u", (ulong) page, rownr));
+
+ /*
+ Set header to point to old field values, generated by
+ fill_update_undo_parts()
+ */
+ header+= PAGE_STORE_SIZE + DIRPOS_STORE_SIZE;
+ field_length_header= ma_get_length((uchar**) &header);
+ field_length_data= header;
+ header+= field_length_header;
+ field_length_data_end= header;
+
+ /* Allocate buffer for current row & original row */
+ if (!(current_record= my_malloc(share->base.reclength * 2, MYF(MY_WME))))
+ DBUG_RETURN(1);
+ orig_record= current_record+ share->base.reclength;
+
+ /* Read current record */
+ if (_ma_read_block_record(info, current_record, record_pos))
+ goto err;
+
+ if (*field_length_data == 255)
+ {
+ /* Bitmap changed */
+ field_length_data++;
+ memcpy(orig_record, header, share->base.null_bytes);
+ header+= share->base.null_bytes;
+ }
+ else
+ memcpy(orig_record, current_record, share->base.null_bytes);
+ bitmap_clear_all(&info->changed_fields);
+
+ while (field_length_data < field_length_data_end)
+ {
+ uint field_nr= ma_get_length((uchar**) &field_length_data), field_length;
+ MARIA_COLUMNDEF *column= share->columndef + field_nr;
+ uchar *orig_field_pos= orig_record + column->offset;
+
+ bitmap_set_bit(&info->changed_fields, field_nr);
+ if (field_nr >= share->base.fixed_not_null_fields)
+ {
+ if (!(field_length= ma_get_length((uchar**) &field_length_data)))
+ {
+ /* Null field or empty field */
+ bfill(orig_field_pos, column->fill_length,
+ column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
+ continue;
+ }
+ }
+ else
+ field_length= column->length;
+
+ switch (column->type) {
+ case FIELD_CHECK:
+ case FIELD_NORMAL: /* Fixed length field */
+ case FIELD_ZERO:
+ case FIELD_SKIP_PRESPACE: /* Not packed */
+ memcpy(orig_field_pos, header, column->length);
+ header+= column->length;
+ break;
+ case FIELD_SKIP_ZERO: /* Number */
+ case FIELD_SKIP_ENDSPACE: /* CHAR */
+ {
+ uint diff;
+ memcpy(orig_field_pos, header, field_length);
+ if ((diff= (column->length - field_length)))
+ bfill(orig_field_pos + column->length - diff, diff,
+ column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
+ header+= field_length;
+ }
+ break;
+ case FIELD_VARCHAR:
+ if (column->length <= 256)
+ {
+ *orig_field_pos++= (uchar) field_length;
+ }
+ else
+ {
+ int2store(orig_field_pos, field_length);
+ orig_field_pos+= 2;
+ }
+ memcpy(orig_field_pos, header, field_length);
+ header+= field_length;
+ break;
+ case FIELD_BLOB:
+ {
+ uint size_length= column->length - portable_sizeof_char_ptr;
+ _ma_store_blob_length(orig_field_pos, size_length, field_length);
+ memcpy_fixed(orig_field_pos + size_length, &header, sizeof(header));
+ header+= field_length;
+ break;
+ }
+ default:
+ DBUG_ASSERT(0);
+ }
+ }
+ copy_not_changed_fields(info, &info->changed_fields,
+ orig_record, current_record);
+
+ if (share->calc_checksum)
+ {
+ info->cur_row.checksum= (*share->calc_checksum)(info, orig_record);
+ info->state->checksum+= (info->cur_row.checksum -
+ (*share->calc_checksum)(info, current_record));
+ }
+
+ /*
+ Now records are up to date, execute the update to original values
+ */
+ if (_ma_update_block_record2(info, record_pos, current_record, orig_record,
+ undo_lsn))
+ goto err;
+
+ error= 0;
+err:
+ my_free(current_record, MYF(0));
+ DBUG_RETURN(error);
+}
diff --git a/storage/maria/ma_blockrec.h b/storage/maria/ma_blockrec.h
new file mode 100644
index 00000000000..30dffe1c0c0
--- /dev/null
+++ b/storage/maria/ma_blockrec.h
@@ -0,0 +1,195 @@
+/* Copyright (C) 2007 Michael Widenius
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Storage of records in block
+*/
+
+#define LSN_SIZE 7
+#define DIR_COUNT_SIZE 1 /* Stores number of rows on page */
+#define EMPTY_SPACE_SIZE 2 /* Stores empty space on page */
+#define PAGE_TYPE_SIZE 1
+#define PAGE_SUFFIX_SIZE 0 /* Bytes for page suffix */
+#define PAGE_HEADER_SIZE (LSN_SIZE + DIR_COUNT_SIZE + EMPTY_SPACE_SIZE +\
+ PAGE_TYPE_SIZE)
+#define PAGE_OVERHEAD_SIZE (PAGE_HEADER_SIZE + DIR_ENTRY_SIZE + \
+ PAGE_SUFFIX_SIZE)
+#define BLOCK_RECORD_POINTER_SIZE 6
+
+#define FULL_PAGE_SIZE(block_size) ((block_size) - LSN_SIZE - PAGE_TYPE_SIZE)
+
+#define ROW_EXTENT_PAGE_SIZE 5
+#define ROW_EXTENT_COUNT_SIZE 2
+#define ROW_EXTENT_SIZE (ROW_EXTENT_PAGE_SIZE + ROW_EXTENT_COUNT_SIZE)
+#define TAIL_BIT 0x8000 /* Bit in page_count to signify tail */
+/* Number of extents reserved MARIA_BITMAP_BLOCKS to store head part */
+#define ELEMENTS_RESERVED_FOR_MAIN_PART 4
+/* Fields before 'row->null_field_lengths' used by find_where_to_split_row */
+#define EXTRA_LENGTH_FIELDS 3
+
+/* Size for the different parts in the row header (and head page) */
+
+#define FLAG_SIZE 1
+#define TRANSID_SIZE 6
+#define VERPTR_SIZE 7
+#define DIR_ENTRY_SIZE 4
+#define FIELD_OFFSET_SIZE 2 /* size of pointers to field starts */
+
+/* Minimum header size needed for a new row */
+#define BASE_ROW_HEADER_SIZE FLAG_SIZE
+#define TRANS_ROW_EXTRA_HEADER_SIZE TRANSID_SIZE
+
+#define PAGE_TYPE_MASK 127
+enum en_page_type { UNALLOCATED_PAGE, HEAD_PAGE, TAIL_PAGE, BLOB_PAGE, MAX_PAGE_TYPE };
+
+#define PAGE_TYPE_OFFSET LSN_SIZE
+#define DIR_COUNT_OFFSET LSN_SIZE+PAGE_TYPE_SIZE
+#define EMPTY_SPACE_OFFSET (DIR_COUNT_OFFSET + DIR_COUNT_SIZE)
+
+#define PAGE_CAN_BE_COMPACTED 128 /* Bit in PAGE_TYPE */
+
+/* Bits used for flag uchar (one byte, first in record) */
+#define ROW_FLAG_TRANSID 1
+#define ROW_FLAG_VER_PTR 2
+#define ROW_FLAG_DELETE_TRANSID 4
+#define ROW_FLAG_NULLS_EXTENDED 8
+#define ROW_FLAG_EXTENTS 128
+#define ROW_FLAG_ALL (1+2+4+8+128)
+
+/******** Variables that affects how data pages are utilized ********/
+
+/* Minium size of tail segment */
+#define MIN_TAIL_SIZE 32
+
+/*
+ Fixed length part of Max possible header size; See row data structure
+ table in ma_blockrec.c.
+*/
+#define MAX_FIXED_HEADER_SIZE (FLAG_SIZE + 3 + ROW_EXTENT_SIZE + 3)
+#define TRANS_MAX_FIXED_HEADER_SIZE (MAX_FIXED_HEADER_SIZE + \
+ TRANSID_SIZE + VERPTR_SIZE + \
+ TRANSID_SIZE)
+
+/* We use 1 uchar in record header to store number of directory entries */
+#define MAX_ROWS_PER_PAGE 255
+
+/* Bits for MARIA_BITMAP_BLOCKS->used */
+/* We stored data on disk in the block */
+#define BLOCKUSED_USED 1
+/* Bitmap on disk is block->org_bitmap_value ; Happens only on update */
+#define BLOCKUSED_USE_ORG_BITMAP 2
+/* We stored tail data on disk for the block */
+#define BLOCKUSED_TAIL 4
+
+/******* defines that affects allocation (density) of data *******/
+
+/*
+ If the tail part (from the main block or a blob) would use more than 75 % of
+ the size of page, store the tail on a full page instead of a shared
+ tail page.
+*/
+#define MAX_TAIL_SIZE(block_size) ((block_size) *3 / 4)
+
+/* Don't allocate memory for too many row extents on the stack */
+#define ROW_EXTENTS_ON_STACK 32
+
+/* Functions to convert MARIA_RECORD_POS to/from page:offset */
+
+static inline MARIA_RECORD_POS ma_recordpos(ulonglong page, uint dir_entry)
+{
+ DBUG_ASSERT(dir_entry <= 255);
+ return (MARIA_RECORD_POS) ((page << 8) | dir_entry);
+}
+
+static inline my_off_t ma_recordpos_to_page(MARIA_RECORD_POS record_pos)
+{
+ return record_pos >> 8;
+}
+
+static inline uint ma_recordpos_to_dir_entry(MARIA_RECORD_POS record_pos)
+{
+ return (uint) (record_pos & 255);
+}
+
+/* ma_blockrec.c */
+void _ma_init_block_record_data(void);
+my_bool _ma_once_init_block_record(MARIA_SHARE *share, File dfile);
+my_bool _ma_once_end_block_record(MARIA_SHARE *share);
+my_bool _ma_init_block_record(MARIA_HA *info);
+void _ma_end_block_record(MARIA_HA *info);
+
+my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS pos,
+ const uchar *oldrec, const uchar *newrec);
+my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record);
+int _ma_read_block_record(MARIA_HA *info, uchar *record,
+ MARIA_RECORD_POS record_pos);
+int _ma_read_block_record2(MARIA_HA *info, uchar *record,
+ uchar *data, uchar *end_of_data);
+int _ma_scan_block_record(MARIA_HA *info, uchar *record,
+ MARIA_RECORD_POS, my_bool);
+my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+ const uchar *record, MARIA_RECORD_POS pos);
+my_bool _ma_scan_init_block_record(MARIA_HA *info);
+void _ma_scan_end_block_record(MARIA_HA *info);
+
+MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info,
+ const uchar *record);
+my_bool _ma_write_block_record(MARIA_HA *info, const uchar *record);
+my_bool _ma_write_abort_block_record(MARIA_HA *info);
+my_bool _ma_compare_block_record(register MARIA_HA *info,
+ register const uchar *record);
+
+/* ma_bitmap.c */
+my_bool _ma_bitmap_init(MARIA_SHARE *share, File file);
+my_bool _ma_bitmap_end(MARIA_SHARE *share);
+my_bool _ma_flush_bitmap(MARIA_SHARE *share);
+my_bool _ma_bitmap_find_place(MARIA_HA *info, MARIA_ROW *row,
+ MARIA_BITMAP_BLOCKS *result_blocks);
+my_bool _ma_bitmap_release_unused(MARIA_HA *info, MARIA_BITMAP_BLOCKS *blocks);
+my_bool _ma_bitmap_free_full_pages(MARIA_HA *info, const uchar *extents,
+ uint count);
+my_bool _ma_bitmap_set(MARIA_HA *info, ulonglong pos, my_bool head,
+ uint empty_space);
+my_bool _ma_reset_full_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap,
+ ulonglong page, uint page_count);
+uint _ma_free_size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size);
+my_bool _ma_bitmap_find_new_place(MARIA_HA *info, MARIA_ROW *new_row,
+ ulonglong page, uint free_size,
+ MARIA_BITMAP_BLOCKS *result_blocks);
+my_bool _ma_check_bitmap_data(MARIA_HA *info,
+ enum en_page_type page_type, ulonglong page,
+ uint empty_space, uint *bitmap_pattern);
+my_bool _ma_check_if_right_bitmap_type(MARIA_HA *info,
+ enum en_page_type page_type,
+ ulonglong page,
+ uint *bitmap_pattern);
+void _ma_bitmap_delete_all(MARIA_SHARE *share);
+int _ma_bitmap_create_first(MARIA_SHARE *share);
+uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
+ uint page_type,
+ const uchar *header,
+ const uchar *data,
+ size_t data_length);
+uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn,
+ uint page_type,
+ const uchar *header);
+uint _ma_apply_redo_purge_blocks(MARIA_HA *info, LSN lsn,
+ const uchar *header);
+my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,
+ const uchar *header);
+my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn,
+ const uchar *header, size_t length);
+my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn,
+ const uchar *header, size_t length);
diff --git a/storage/maria/ma_cache.c b/storage/maria/ma_cache.c
new file mode 100644
index 00000000000..6b1f9ec3fae
--- /dev/null
+++ b/storage/maria/ma_cache.c
@@ -0,0 +1,107 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Functions for read record cacheing with maria
+ Used for reading dynamic/compressed records from datafile.
+
+ Can fetch data directly from file (outside cache),
+ if reading a small chunk straight before the cached part (with possible
+ overlap).
+
+ Can be explicitly asked not to use cache (by not setting READING_NEXT in
+ flag) - useful for occasional out-of-cache reads, when the next read is
+ expected to hit the cache again.
+
+ Allows "partial read" errors in the record header (when READING_HEADER flag
+ is set) - unread part is bzero'ed
+
+ Note: out-of-cache reads are enabled for shared IO_CACHE's too,
+ as these reads will be cached by OS cache (and my_pread is always atomic)
+*/
+
+
+#include "maria_def.h"
+
+int _ma_read_cache(IO_CACHE *info, uchar *buff, my_off_t pos, uint length,
+ int flag)
+{
+ uint read_length,in_buff_length;
+ my_off_t offset;
+ uchar *in_buff_pos;
+ DBUG_ENTER("_ma_read_cache");
+
+ if (pos < info->pos_in_file)
+ {
+ read_length=length;
+ if ((my_off_t) read_length > (my_off_t) (info->pos_in_file-pos))
+ read_length=(uint) (info->pos_in_file-pos);
+ info->seek_not_done=1;
+ if (my_pread(info->file,buff,read_length,pos,MYF(MY_NABP)))
+ DBUG_RETURN(1);
+ if (!(length-=read_length))
+ DBUG_RETURN(0);
+ pos+=read_length;
+ buff+=read_length;
+ }
+ if (pos >= info->pos_in_file &&
+ (offset= (my_off_t) (pos - info->pos_in_file)) <
+ (my_off_t) (info->read_end - info->request_pos))
+ {
+ in_buff_pos=info->request_pos+(uint) offset;
+ in_buff_length= min(length,(size_t) (info->read_end-in_buff_pos));
+ memcpy(buff,info->request_pos+(uint) offset,(size_t) in_buff_length);
+ if (!(length-=in_buff_length))
+ DBUG_RETURN(0);
+ pos+=in_buff_length;
+ buff+=in_buff_length;
+ }
+ else
+ in_buff_length=0;
+ if (flag & READING_NEXT)
+ {
+ if (pos != (info->pos_in_file +
+ (uint) (info->read_end - info->request_pos)))
+ {
+ info->pos_in_file=pos; /* Force start here */
+ info->read_pos=info->read_end=info->request_pos; /* Everything used */
+ info->seek_not_done=1;
+ }
+ else
+ info->read_pos=info->read_end; /* All block used */
+ if (!(*info->read_function)(info,buff,length))
+ DBUG_RETURN(0);
+ read_length=info->error;
+ }
+ else
+ {
+ info->seek_not_done=1;
+ if ((read_length=my_pread(info->file,buff,length,pos,MYF(0))) == length)
+ DBUG_RETURN(0);
+ }
+ if (!(flag & READING_HEADER) || (int) read_length == -1 ||
+ read_length+in_buff_length < 3)
+ {
+ DBUG_PRINT("error",
+ ("Error %d reading next-multi-part block (Got %d bytes)",
+ my_errno, (int) read_length));
+ if (!my_errno || my_errno == -1)
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+ DBUG_RETURN(1);
+ }
+ bzero(buff+read_length,MARIA_BLOCK_INFO_HEADER_LENGTH - in_buff_length -
+ read_length);
+ DBUG_RETURN(0);
+} /* _ma_read_cache */
diff --git a/storage/maria/ma_changed.c b/storage/maria/ma_changed.c
new file mode 100644
index 00000000000..4d0964581f6
--- /dev/null
+++ b/storage/maria/ma_changed.c
@@ -0,0 +1,33 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Check if somebody has changed table since last check. */
+
+#include "maria_def.h"
+
+ /* Return 0 if table isn't changed */
+
+int maria_is_changed(MARIA_HA *info)
+{
+ int result;
+ DBUG_ENTER("maria_is_changed");
+ if (fast_ma_readinfo(info))
+ DBUG_RETURN(-1);
+ VOID(_ma_writeinfo(info,0));
+ result=(int) info->data_changed;
+ info->data_changed=0;
+ DBUG_PRINT("exit",("result: %d",result));
+ DBUG_RETURN(result);
+}
diff --git a/storage/maria/ma_check.c b/storage/maria/ma_check.c
new file mode 100644
index 00000000000..fa1c812daf7
--- /dev/null
+++ b/storage/maria/ma_check.c
@@ -0,0 +1,5633 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Describe, check and repair of MARIA tables */
+
+/*
+ About checksum calculation.
+
+ There are two types of checksums. Table checksum and row checksum.
+
+ Row checksum is an additional uchar at the end of dynamic length
+ records. It must be calculated if the table is configured for them.
+ Otherwise they must not be used. The variable
+ MYISAM_SHARE::calc_checksum determines if row checksums are used.
+ MI_INFO::checksum is used as temporary storage during row handling.
+ For parallel repair we must assure that only one thread can use this
+ variable. There is no problem on the write side as this is done by one
+ thread only. But when checking a record after read this could go
+ wrong. But since all threads read through a common read buffer, it is
+ sufficient if only one thread checks it.
+
+ Table checksum is an eight uchar value in the header of the index file.
+ It can be calculated even if row checksums are not used. The variable
+ MI_CHECK::glob_crc is calculated over all records.
+ MI_SORT_PARAM::calc_checksum determines if this should be done. This
+ variable is not part of MI_CHECK because it must be set per thread for
+ parallel repair. The global glob_crc must be changed by one thread
+ only. And it is sufficient to calculate the checksum once only.
+*/
+
+#include "ma_ftdefs.h"
+#include <myisamchk.h>
+#include <stdarg.h>
+#include <my_getopt.h>
+#ifdef HAVE_SYS_VADVISE_H
+#include <sys/vadvise.h>
+#endif
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+#include "ma_rt_index.h"
+#include "ma_blockrec.h"
+#include "trnman_public.h"
+
+/* Functions defined in this file */
+
+static int check_k_link(HA_CHECK *param, MARIA_HA *info, my_off_t next_link);
+static int chk_index(HA_CHECK *param, MARIA_HA *info,MARIA_KEYDEF *keyinfo,
+ my_off_t page, uchar *buff, ha_rows *keys,
+ ha_checksum *key_checksum, uint level);
+static uint isam_key_length(MARIA_HA *info,MARIA_KEYDEF *keyinfo);
+static ha_checksum calc_checksum(ha_rows count);
+static int writekeys(MARIA_SORT_PARAM *sort_param);
+static int sort_one_index(HA_CHECK *param, MARIA_HA *info,
+ MARIA_KEYDEF *keyinfo,
+ my_off_t pagepos, File new_file);
+static int sort_key_read(MARIA_SORT_PARAM *sort_param, uchar *key);
+static int sort_maria_ft_key_read(MARIA_SORT_PARAM *sort_param, uchar *key);
+static int sort_get_next_record(MARIA_SORT_PARAM *sort_param);
+static int sort_key_cmp(MARIA_SORT_PARAM *sort_param, const void *a,
+ const void *b);
+static int sort_maria_ft_key_write(MARIA_SORT_PARAM *sort_param,
+ const uchar *a);
+static int sort_key_write(MARIA_SORT_PARAM *sort_param, const uchar *a);
+static my_off_t get_record_for_key(MARIA_HA *info,MARIA_KEYDEF *keyinfo,
+ const uchar *key);
+static int sort_insert_key(MARIA_SORT_PARAM *sort_param,
+ reg1 SORT_KEY_BLOCKS *key_block,
+ const uchar *key, my_off_t prev_block);
+static int sort_delete_record(MARIA_SORT_PARAM *sort_param);
+/*static int _ma_flush_pending_blocks(HA_CHECK *param);*/
+static SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks,
+ uint buffer_length);
+static ha_checksum maria_byte_checksum(const uchar *buf, uint length);
+static void set_data_file_type(MARIA_SORT_INFO *sort_info, MARIA_SHARE *share);
+static void restore_data_file_type(MARIA_SHARE *share);
+static void change_data_file_descriptor(MARIA_HA *info, File new_file);
+static int _ma_safe_scan_block_record(MARIA_SORT_INFO *sort_info,
+ MARIA_HA *info, uchar *record);
+static void copy_data_file_state(MARIA_STATE_INFO *to,
+ MARIA_STATE_INFO *from);
+static int write_log_record_for_repair(const HA_CHECK *param, MARIA_HA *info);
+
+
+void maria_chk_init(HA_CHECK *param)
+{
+ bzero((uchar*) param,sizeof(*param));
+ param->opt_follow_links=1;
+ param->keys_in_use= ~(ulonglong) 0;
+ param->search_after_block=HA_OFFSET_ERROR;
+ param->auto_increment_value= 0;
+ param->use_buffers=USE_BUFFER_INIT;
+ param->read_buffer_length=READ_BUFFER_INIT;
+ param->write_buffer_length=READ_BUFFER_INIT;
+ param->sort_buffer_length=SORT_BUFFER_INIT;
+ param->sort_key_blocks=BUFFERS_WHEN_SORTING;
+ param->tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
+ param->myf_rw=MYF(MY_NABP | MY_WME | MY_WAIT_IF_FULL);
+ param->start_check_pos=0;
+ param->max_record_length= LONGLONG_MAX;
+ param->pagecache_block_size= KEY_CACHE_BLOCK_SIZE;
+ param->stats_method= MI_STATS_METHOD_NULLS_NOT_EQUAL;
+}
+
+ /* Check the status flags for the table */
+
+int maria_chk_status(HA_CHECK *param, register MARIA_HA *info)
+{
+ MARIA_SHARE *share=info->s;
+
+ if (maria_is_crashed_on_repair(info))
+ _ma_check_print_warning(param,
+ "Table is marked as crashed and last repair failed");
+ else if (maria_is_crashed(info))
+ _ma_check_print_warning(param,
+ "Table is marked as crashed");
+ if (share->state.open_count != (uint) (info->s->global_changed ? 1 : 0))
+ {
+ /* Don't count this as a real warning, as check can correct this ! */
+ uint save=param->warning_printed;
+ _ma_check_print_warning(param,
+ share->state.open_count==1 ?
+ "%d client is using or hasn't closed the table properly" :
+ "%d clients are using or haven't closed the table properly",
+ share->state.open_count);
+ /* If this will be fixed by the check, forget the warning */
+ if (param->testflag & T_UPDATE_STATE)
+ param->warning_printed=save;
+ }
+ return 0;
+}
+
+/*
+ Check delete links in row data
+*/
+
+int maria_chk_del(HA_CHECK *param, register MARIA_HA *info, uint test_flag)
+{
+ reg2 ha_rows i;
+ uint delete_link_length;
+ my_off_t empty,next_link,old_link;
+ char buff[22],buff2[22];
+ DBUG_ENTER("maria_chk_del");
+
+ LINT_INIT(old_link);
+
+ if (info->s->data_file_type == BLOCK_RECORD)
+ DBUG_RETURN(0); /* No delete links here */
+
+ param->record_checksum=0;
+ delete_link_length=((info->s->options & HA_OPTION_PACK_RECORD) ? 20 :
+ info->s->rec_reflength+1);
+
+ if (!(test_flag & T_SILENT))
+ puts("- check record delete-chain");
+
+ next_link=info->s->state.dellink;
+ if (info->state->del == 0)
+ {
+ if (test_flag & T_VERBOSE)
+ {
+ puts("No recordlinks");
+ }
+ }
+ else
+ {
+ if (test_flag & T_VERBOSE)
+ printf("Recordlinks: ");
+ empty=0;
+ for (i= info->state->del ; i > 0L && next_link != HA_OFFSET_ERROR ; i--)
+ {
+ if (*_ma_killed_ptr(param))
+ DBUG_RETURN(1);
+ if (test_flag & T_VERBOSE)
+ printf(" %9s",llstr(next_link,buff));
+ if (next_link >= info->state->data_file_length)
+ goto wrong;
+ if (my_pread(info->dfile.file, (char*) buff, delete_link_length,
+ next_link,MYF(MY_NABP)))
+ {
+ if (test_flag & T_VERBOSE) puts("");
+ _ma_check_print_error(param,"Can't read delete-link at filepos: %s",
+ llstr(next_link,buff));
+ DBUG_RETURN(1);
+ }
+ if (*buff != '\0')
+ {
+ if (test_flag & T_VERBOSE) puts("");
+ _ma_check_print_error(param,"Record at pos: %s is not remove-marked",
+ llstr(next_link,buff));
+ goto wrong;
+ }
+ if (info->s->options & HA_OPTION_PACK_RECORD)
+ {
+ my_off_t prev_link=mi_sizekorr(buff+12);
+ if (empty && prev_link != old_link)
+ {
+ if (test_flag & T_VERBOSE) puts("");
+ _ma_check_print_error(param,"Deleted block at %s doesn't point back at previous delete link",llstr(next_link,buff2));
+ goto wrong;
+ }
+ old_link=next_link;
+ next_link=mi_sizekorr(buff+4);
+ empty+=mi_uint3korr(buff+1);
+ }
+ else
+ {
+ param->record_checksum+=(ha_checksum) next_link;
+ next_link= _ma_rec_pos(info->s, buff+1);
+ empty+=info->s->base.pack_reclength;
+ }
+ }
+ if (test_flag & T_VERBOSE)
+ puts("\n");
+ if (empty != info->state->empty)
+ {
+ _ma_check_print_warning(param,
+ "Found %s deleted space in delete link chain. Should be %s",
+ llstr(empty,buff2),
+ llstr(info->state->empty,buff));
+ }
+ if (next_link != HA_OFFSET_ERROR)
+ {
+ _ma_check_print_error(param,
+ "Found more than the expected %s deleted rows in delete link chain",
+ llstr(info->state->del, buff));
+ goto wrong;
+ }
+ if (i != 0)
+ {
+ _ma_check_print_error(param,
+ "Found %s deleted rows in delete link chain. Should be %s",
+ llstr(info->state->del - i, buff2),
+ llstr(info->state->del, buff));
+ goto wrong;
+ }
+ }
+ DBUG_RETURN(0);
+
+wrong:
+ param->testflag|=T_RETRY_WITHOUT_QUICK;
+ if (test_flag & T_VERBOSE) puts("");
+ _ma_check_print_error(param,"record delete-link-chain corrupted");
+ DBUG_RETURN(1);
+} /* maria_chk_del */
+
+
+ /* Check delete links in index file */
+
+static int check_k_link(HA_CHECK *param, register MARIA_HA *info,
+ my_off_t next_link)
+{
+ uint block_size= info->s->block_size;
+ ha_rows records;
+ char llbuff[21], llbuff2[21], *buff;
+ DBUG_ENTER("check_k_link");
+
+ records= (ha_rows) (info->state->key_file_length / block_size);
+ while (next_link != HA_OFFSET_ERROR && records > 0)
+ {
+ if (*_ma_killed_ptr(param))
+ DBUG_RETURN(1);
+ if (param->testflag & T_VERBOSE)
+ printf("%16s",llstr(next_link,llbuff));
+
+ /* Key blocks must lay within the key file length entirely. */
+ if (next_link + block_size > info->state->key_file_length)
+ {
+ /* purecov: begin tested */
+ _ma_check_print_error(param, "Invalid key block position: %s "
+ "key block size: %u file_length: %s",
+ llstr(next_link, llbuff), block_size,
+ llstr(info->state->key_file_length, llbuff2));
+ DBUG_RETURN(1);
+ /* purecov: end */
+ }
+
+ /* Key blocks must be aligned at block_size */
+ if (next_link & (block_size -1))
+ {
+ /* purecov: begin tested */
+ _ma_check_print_error(param, "Mis-aligned key block: %s "
+ "minimum key block length: %u",
+ llstr(next_link, llbuff),
+ block_size);
+ DBUG_RETURN(1);
+ /* purecov: end */
+ }
+
+ DBUG_ASSERT(info->s->pagecache->block_size == block_size);
+ if (!(buff= pagecache_read(info->s->pagecache,
+ &info->s->kfile, next_link/block_size,
+ DFLT_INIT_HITS,
+ (uchar*) info->buff,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+ {
+ /* purecov: begin tested */
+ _ma_check_print_error(param, "key cache read error for block: %s",
+ llstr(next_link,llbuff));
+ DBUG_RETURN(1);
+ /* purecov: end */
+ }
+ next_link=mi_sizekorr(buff);
+ records--;
+ param->key_file_blocks+=block_size;
+ }
+ if (param->testflag & T_VERBOSE)
+ {
+ if (next_link != HA_OFFSET_ERROR)
+ printf("%16s\n",llstr(next_link,llbuff));
+ else
+ puts("");
+ }
+ DBUG_RETURN (next_link != HA_OFFSET_ERROR);
+} /* check_k_link */
+
+
+ /* Check sizes of files */
+
+int maria_chk_size(HA_CHECK *param, register MARIA_HA *info)
+{
+ int error;
+ register my_off_t skr,size;
+ char buff[22],buff2[22];
+ DBUG_ENTER("maria_chk_size");
+
+ if (!(param->testflag & T_SILENT))
+ puts("- check file-size");
+
+ /*
+ The following is needed if called externally (not from maria_chk).
+ To get a correct physical size we need to flush them.
+ */
+ if ((error= _ma_flush_table_files(info,
+ MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+ FLUSH_FORCE_WRITE, FLUSH_FORCE_WRITE)))
+ _ma_check_print_error(param, "Failed to flush data or index file");
+
+ size= my_seek(info->s->kfile.file, 0L, MY_SEEK_END, MYF(MY_THREADSAFE));
+ if ((skr=(my_off_t) info->state->key_file_length) != size)
+ {
+ /* Don't give error if file generated by mariapack */
+ if (skr > size && maria_is_any_key_active(info->s->state.key_map))
+ {
+ error=1;
+ _ma_check_print_error(param,
+ "Size of indexfile is: %-8s Should be: %s",
+ llstr(size,buff), llstr(skr,buff2));
+ }
+ else if (!(param->testflag & T_VERY_SILENT))
+ _ma_check_print_warning(param,
+ "Size of indexfile is: %-8s Should be: %s",
+ llstr(size,buff), llstr(skr,buff2));
+ }
+ if (!(param->testflag & T_VERY_SILENT) &&
+ ! (info->s->options & HA_OPTION_COMPRESS_RECORD) &&
+ ulonglong2double(info->state->key_file_length) >
+ ulonglong2double(info->s->base.margin_key_file_length)*0.9)
+ _ma_check_print_warning(param,"Keyfile is almost full, %10s of %10s used",
+ llstr(info->state->key_file_length,buff),
+ llstr(info->s->base.max_key_file_length-1,buff));
+
+ size= my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0));
+ skr=(my_off_t) info->state->data_file_length;
+ if (info->s->options & HA_OPTION_COMPRESS_RECORD)
+ skr+= MEMMAP_EXTRA_MARGIN;
+#ifdef USE_RELOC
+ if (info->data_file_type == STATIC_RECORD &&
+ skr < (my_off_t) info->s->base.reloc*info->s->base.min_pack_length)
+ skr=(my_off_t) info->s->base.reloc*info->s->base.min_pack_length;
+#endif
+ if (skr != size)
+ {
+ info->state->data_file_length=size; /* Skip other errors */
+ if (skr > size && skr != size + MEMMAP_EXTRA_MARGIN)
+ {
+ error=1;
+ _ma_check_print_error(param,"Size of datafile is: %-9s Should be: %s",
+ llstr(size,buff), llstr(skr,buff2));
+ param->testflag|=T_RETRY_WITHOUT_QUICK;
+ }
+ else
+ {
+ _ma_check_print_warning(param,
+ "Size of datafile is: %-9s Should be: %s",
+ llstr(size,buff), llstr(skr,buff2));
+ }
+ }
+ if (!(param->testflag & T_VERY_SILENT) &&
+ !(info->s->options & HA_OPTION_COMPRESS_RECORD) &&
+ ulonglong2double(info->state->data_file_length) >
+ (ulonglong2double(info->s->base.max_data_file_length)*0.9))
+ _ma_check_print_warning(param, "Datafile is almost full, %10s of %10s used",
+ llstr(info->state->data_file_length,buff),
+ llstr(info->s->base.max_data_file_length-1,buff2));
+ DBUG_RETURN(error);
+} /* maria_chk_size */
+
+
+/* Check keys */
+
+int maria_chk_key(HA_CHECK *param, register MARIA_HA *info)
+{
+ uint key,found_keys=0,full_text_keys=0,result=0;
+ ha_rows keys;
+ ha_checksum old_record_checksum,init_checksum;
+ my_off_t all_keydata,all_totaldata,key_totlength,length;
+ ulong *rec_per_key_part;
+ MARIA_SHARE *share=info->s;
+ MARIA_KEYDEF *keyinfo;
+ char buff[22],buff2[22];
+ DBUG_ENTER("maria_chk_key");
+
+ if (!(param->testflag & T_SILENT))
+ puts("- check key delete-chain");
+
+ param->key_file_blocks=info->s->base.keystart;
+ if (check_k_link(param, info, info->s->state.key_del))
+ {
+ if (param->testflag & T_VERBOSE) puts("");
+ _ma_check_print_error(param,"key delete-link-chain corrupted");
+ DBUG_RETURN(-1);
+ }
+
+ if (!(param->testflag & T_SILENT)) puts("- check index reference");
+
+ all_keydata=all_totaldata=key_totlength=0;
+ old_record_checksum=0;
+ init_checksum=param->record_checksum;
+ if (share->data_file_type == STATIC_RECORD)
+ old_record_checksum= (calc_checksum(info->state->records +
+ info->state->del-1) *
+ share->base.pack_reclength);
+ rec_per_key_part= param->rec_per_key_part;
+ for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ;
+ rec_per_key_part+=keyinfo->keysegs, key++, keyinfo++)
+ {
+ param->key_crc[key]=0;
+ if (! maria_is_key_active(share->state.key_map, key))
+ {
+ /* Remember old statistics for key */
+ memcpy((char*) rec_per_key_part,
+ (char*) (share->state.rec_per_key_part +
+ (uint) (rec_per_key_part - param->rec_per_key_part)),
+ keyinfo->keysegs*sizeof(*rec_per_key_part));
+ continue;
+ }
+ found_keys++;
+
+ param->record_checksum=init_checksum;
+
+ bzero((char*) &param->unique_count,sizeof(param->unique_count));
+ bzero((char*) &param->notnull_count,sizeof(param->notnull_count));
+
+ if ((!(param->testflag & T_SILENT)))
+ printf ("- check data record references index: %d\n",key+1);
+ if (keyinfo->flag & HA_FULLTEXT)
+ full_text_keys++;
+ if (share->state.key_root[key] == HA_OFFSET_ERROR &&
+ (info->state->records == 0 || keyinfo->flag & HA_FULLTEXT))
+ goto do_stat;
+ if (!_ma_fetch_keypage(info,keyinfo,share->state.key_root[key],
+ DFLT_INIT_HITS,info->buff,0))
+ {
+ _ma_check_print_error(param,"Can't read indexpage from filepos: %s",
+ llstr(share->state.key_root[key],buff));
+ if (!(param->testflag & T_INFO))
+ DBUG_RETURN(-1);
+ result= -1;
+ continue;
+ }
+ param->key_file_blocks+=keyinfo->block_length;
+ keys=0;
+ param->keydata=param->totaldata=0;
+ param->key_blocks=0;
+ param->max_level=0;
+ if (chk_index(param,info,keyinfo,share->state.key_root[key],info->buff,
+ &keys, param->key_crc+key,1))
+ DBUG_RETURN(-1);
+ if(!(keyinfo->flag & (HA_FULLTEXT | HA_SPATIAL)))
+ {
+ if (keys != info->state->records)
+ {
+ _ma_check_print_error(param,"Found %s keys of %s",llstr(keys,buff),
+ llstr(info->state->records,buff2));
+ if (!(param->testflag & T_INFO))
+ DBUG_RETURN(-1);
+ result= -1;
+ continue;
+ }
+ if ((found_keys - full_text_keys == 1 &&
+ !(share->data_file_type == STATIC_RECORD)) ||
+ (param->testflag & T_DONT_CHECK_CHECKSUM))
+ old_record_checksum= param->record_checksum;
+ else if (old_record_checksum != param->record_checksum)
+ {
+ if (key)
+ _ma_check_print_error(param,"Key %u doesn't point at same records that key 1",
+ key+1);
+ else
+ _ma_check_print_error(param,"Key 1 doesn't point at all records");
+ if (!(param->testflag & T_INFO))
+ DBUG_RETURN(-1);
+ result= -1;
+ continue;
+ }
+ }
+ if ((uint) share->base.auto_key -1 == key)
+ {
+ /* Check that auto_increment key is bigger than max key value */
+ ulonglong auto_increment;
+ info->lastinx=key;
+ _ma_read_key_record(info, info->rec_buff, 0);
+ auto_increment= ma_retrieve_auto_increment(info, info->rec_buff);
+ if (auto_increment > info->s->state.auto_increment)
+ {
+ _ma_check_print_warning(param, "Auto-increment value: %s is smaller "
+ "than max used value: %s",
+ llstr(info->s->state.auto_increment,buff2),
+ llstr(auto_increment, buff));
+ }
+ if (param->testflag & T_AUTO_INC)
+ {
+ set_if_bigger(info->s->state.auto_increment,
+ auto_increment);
+ set_if_bigger(info->s->state.auto_increment,
+ param->auto_increment_value);
+ }
+
+ /* Check that there isn't a row with auto_increment = 0 in the table */
+ maria_extra(info,HA_EXTRA_KEYREAD,0);
+ bzero(info->lastkey,keyinfo->seg->length);
+ if (!maria_rkey(info, info->rec_buff, key, (const uchar*) info->lastkey,
+ (key_part_map)1, HA_READ_KEY_EXACT))
+ {
+ /* Don't count this as a real warning, as maria_chk can't correct it */
+ uint save=param->warning_printed;
+ _ma_check_print_warning(param, "Found row where the auto_increment "
+ "column has the value 0");
+ param->warning_printed=save;
+ }
+ maria_extra(info,HA_EXTRA_NO_KEYREAD,0);
+ }
+
+ length=(my_off_t) isam_key_length(info,keyinfo)*keys + param->key_blocks*2;
+ if (param->testflag & T_INFO && param->totaldata != 0L && keys != 0L)
+ printf("Key: %2d: Keyblocks used: %3d%% Packed: %4d%% Max levels: %2d\n",
+ key+1,
+ (int) (my_off_t2double(param->keydata)*100.0/my_off_t2double(param->totaldata)),
+ (int) ((my_off_t2double(length) - my_off_t2double(param->keydata))*100.0/
+ my_off_t2double(length)),
+ param->max_level);
+ all_keydata+=param->keydata; all_totaldata+=param->totaldata; key_totlength+=length;
+
+do_stat:
+ if (param->testflag & T_STATISTICS)
+ maria_update_key_parts(keyinfo, rec_per_key_part, param->unique_count,
+ param->stats_method == MI_STATS_METHOD_IGNORE_NULLS?
+ param->notnull_count: NULL,
+ (ulonglong)info->state->records);
+ }
+ if (param->testflag & T_INFO)
+ {
+ if (all_totaldata != 0L && found_keys > 0)
+ printf("Total: Keyblocks used: %3d%% Packed: %4d%%\n\n",
+ (int) (my_off_t2double(all_keydata)*100.0/
+ my_off_t2double(all_totaldata)),
+ (int) ((my_off_t2double(key_totlength) -
+ my_off_t2double(all_keydata))*100.0/
+ my_off_t2double(key_totlength)));
+ else if (all_totaldata != 0L && maria_is_any_key_active(share->state.key_map))
+ puts("");
+ }
+ if (param->key_file_blocks != info->state->key_file_length &&
+ param->keys_in_use != ~(ulonglong) 0)
+ _ma_check_print_warning(param, "Some data are unreferenced in keyfile");
+ if (found_keys != full_text_keys)
+ param->record_checksum=old_record_checksum-init_checksum; /* Remove delete links */
+ else
+ param->record_checksum=0;
+ DBUG_RETURN(result);
+} /* maria_chk_key */
+
+
+static int chk_index_down(HA_CHECK *param, MARIA_HA *info,
+ MARIA_KEYDEF *keyinfo,
+ my_off_t page, uchar *buff, ha_rows *keys,
+ ha_checksum *key_checksum, uint level)
+{
+ char llbuff[22],llbuff2[22];
+ DBUG_ENTER("chk_index_down");
+
+ /* Key blocks must lay within the key file length entirely. */
+ if (page + keyinfo->block_length > info->state->key_file_length)
+ {
+ /* purecov: begin tested */
+ /* Give it a chance to fit in the real file size. */
+ my_off_t max_length= my_seek(info->s->kfile.file, 0L, MY_SEEK_END,
+ MYF(MY_THREADSAFE));
+ _ma_check_print_error(param, "Invalid key block position: %s "
+ "key block size: %u file_length: %s",
+ llstr(page, llbuff), keyinfo->block_length,
+ llstr(info->state->key_file_length, llbuff2));
+ if (page + keyinfo->block_length > max_length)
+ goto err;
+ /* Fix the remembered key file length. */
+ info->state->key_file_length= (max_length &
+ ~ (my_off_t) (keyinfo->block_length - 1));
+ /* purecov: end */
+ }
+
+ /* Key blocks must be aligned at block length */
+ if (page & (info->s->block_size -1))
+ {
+ /* purecov: begin tested */
+ _ma_check_print_error(param, "Mis-aligned key block: %s "
+ "minimum key block length: %u",
+ llstr(page, llbuff), info->s->block_size);
+ goto err;
+ /* purecov: end */
+ }
+
+ if (!_ma_fetch_keypage(info,keyinfo,page, DFLT_INIT_HITS,buff,0))
+ {
+ _ma_check_print_error(param,"Can't read key from filepos: %s",
+ llstr(page,llbuff));
+ goto err;
+ }
+ param->key_file_blocks+=keyinfo->block_length;
+ if (chk_index(param,info,keyinfo,page,buff,keys,key_checksum,level))
+ goto err;
+
+ DBUG_RETURN(0);
+
+ /* purecov: begin tested */
+err:
+ DBUG_RETURN(1);
+ /* purecov: end */
+}
+
+
+/*
+ "Ignore NULLs" statistics collection method: process first index tuple.
+
+ SYNOPSIS
+ maria_collect_stats_nonulls_first()
+ keyseg IN Array of key part descriptions
+ notnull INOUT Array, notnull[i] = (number of {keypart1...keypart_i}
+ tuples that don't contain NULLs)
+ key IN Key values tuple
+
+ DESCRIPTION
+ Process the first index tuple - find out which prefix tuples don't
+ contain NULLs, and update the array of notnull counters accordingly.
+*/
+
+static
+void maria_collect_stats_nonulls_first(HA_KEYSEG *keyseg, ulonglong *notnull,
+ const uchar *key)
+{
+ uint first_null, kp;
+ first_null= ha_find_null(keyseg, (uchar*) key) - keyseg;
+ /*
+ All prefix tuples that don't include keypart_{first_null} are not-null
+ tuples (and all others aren't), increment counters for them.
+ */
+ for (kp= 0; kp < first_null; kp++)
+ notnull[kp]++;
+}
+
+
+/*
+ "Ignore NULLs" statistics collection method: process next index tuple.
+
+ SYNOPSIS
+ maria_collect_stats_nonulls_next()
+ keyseg IN Array of key part descriptions
+ notnull INOUT Array, notnull[i] = (number of {keypart1...keypart_i}
+ tuples that don't contain NULLs)
+ prev_key IN Previous key values tuple
+ last_key IN Next key values tuple
+
+ DESCRIPTION
+ Process the next index tuple:
+ 1. Find out which prefix tuples of last_key don't contain NULLs, and
+ update the array of notnull counters accordingly.
+ 2. Find the first keypart number where the prev_key and last_key tuples
+ are different(A), or last_key has NULL value(B), and return it, so the
+ caller can count number of unique tuples for each key prefix. We don't
+ need (B) to be counted, and that is compensated back in
+ maria_update_key_parts().
+
+ RETURN
+ 1 + number of first keypart where values differ or last_key tuple has NULL
+*/
+
+static
+int maria_collect_stats_nonulls_next(HA_KEYSEG *keyseg, ulonglong *notnull,
+ const uchar *prev_key,
+ const uchar *last_key)
+{
+ uint diffs[2];
+ uint first_null_seg, kp;
+ HA_KEYSEG *seg;
+
+ /*
+ Find the first keypart where values are different or either of them is
+ NULL. We get results in diffs array:
+ diffs[0]= 1 + number of first different keypart
+ diffs[1]=offset: (last_key + diffs[1]) points to first value in
+ last_key that is NULL or different from corresponding
+ value in prev_key.
+ */
+ ha_key_cmp(keyseg, (uchar*) prev_key, (uchar*) last_key, USE_WHOLE_KEY,
+ SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL, diffs);
+ seg= keyseg + diffs[0] - 1;
+
+ /* Find first NULL in last_key */
+ first_null_seg= ha_find_null(seg, (uchar*) last_key + diffs[1]) - keyseg;
+ for (kp= 0; kp < first_null_seg; kp++)
+ notnull[kp]++;
+
+ /*
+ Return 1+ number of first key part where values differ. Don't care if
+ these were NULLs and not .... We compensate for that in
+ maria_update_key_parts.
+ */
+ return diffs[0];
+}
+
+
+ /* Check if index is ok */
+
+static int chk_index(HA_CHECK *param, MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ my_off_t page, uchar *buff, ha_rows *keys,
+ ha_checksum *key_checksum, uint level)
+{
+ int flag;
+ uint used_length,comp_flag,nod_flag,key_length=0;
+ uchar key[HA_MAX_POSSIBLE_KEY_BUFF],*temp_buff,*keypos,*old_keypos,*endpos;
+ my_off_t next_page,record;
+ char llbuff[22];
+ uint diff_pos[2];
+ DBUG_ENTER("chk_index");
+ DBUG_DUMP("buff",(uchar*) buff,maria_data_on_page(buff));
+
+ /* TODO: implement appropriate check for RTree keys */
+ if (keyinfo->flag & HA_SPATIAL)
+ DBUG_RETURN(0);
+
+ if (!(temp_buff=(uchar*) my_alloca((uint) keyinfo->block_length)))
+ {
+ _ma_check_print_error(param,"Not enough memory for keyblock");
+ DBUG_RETURN(-1);
+ }
+
+ if (keyinfo->flag & HA_NOSAME)
+ comp_flag=SEARCH_FIND | SEARCH_UPDATE; /* Not real duplicates */
+ else
+ comp_flag=SEARCH_SAME; /* Keys in positionorder */
+ nod_flag=_ma_test_if_nod(buff);
+ used_length= maria_data_on_page(buff);
+ keypos=buff+2+nod_flag;
+ endpos=buff+used_length;
+
+ param->keydata+=used_length; param->totaldata+=keyinfo->block_length; /* INFO */
+ param->key_blocks++;
+ if (level > param->max_level)
+ param->max_level=level;
+
+ if (used_length > keyinfo->block_length)
+ {
+ _ma_check_print_error(param,"Wrong pageinfo at page: %s",
+ llstr(page,llbuff));
+ goto err;
+ }
+ for ( ;; )
+ {
+ if (*_ma_killed_ptr(param))
+ goto err;
+ memcpy(info->lastkey, key, key_length);
+ info->lastkey_length= key_length;
+ if (nod_flag)
+ {
+ next_page= _ma_kpos(nod_flag,keypos);
+ if (chk_index_down(param,info,keyinfo,next_page,
+ temp_buff,keys,key_checksum,level+1))
+ goto err;
+ }
+ old_keypos=keypos;
+ if (keypos >= endpos ||
+ (key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&keypos,key)) == 0)
+ break;
+ if (keypos > endpos)
+ {
+ _ma_check_print_error(param,"Wrong key block length at page: %s",
+ llstr(page,llbuff));
+ goto err;
+ }
+ if ((*keys)++ &&
+ (flag=ha_key_cmp(keyinfo->seg, (uchar*) info->lastkey, (uchar*) key,
+ key_length, comp_flag, diff_pos)) >=0)
+ {
+ DBUG_DUMP("old", info->lastkey, info->lastkey_length);
+ DBUG_DUMP("new", key, key_length);
+ DBUG_DUMP("new_in_page", old_keypos, (uint) (keypos-old_keypos));
+
+ if (comp_flag & SEARCH_FIND && flag == 0)
+ _ma_check_print_error(param,"Found duplicated key at page %s",
+ llstr(page,llbuff));
+ else
+ _ma_check_print_error(param,"Key in wrong position at page %s",
+ llstr(page,llbuff));
+ goto err;
+ }
+ if (param->testflag & T_STATISTICS)
+ {
+ if (*keys != 1L) /* not first_key */
+ {
+ if (param->stats_method == MI_STATS_METHOD_NULLS_NOT_EQUAL)
+ ha_key_cmp(keyinfo->seg, (uchar*) info->lastkey, (uchar*) key,
+ USE_WHOLE_KEY, SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL,
+ diff_pos);
+ else if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS)
+ {
+ diff_pos[0]= maria_collect_stats_nonulls_next(keyinfo->seg,
+ param->notnull_count,
+ info->lastkey, key);
+ }
+ param->unique_count[diff_pos[0]-1]++;
+ }
+ else
+ {
+ if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS)
+ maria_collect_stats_nonulls_first(keyinfo->seg, param->notnull_count,
+ key);
+ }
+ }
+ (*key_checksum)+= maria_byte_checksum((uchar*) key,
+ key_length- info->s->rec_reflength);
+ record= _ma_dpos(info,0,key+key_length);
+ if (keyinfo->flag & HA_FULLTEXT) /* special handling for ft2 */
+ {
+ uint off;
+ int subkeys;
+ get_key_full_length_rdonly(off, key);
+ subkeys=ft_sintXkorr(key+off);
+ if (subkeys < 0)
+ {
+ ha_rows tmp_keys=0;
+ if (chk_index_down(param,info,&info->s->ft2_keyinfo,record,
+ temp_buff,&tmp_keys,key_checksum,1))
+ goto err;
+ if (tmp_keys + subkeys)
+ {
+ _ma_check_print_error(param,
+ "Number of words in the 2nd level tree "
+ "does not match the number in the header. "
+ "Parent word in on the page %s, offset %u",
+ llstr(page,llbuff), (uint) (old_keypos-buff));
+ goto err;
+ }
+ (*keys)+=tmp_keys-1;
+ continue;
+ }
+ /* fall through */
+ }
+ if (record >= info->state->data_file_length)
+ {
+#ifndef DBUG_OFF
+ char llbuff2[22], llbuff3[22];
+#endif
+ _ma_check_print_error(param,"Found key at page %s that points to record outside datafile",llstr(page,llbuff));
+ DBUG_PRINT("test",("page: %s record: %s filelength: %s",
+ llstr(page,llbuff),llstr(record,llbuff2),
+ llstr(info->state->data_file_length,llbuff3)));
+ DBUG_DUMP("key",(uchar*) key,key_length);
+ DBUG_DUMP("new_in_page",(char*) old_keypos,(uint) (keypos-old_keypos));
+ goto err;
+ }
+ param->record_checksum+= (ha_checksum) record;
+ }
+ if (keypos != endpos)
+ {
+ _ma_check_print_error(param,"Keyblock size at page %s is not correct. Block length: %d key length: %d",
+ llstr(page,llbuff), used_length, (keypos - buff));
+ goto err;
+ }
+ my_afree((uchar*) temp_buff);
+ DBUG_RETURN(0);
+ err:
+ my_afree((uchar*) temp_buff);
+ DBUG_RETURN(1);
+} /* chk_index */
+
+
+ /* Calculate a checksum of 1+2+3+4...N = N*(N+1)/2 without overflow */
+
+static ha_checksum calc_checksum(ha_rows count)
+{
+ ulonglong sum,a,b;
+ DBUG_ENTER("calc_checksum");
+
+ sum=0;
+ a=count; b=count+1;
+ if (a & 1)
+ b>>=1;
+ else
+ a>>=1;
+ while (b)
+ {
+ if (b & 1)
+ sum+=a;
+ a<<=1; b>>=1;
+ }
+ DBUG_PRINT("exit",("sum: %lx",(ulong) sum));
+ DBUG_RETURN((ha_checksum) sum);
+} /* calc_checksum */
+
+
+ /* Calc length of key in normal isam */
+
+static uint isam_key_length(MARIA_HA *info, register MARIA_KEYDEF *keyinfo)
+{
+ uint length;
+ HA_KEYSEG *keyseg;
+ DBUG_ENTER("isam_key_length");
+
+ length= info->s->rec_reflength;
+ for (keyseg=keyinfo->seg ; keyseg->type ; keyseg++)
+ length+= keyseg->length;
+
+ DBUG_PRINT("exit",("length: %d",length));
+ DBUG_RETURN(length);
+} /* key_length */
+
+
+
+static void record_pos_to_txt(MARIA_HA *info, my_off_t recpos,
+ char *buff)
+{
+ if (info->s->data_file_type != BLOCK_RECORD)
+ llstr(recpos, buff);
+ else
+ {
+ my_off_t page= ma_recordpos_to_page(recpos);
+ uint row= ma_recordpos_to_dir_entry(recpos);
+ char *end= longlong10_to_str(page, buff, 10);
+ *(end++)= ':';
+ longlong10_to_str(row, end, 10);
+ }
+}
+
+
+/*
+ Check that keys in records exist in index tree
+
+ SYNOPSIS
+ check_keys_in_record()
+ param Check paramenter
+ info Maria handler
+ extend Type of check (extended or normal)
+ start_recpos Position to row
+ record Record buffer
+
+ NOTES
+ This function also calculates record checksum & number of rows
+*/
+
+static int check_keys_in_record(HA_CHECK *param, MARIA_HA *info, int extend,
+ my_off_t start_recpos, uchar *record)
+{
+ MARIA_KEYDEF *keyinfo;
+ char llbuff[22+4];
+ uint key;
+
+ param->tmp_record_checksum+= (ha_checksum) start_recpos;
+ param->records++;
+ if (param->testflag & T_WRITE_LOOP && param->records % WRITE_COUNT == 0)
+ {
+ printf("%s\r", llstr(param->records, llbuff));
+ VOID(fflush(stdout));
+ }
+
+ /* Check if keys match the record */
+ for (key=0, keyinfo= info->s->keyinfo; key < info->s->base.keys;
+ key++,keyinfo++)
+ {
+ if (maria_is_key_active(info->s->state.key_map, key))
+ {
+ if(!(keyinfo->flag & HA_FULLTEXT))
+ {
+ uint key_length= _ma_make_key(info,key,info->lastkey,record,
+ start_recpos);
+ if (extend)
+ {
+ /* We don't need to lock the key tree here as we don't allow
+ concurrent threads when running maria_chk
+ */
+ int search_result=
+#ifdef HAVE_RTREE_KEYS
+ (keyinfo->flag & HA_SPATIAL) ?
+ maria_rtree_find_first(info, key, info->lastkey, key_length,
+ MBR_EQUAL | MBR_DATA) :
+#endif
+ _ma_search(info,keyinfo,info->lastkey,key_length,
+ SEARCH_SAME, info->s->state.key_root[key]);
+ if (search_result)
+ {
+ record_pos_to_txt(info, start_recpos, llbuff);
+ _ma_check_print_error(param,
+ "Record at: %14s "
+ "Can't find key for index: %2d",
+ llbuff, key+1);
+ if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+ return -1;
+ }
+ }
+ else
+ param->tmp_key_crc[key]+=
+ maria_byte_checksum((uchar*) info->lastkey, key_length);
+ }
+ }
+ }
+ return 0;
+}
+
+
+/*
+ Functions to loop through all rows and check if they are ok
+
+ NOTES
+ One function for each record format
+
+ RESULT
+ 0 ok
+ -1 Interrupted by user
+ 1 Error
+*/
+
+static int check_static_record(HA_CHECK *param, MARIA_HA *info, int extend,
+ uchar *record)
+{
+ my_off_t start_recpos, pos;
+ char llbuff[22];
+
+ pos= 0;
+ while (pos < info->state->data_file_length)
+ {
+ if (*_ma_killed_ptr(param))
+ return -1;
+ if (my_b_read(&param->read_cache,(uchar*) record,
+ info->s->base.pack_reclength))
+ {
+ _ma_check_print_error(param,
+ "got error: %d when reading datafile at position: %s",
+ my_errno, llstr(pos, llbuff));
+ return 1;
+ }
+ start_recpos= pos;
+ pos+= info->s->base.pack_reclength;
+ param->splits++;
+ if (*record == '\0')
+ {
+ param->del_blocks++;
+ param->del_length+= info->s->base.pack_reclength;
+ continue; /* Record removed */
+ }
+ param->glob_crc+= _ma_static_checksum(info,record);
+ param->used+= info->s->base.pack_reclength;
+ if (check_keys_in_record(param, info, extend, start_recpos, record))
+ return 1;
+ }
+ return 0;
+}
+
+
+static int check_dynamic_record(HA_CHECK *param, MARIA_HA *info, int extend,
+ uchar *record)
+{
+ MARIA_BLOCK_INFO block_info;
+ my_off_t start_recpos, start_block, pos;
+ uchar *to;
+ ulong left_length;
+ uint b_type;
+ char llbuff[22],llbuff2[22],llbuff3[22];
+ DBUG_ENTER("check_dynamic_record");
+
+ LINT_INIT(left_length);
+ LINT_INIT(start_recpos);
+ LINT_INIT(to);
+
+ pos= 0;
+ while (pos < info->state->data_file_length)
+ {
+ my_bool got_error= 0;
+ int flag;
+ if (*_ma_killed_ptr(param))
+ DBUG_RETURN(-1);
+
+ flag= block_info.second_read=0;
+ block_info.next_filepos=pos;
+ do
+ {
+ if (_ma_read_cache(&param->read_cache,(uchar*) block_info.header,
+ (start_block=block_info.next_filepos),
+ sizeof(block_info.header),
+ (flag ? 0 : READING_NEXT) | READING_HEADER))
+ {
+ _ma_check_print_error(param,
+ "got error: %d when reading datafile at "
+ "position: %s",
+ my_errno, llstr(start_block, llbuff));
+ DBUG_RETURN(1);
+ }
+
+ if (start_block & (MARIA_DYN_ALIGN_SIZE-1))
+ {
+ _ma_check_print_error(param,"Wrong aligned block at %s",
+ llstr(start_block,llbuff));
+ DBUG_RETURN(1);
+ }
+ b_type= _ma_get_block_info(&block_info,-1,start_block);
+ if (b_type & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+ BLOCK_FATAL_ERROR))
+ {
+ if (b_type & BLOCK_SYNC_ERROR)
+ {
+ if (flag)
+ {
+ _ma_check_print_error(param,"Unexpected byte: %d at link: %s",
+ (int) block_info.header[0],
+ llstr(start_block,llbuff));
+ DBUG_RETURN(1);
+ }
+ pos=block_info.filepos+block_info.block_len;
+ goto next;
+ }
+ if (b_type & BLOCK_DELETED)
+ {
+ if (block_info.block_len < info->s->base.min_block_length)
+ {
+ _ma_check_print_error(param,
+ "Deleted block with impossible length %lu at %s",
+ block_info.block_len,llstr(pos,llbuff));
+ DBUG_RETURN(1);
+ }
+ if ((block_info.next_filepos != HA_OFFSET_ERROR &&
+ block_info.next_filepos >= info->state->data_file_length) ||
+ (block_info.prev_filepos != HA_OFFSET_ERROR &&
+ block_info.prev_filepos >= info->state->data_file_length))
+ {
+ _ma_check_print_error(param,"Delete link points outside datafile at %s",
+ llstr(pos,llbuff));
+ DBUG_RETURN(1);
+ }
+ param->del_blocks++;
+ param->del_length+= block_info.block_len;
+ param->splits++;
+ pos= block_info.filepos+block_info.block_len;
+ goto next;
+ }
+ _ma_check_print_error(param,"Wrong bytesec: %d-%d-%d at linkstart: %s",
+ block_info.header[0],block_info.header[1],
+ block_info.header[2],
+ llstr(start_block,llbuff));
+ DBUG_RETURN(1);
+ }
+ if (info->state->data_file_length < block_info.filepos+
+ block_info.block_len)
+ {
+ _ma_check_print_error(param,
+ "Recordlink that points outside datafile at %s",
+ llstr(pos,llbuff));
+ got_error=1;
+ break;
+ }
+ param->splits++;
+ if (!flag++) /* First block */
+ {
+ start_recpos=pos;
+ pos=block_info.filepos+block_info.block_len;
+ if (block_info.rec_len > (uint) info->s->base.max_pack_length)
+ {
+ _ma_check_print_error(param,"Found too long record (%lu) at %s",
+ (ulong) block_info.rec_len,
+ llstr(start_recpos,llbuff));
+ got_error=1;
+ break;
+ }
+ if (info->s->base.blobs)
+ {
+ if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
+ block_info.rec_len +
+ info->s->base.extra_rec_buff_size))
+
+ {
+ _ma_check_print_error(param,
+ "Not enough memory (%lu) for blob at %s",
+ (ulong) block_info.rec_len,
+ llstr(start_recpos,llbuff));
+ got_error=1;
+ break;
+ }
+ }
+ to= info->rec_buff;
+ left_length= block_info.rec_len;
+ }
+ if (left_length < block_info.data_len)
+ {
+ _ma_check_print_error(param,"Found too long record (%lu) at %s",
+ (ulong) block_info.data_len,
+ llstr(start_recpos,llbuff));
+ got_error=1;
+ break;
+ }
+ if (_ma_read_cache(&param->read_cache,(uchar*) to,block_info.filepos,
+ (uint) block_info.data_len,
+ flag == 1 ? READING_NEXT : 0))
+ {
+ _ma_check_print_error(param,
+ "got error: %d when reading datafile at position: %s", my_errno, llstr(block_info.filepos, llbuff));
+
+ DBUG_RETURN(1);
+ }
+ to+=block_info.data_len;
+ param->link_used+= block_info.filepos-start_block;
+ param->used+= block_info.filepos - start_block + block_info.data_len;
+ param->empty+= block_info.block_len-block_info.data_len;
+ left_length-= block_info.data_len;
+ if (left_length)
+ {
+ if (b_type & BLOCK_LAST)
+ {
+ _ma_check_print_error(param,
+ "Wrong record length %s of %s at %s",
+ llstr(block_info.rec_len-left_length,llbuff),
+ llstr(block_info.rec_len, llbuff2),
+ llstr(start_recpos,llbuff3));
+ got_error=1;
+ break;
+ }
+ if (info->state->data_file_length < block_info.next_filepos)
+ {
+ _ma_check_print_error(param,
+ "Found next-recordlink that points outside datafile at %s",
+ llstr(block_info.filepos,llbuff));
+ got_error=1;
+ break;
+ }
+ }
+ } while (left_length);
+
+ if (! got_error)
+ {
+ if (_ma_rec_unpack(info,record,info->rec_buff,block_info.rec_len) ==
+ MY_FILE_ERROR)
+ {
+ _ma_check_print_error(param,"Found wrong record at %s",
+ llstr(start_recpos,llbuff));
+ got_error=1;
+ }
+ else
+ {
+ ha_checksum checksum= 0;
+ if (info->s->calc_checksum)
+ checksum= (*info->s->calc_checksum)(info, record);
+
+ if (param->testflag & (T_EXTEND | T_MEDIUM | T_VERBOSE))
+ {
+ if (_ma_rec_check(info,record, info->rec_buff,block_info.rec_len,
+ test(info->s->calc_checksum), checksum))
+ {
+ _ma_check_print_error(param,"Found wrong packed record at %s",
+ llstr(start_recpos,llbuff));
+ got_error= 1;
+ }
+ }
+ param->glob_crc+= checksum;
+ }
+
+ if (! got_error)
+ {
+ if (check_keys_in_record(param, info, extend, start_recpos, record))
+ DBUG_RETURN(1);
+ }
+ else
+ {
+ if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+ DBUG_RETURN(1);
+ }
+ }
+ else if (!flag)
+ pos= block_info.filepos+block_info.block_len;
+next:;
+ }
+ DBUG_RETURN(0);
+}
+
+
+static int check_compressed_record(HA_CHECK *param, MARIA_HA *info, int extend,
+ uchar *record)
+{
+ my_off_t start_recpos, pos;
+ char llbuff[22];
+ bool got_error= 0;
+ MARIA_BLOCK_INFO block_info;
+ DBUG_ENTER("check_compressed_record");
+
+ pos= info->s->pack.header_length; /* Skip header */
+ while (pos < info->state->data_file_length)
+ {
+ if (*_ma_killed_ptr(param))
+ DBUG_RETURN(-1);
+
+ if (_ma_read_cache(&param->read_cache,(uchar*) block_info.header, pos,
+ info->s->pack.ref_length, READING_NEXT))
+ {
+ _ma_check_print_error(param,
+ "got error: %d when reading datafile at position: %s",
+ my_errno, llstr(pos, llbuff));
+ DBUG_RETURN(1);
+ }
+
+ start_recpos= pos;
+ param->splits++;
+ VOID(_ma_pack_get_block_info(info, &info->bit_buff, &block_info,
+ &info->rec_buff, &info->rec_buff_size, -1,
+ start_recpos));
+ pos=block_info.filepos+block_info.rec_len;
+ if (block_info.rec_len < (uint) info->s->min_pack_length ||
+ block_info.rec_len > (uint) info->s->max_pack_length)
+ {
+ _ma_check_print_error(param,
+ "Found block with wrong recordlength: %d at %s",
+ block_info.rec_len, llstr(start_recpos,llbuff));
+ got_error=1;
+ goto end;
+ }
+ if (_ma_read_cache(&param->read_cache,(uchar*) info->rec_buff,
+ block_info.filepos, block_info.rec_len, READING_NEXT))
+ {
+ _ma_check_print_error(param,
+ "got error: %d when reading datafile at position: %s",
+ my_errno, llstr(block_info.filepos, llbuff));
+ DBUG_RETURN(1);
+ }
+ if (_ma_pack_rec_unpack(info, &info->bit_buff, record,
+ info->rec_buff, block_info.rec_len))
+ {
+ _ma_check_print_error(param,"Found wrong record at %s",
+ llstr(start_recpos,llbuff));
+ got_error=1;
+ goto end;
+ }
+ param->glob_crc+= (*info->s->calc_checksum)(info,record);
+ param->link_used+= (block_info.filepos - start_recpos);
+ param->used+= (pos-start_recpos);
+
+end:
+ if (! got_error)
+ {
+ if (check_keys_in_record(param, info, extend, start_recpos, record))
+ DBUG_RETURN(1);
+ }
+ else
+ {
+ got_error= 0; /* Reset for next loop */
+ if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+ DBUG_RETURN(1);
+ }
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Check if layout on a page is ok
+
+ NOTES
+ This is for rows-in-block format.
+*/
+
+static int check_page_layout(HA_CHECK *param, MARIA_HA *info,
+ my_off_t page_pos, uchar *page,
+ uint row_count, uint head_empty,
+ uint *real_rows_found)
+{
+ uint empty, last_row_end, row, first_dir_entry;
+ uchar *dir_entry;
+ char llbuff[22];
+ DBUG_ENTER("check_page_layout");
+
+ empty= 0;
+ last_row_end= PAGE_HEADER_SIZE;
+ *real_rows_found= 0;
+
+ dir_entry= page+ info->s->block_size - PAGE_SUFFIX_SIZE;
+ first_dir_entry= info->s->block_size - row_count* DIR_ENTRY_SIZE;
+ for (row= 0 ; row < row_count ; row++)
+ {
+ uint pos, length;
+ dir_entry-= DIR_ENTRY_SIZE;
+ pos= uint2korr(dir_entry);
+ if (!pos)
+ {
+ if (row == row_count -1)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: First entry in directory is 0",
+ llstr(page_pos, llbuff));
+ if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+ DBUG_RETURN(1);
+ }
+ continue; /* Deleted row */
+ }
+ (*real_rows_found)++;
+ length= uint2korr(dir_entry+2);
+ param->used+= length;
+ if (pos < last_row_end)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Row %3u overlapps with previous row",
+ llstr(page_pos, llbuff), row);
+ DBUG_RETURN(1);
+ }
+ empty+= (pos - last_row_end);
+ last_row_end= pos + length;
+ if (last_row_end > first_dir_entry)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Row %3u overlapps with directory",
+ llstr(page_pos, llbuff), row);
+ DBUG_RETURN(1);
+ }
+ }
+ empty+= (first_dir_entry - last_row_end);
+
+ if (empty != head_empty)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Wrong empty size. Stored: %5u Actual: %5u",
+ llstr(page_pos, llbuff), head_empty, empty);
+ DBUG_RETURN(param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE));
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Check all rows on head page
+
+ NOTES
+ This is for rows-in-block format.
+
+ Before this, we have already called check_page_layout(), so
+ we know the block is logicaly correct (even if the rows may not be that)
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+
+static my_bool check_head_page(HA_CHECK *param, MARIA_HA *info, uchar *record,
+ int extend, my_off_t page_pos, uchar *page_buff,
+ uint row_count)
+{
+ uchar *dir_entry;
+ uint row;
+ char llbuff[22], llbuff2[22];
+ DBUG_ENTER("check_head_page");
+
+ dir_entry= page_buff+ info->s->block_size - PAGE_SUFFIX_SIZE;
+ for (row= 0 ; row < row_count ; row++)
+ {
+ uint pos, length, flag;
+ dir_entry-= DIR_ENTRY_SIZE;
+ pos= uint2korr(dir_entry);
+ if (!pos)
+ continue;
+ length= uint2korr(dir_entry+2);
+ if (length < info->s->base.min_block_length)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Row %3u is too short (%d bytes)",
+ llstr(page_pos, llbuff), row, length);
+ DBUG_RETURN(1);
+ }
+ flag= (uint) (uchar) page_buff[pos];
+ if (flag & ~(ROW_FLAG_ALL))
+ _ma_check_print_error(param,
+ "Page %9s: Row %3u has wrong flag: %d",
+ llstr(page_pos, llbuff), row, flag);
+
+ DBUG_PRINT("info", ("rowid: %s page: %lu row: %u",
+ llstr(ma_recordpos(page_pos/info->s->block_size, row),
+ llbuff),
+ (ulong) (page_pos / info->s->block_size), row));
+ if (_ma_read_block_record2(info, record, page_buff+pos,
+ page_buff+pos+length))
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Row %3d is crashed",
+ llstr(page_pos, llbuff), row);
+ if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+ DBUG_RETURN(1);
+ continue;
+ }
+ if (info->s->calc_checksum)
+ {
+ ha_checksum checksum= (*info->s->calc_checksum)(info, record);
+ if (info->cur_row.checksum != (checksum & 255))
+ _ma_check_print_error(param, "Page %9s: Row %3d has wrong checksum",
+ llstr(page_pos, llbuff), row);
+ param->glob_crc+= checksum;
+ }
+ if (info->cur_row.extents_count)
+ {
+ uchar *extents= info->cur_row.extents;
+ uint i;
+ /* Check that bitmap has the right marker for the found extents */
+ for (i= 0 ; i < info->cur_row.extents_count ; i++)
+ {
+ uint page, page_count, page_type;
+ page= uint5korr(extents);
+ page_count= uint2korr(extents+5);
+ extents+= ROW_EXTENT_SIZE;
+ page_type= BLOB_PAGE;
+ if (page_count & TAIL_BIT)
+ {
+ page_count= 1;
+ page_type= TAIL_PAGE;
+ }
+ for ( ; page_count--; page++)
+ {
+ uint bitmap_pattern;
+ if (_ma_check_if_right_bitmap_type(info, page_type, page,
+ &bitmap_pattern))
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Row: %3d has an extent with wrong information in bitmap: Page %9s Page_type: %d Bitmap: %d",
+ llstr(page_pos, llbuff), row,
+ llstr(page * info->s->bitmap.block_size,
+ llbuff2),
+ page_type,
+ bitmap_pattern);
+ if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+ DBUG_RETURN(1);
+ }
+ }
+ }
+ }
+ param->full_page_count+= info->cur_row.full_page_count;
+ param->tail_count+= info->cur_row.tail_count;
+ if (check_keys_in_record(param, info, extend,
+ ma_recordpos(page_pos/info->s->block_size, row),
+ record))
+ DBUG_RETURN(1);
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Check if rows-in-block data file is consistent
+*/
+
+static int check_block_record(HA_CHECK *param, MARIA_HA *info, int extend,
+ uchar *record)
+{
+ my_off_t pos;
+ uchar *page_buff, *bitmap_buff, *data;
+ char llbuff[22], llbuff2[22];
+ uint block_size= info->s->block_size;
+ ha_rows full_page_count, tail_count;
+ my_bool full_dir;
+ uint offset_page, offset;
+
+ LINT_INIT(full_dir);
+
+ if (_ma_scan_init_block_record(info))
+ {
+ _ma_check_print_error(param, "got error %d when initializing scan",
+ my_errno);
+ return 1;
+ }
+ bitmap_buff= info->scan.bitmap_buff;
+ page_buff= info->scan.page_buff;
+ full_page_count= tail_count= 0;
+ param->full_page_count= param->tail_count= 0;
+ param->used= param->link_used= 0;
+
+ for (pos= 0;
+ pos < info->state->data_file_length;
+ pos+= block_size)
+ {
+ uint row_count, real_row_count, empty_space, page_type, bitmap_pattern;
+ LINT_INIT(row_count);
+ LINT_INIT(empty_space);
+
+ if (*_ma_killed_ptr(param))
+ {
+ _ma_scan_end_block_record(info);
+ return -1;
+ }
+ if (((pos / block_size) % info->s->bitmap.pages_covered) == 0)
+ {
+ /* Bitmap page */
+ if (pagecache_read(info->s->pagecache,
+ &info->dfile,
+ (pos / block_size), 1,
+ bitmap_buff,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == 0)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Got error: %d when reading datafile",
+ my_errno, llstr(pos, llbuff));
+ goto err;
+ }
+ param->used+= block_size;
+ param->link_used+= block_size;
+ continue;
+ }
+ /* Skip pages marked as empty in bitmap */
+ offset_page= (((pos / block_size) % info->s->bitmap.pages_covered) -1) * 3;
+ offset= offset_page & 7;
+ data= bitmap_buff + offset_page / 8;
+ bitmap_pattern= uint2korr(data);
+ param->splits++;
+ if (!((bitmap_pattern >> offset) & 7))
+ {
+ param->empty+= block_size;
+ param->del_blocks++;
+ continue;
+ }
+
+ if (pagecache_read(info->s->pagecache,
+ &info->dfile,
+ (pos / block_size), 1,
+ page_buff,
+ info->s->page_type,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == 0)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Got error: %d when reading datafile",
+ my_errno, llstr(pos, llbuff));
+ goto err;
+ }
+ page_type= page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK;
+ if (page_type == UNALLOCATED_PAGE || page_type >= MAX_PAGE_TYPE)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Found wrong page type %d\n",
+ llstr(pos, llbuff), page_type);
+ if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+ goto err;
+ continue;
+ }
+ switch ((enum en_page_type) page_type) {
+ case UNALLOCATED_PAGE:
+ case MAX_PAGE_TYPE:
+ DBUG_ASSERT(0); /* Impossible */
+ break;
+ case HEAD_PAGE:
+ row_count= ((uchar*) page_buff)[DIR_COUNT_OFFSET];
+ empty_space= uint2korr(page_buff + EMPTY_SPACE_OFFSET);
+ param->used+= (PAGE_HEADER_SIZE + PAGE_SUFFIX_SIZE +
+ row_count * DIR_ENTRY_SIZE);
+ param->link_used+= (PAGE_HEADER_SIZE + PAGE_SUFFIX_SIZE +
+ row_count * DIR_ENTRY_SIZE);
+ full_dir= row_count == MAX_ROWS_PER_PAGE;
+ break;
+ case TAIL_PAGE:
+ row_count= ((uchar*) page_buff)[DIR_COUNT_OFFSET];
+ empty_space= uint2korr(page_buff + EMPTY_SPACE_OFFSET);
+ param->used+= (PAGE_HEADER_SIZE + PAGE_SUFFIX_SIZE +
+ row_count * DIR_ENTRY_SIZE);
+ param->link_used+= (PAGE_HEADER_SIZE + PAGE_SUFFIX_SIZE +
+ row_count * DIR_ENTRY_SIZE);
+ full_dir= row_count == MAX_ROWS_PER_PAGE;
+ break;
+ case BLOB_PAGE:
+ full_page_count++;
+ full_dir= 0;
+ empty_space= block_size; /* for error reporting */
+ param->link_used+= (LSN_SIZE + PAGE_TYPE_SIZE);
+ param->used+= block_size;
+ break;
+ }
+ if (_ma_check_bitmap_data(info, page_type, pos / block_size,
+ full_dir ? 0 : empty_space,
+ &bitmap_pattern))
+ {
+ if (bitmap_pattern == ~(uint) 0)
+ _ma_check_print_error(param,
+ "Page: %9s: Wrong bitmap for data on page",
+ llstr(pos, llbuff));
+ else
+ _ma_check_print_error(param,
+ "Page %9s: Wrong data in bitmap. Page_type: %d empty_space: %u Bitmap-bits: %d",
+ llstr(pos, llbuff), page_type, empty_space,
+ bitmap_pattern);
+ if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+ goto err;
+ }
+ if ((enum en_page_type) page_type == BLOB_PAGE)
+ continue;
+ param->empty+= empty_space;
+ if (check_page_layout(param, info, pos, page_buff, row_count,
+ empty_space, &real_row_count))
+ goto err;
+ if ((enum en_page_type) page_type == TAIL_PAGE)
+ {
+ tail_count+= real_row_count;
+ continue;
+ }
+ if (check_head_page(param, info, record, extend, pos, page_buff,
+ row_count))
+ goto err;
+ }
+
+ _ma_scan_end_block_record(info);
+
+ if (full_page_count != param->full_page_count)
+ _ma_check_print_error(param, "Full page count read through records was %s but we found %s pages while scanning table",
+ llstr(param->full_page_count, llbuff),
+ llstr(full_page_count, llbuff2));
+ if (tail_count != param->tail_count)
+ _ma_check_print_error(param, "Tail count read through records was %s but we found %s tails while scanning table",
+ llstr(param->tail_count, llbuff),
+ llstr(tail_count, llbuff2));
+
+ /* Update splits to avoid warning */
+ info->s->state.split= param->splits;
+ info->state->del= param->del_blocks;
+ return param->error_printed != 0;
+
+err:
+ _ma_scan_end_block_record(info);
+ return 1;
+}
+
+
+/* Check that record-link is ok */
+
+int maria_chk_data_link(HA_CHECK *param, MARIA_HA *info,int extend)
+{
+ int error;
+ uchar *record;
+ char llbuff[22],llbuff2[22],llbuff3[22];
+ DBUG_ENTER("maria_chk_data_link");
+
+ if (!(param->testflag & T_SILENT))
+ {
+ if (extend)
+ puts("- check records and index references");
+ else
+ puts("- check record links");
+ }
+
+ if (!(record= (uchar*) my_malloc(info->s->base.pack_reclength,MYF(0))))
+ {
+ _ma_check_print_error(param,"Not enough memory for record");
+ DBUG_RETURN(-1);
+ }
+ param->records= param->del_blocks= 0;
+ param->used= param->link_used= param->splits= param->del_length= 0;
+ param->tmp_record_checksum= param->glob_crc= 0;
+ param->err_count= 0;
+
+ error= 0;
+ param->empty= info->s->pack.header_length;
+
+ bzero((char*) param->tmp_key_crc,
+ info->s->base.keys * sizeof(param->tmp_key_crc[0]));
+
+ switch (info->s->data_file_type) {
+ case BLOCK_RECORD:
+ error= check_block_record(param, info, extend, record);
+ break;
+ case STATIC_RECORD:
+ error= check_static_record(param, info, extend, record);
+ break;
+ case DYNAMIC_RECORD:
+ error= check_dynamic_record(param, info, extend, record);
+ break;
+ case COMPRESSED_RECORD:
+ error= check_compressed_record(param, info, extend, record);
+ break;
+ } /* switch */
+
+ if (error)
+ goto err;
+
+ if (param->testflag & T_WRITE_LOOP)
+ {
+ VOID(fputs(" \r",stdout)); VOID(fflush(stdout));
+ }
+ if (param->records != info->state->records)
+ {
+ _ma_check_print_error(param,
+ "Record-count is not ok; found %-10s Should be: %s",
+ llstr(param->records,llbuff),
+ llstr(info->state->records,llbuff2));
+ error=1;
+ }
+ else if (param->record_checksum &&
+ param->record_checksum != param->tmp_record_checksum)
+ {
+ _ma_check_print_error(param,
+ "Key pointers and record positions doesn't match");
+ error=1;
+ }
+ else if (param->glob_crc != info->state->checksum &&
+ (info->s->options &
+ (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD)))
+ {
+ _ma_check_print_warning(param,
+ "Record checksum is not the same as checksum stored in the index file\n");
+ error=1;
+ }
+ else if (!extend)
+ {
+ uint key;
+ for (key=0 ; key < info->s->base.keys; key++)
+ {
+ if (param->tmp_key_crc[key] != param->key_crc[key] &&
+ !(info->s->keyinfo[key].flag & (HA_FULLTEXT | HA_SPATIAL)))
+ {
+ _ma_check_print_error(param,"Checksum for key: %2d doesn't match checksum for records",
+ key+1);
+ error=1;
+ }
+ }
+ }
+
+ if (param->del_length != info->state->empty)
+ {
+ _ma_check_print_warning(param,
+ "Found %s deleted space. Should be %s",
+ llstr(param->del_length,llbuff2),
+ llstr(info->state->empty,llbuff));
+ }
+ if (param->used + param->empty + param->del_length !=
+ info->state->data_file_length)
+ {
+ _ma_check_print_warning(param,
+ "Found %s record data and %s unused data and %s deleted data",
+ llstr(param->used, llbuff),
+ llstr(param->empty,llbuff2),
+ llstr(param->del_length,llbuff3));
+ _ma_check_print_warning(param,
+ "Total %s Should be: %s",
+ llstr((param->used+param->empty+param->del_length),
+ llbuff),
+ llstr(info->state->data_file_length,llbuff2));
+ }
+ if (param->del_blocks != info->state->del)
+ {
+ _ma_check_print_warning(param,
+ "Found %10s deleted blocks Should be: %s",
+ llstr(param->del_blocks,llbuff),
+ llstr(info->state->del,llbuff2));
+ }
+ if (param->splits != info->s->state.split)
+ {
+ _ma_check_print_warning(param,
+ "Found %10s parts Should be: %s parts",
+ llstr(param->splits, llbuff),
+ llstr(info->s->state.split,llbuff2));
+ }
+ if (param->testflag & T_INFO)
+ {
+ if (param->warning_printed || param->error_printed)
+ puts("");
+ if (param->used != 0 && ! param->error_printed)
+ {
+ if (param->records)
+ {
+ printf("Records:%18s M.recordlength:%9lu Packed:%14.0f%%\n",
+ llstr(param->records,llbuff),
+ (long)((param->used - param->link_used)/param->records),
+ (info->s->base.blobs ? 0.0 :
+ (ulonglong2double((ulonglong) info->s->base.reclength *
+ param->records)-
+ my_off_t2double(param->used))/
+ ulonglong2double((ulonglong) info->s->base.reclength *
+ param->records)*100.0));
+ printf("Recordspace used:%9.0f%% Empty space:%12d%% Blocks/Record: %6.2f\n",
+ (ulonglong2double(param->used - param->link_used)/
+ ulonglong2double(param->used-param->link_used+param->empty)*100.0),
+ (!param->records ? 100 :
+ (int) (ulonglong2double(param->del_length+param->empty)/
+ my_off_t2double(param->used)*100.0)),
+ ulonglong2double(param->splits - param->del_blocks) /
+ param->records);
+ }
+ else
+ printf("Records:%18s\n", "0");
+ }
+ printf("Record blocks:%12s Delete blocks:%10s\n",
+ llstr(param->splits - param->del_blocks, llbuff),
+ llstr(param->del_blocks, llbuff2));
+ printf("Record data: %12s Deleted data: %10s\n",
+ llstr(param->used - param->link_used,llbuff),
+ llstr(param->del_length, llbuff2));
+ printf("Lost space: %12s Linkdata: %10s\n",
+ llstr(param->empty, llbuff),llstr(param->link_used, llbuff2));
+ }
+ my_free((uchar*) record,MYF(0));
+ DBUG_RETURN (error);
+
+ err:
+ my_free((uchar*) record,MYF(0));
+ param->testflag|=T_RETRY_WITHOUT_QUICK;
+ DBUG_RETURN(1);
+} /* maria_chk_data_link */
+
+
+/*
+ Recover old table by reading each record and writing all keys
+
+ NOTES
+ Save new datafile-name in temp_filename.
+ We overwrite the index file as we go (writekeys() for example), so if we
+ crash during this the table is unusable and user (or Recovery in the
+ future) must repeat the REPAIR/OPTIMIZE operation. We could use a
+ temporary index file in the future (drawback: more disk space).
+
+ IMPLEMENTATION (for hard repair with block format)
+ - Create new, unrelated MARIA_HA of the table
+ - Create new datafile and associate it with new handler
+ - Reset all statistic information in new handler
+ - Copy all data to new handler with normal write operations
+ - Move state of new handler to old handler
+ - Close new handler
+ - Close data file in old handler
+ - Rename old data file to new data file.
+ - Reopen data file in old handler
+*/
+
+int maria_repair(HA_CHECK *param, register MARIA_HA *info,
+ char *name, int rep_quick)
+{
+ int error, got_error= 1;
+ uint i;
+ ha_rows start_records,new_header_length;
+ my_off_t del;
+ File new_file;
+ MARIA_SHARE *share=info->s;
+ char llbuff[22],llbuff2[22];
+ MARIA_SORT_INFO sort_info;
+ MARIA_SORT_PARAM sort_param;
+ my_bool block_record, scan_inited= 0;
+ enum data_file_type org_data_file_type= info->s->data_file_type;
+ myf sync_dir= ((share->now_transactional && !share->temporary) ?
+ MY_SYNC_DIR : 0);
+ DBUG_ENTER("maria_repair");
+
+ bzero((char *)&sort_info, sizeof(sort_info));
+ bzero((char *)&sort_param, sizeof(sort_param));
+ start_records=info->state->records;
+ new_header_length=(param->testflag & T_UNPACK) ? 0L :
+ share->pack.header_length;
+ new_file= -1;
+ sort_param.sort_info=&sort_info;
+ block_record= org_data_file_type == BLOCK_RECORD;
+ sort_info.info= sort_info.new_info= info;
+ bzero(&info->rec_cache,sizeof(info->rec_cache));
+
+ if (!(param->testflag & T_SILENT))
+ {
+ printf("- recovering (with keycache) MARIA-table '%s'\n",name);
+ printf("Data records: %s\n", llstr(info->state->records,llbuff));
+ }
+ param->testflag|=T_REP; /* for easy checking */
+
+ if (info->s->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))
+ param->testflag|=T_CALC_CHECKSUM;
+
+ /*
+ The physical size of the data file is sometimes used during repair (see
+ sort_info.filelength further below); we need to flush to have it exact.
+ We flush the state because our maria_open(HA_OPEN_COPY) will want to read
+ it from disk. Index file will be recreated.
+ */
+ if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+ FLUSH_FORCE_WRITE, FLUSH_IGNORE_CHANGED) ||
+ _ma_state_info_write(share, 1|2|4))
+ goto err;
+
+ if (!rep_quick)
+ {
+ /* Get real path for data file */
+ if ((new_file= my_create(fn_format(param->temp_filename,
+ share->data_file_name, "",
+ DATA_TMP_EXT, 2+4),
+ 0,param->tmpfile_createflag,
+ MYF(0))) < 0)
+ {
+ _ma_check_print_error(param,"Can't create new tempfile: '%s'",
+ param->temp_filename);
+ goto err;
+ }
+ if (new_header_length &&
+ maria_filecopy(param, new_file, info->dfile.file, 0L,
+ new_header_length, "datafile-header"))
+ goto err;
+ info->s->state.dellink= HA_OFFSET_ERROR;
+ info->rec_cache.file= new_file;
+ if (share->data_file_type == BLOCK_RECORD ||
+ ((param->testflag & T_UNPACK) &&
+ share->state.header.org_data_file_type == BLOCK_RECORD))
+ {
+ MARIA_HA *new_info;
+ /*
+ It's ok for Recovery to have two MARIA_SHARE on the same index file
+ because the one below is not transactional
+ */
+ if (!(sort_info.new_info= maria_open(info->s->open_file_name, O_RDWR,
+ HA_OPEN_COPY | HA_OPEN_FOR_REPAIR)))
+ goto err;
+ new_info= sort_info.new_info;
+ change_data_file_descriptor(new_info, new_file);
+ maria_lock_database(new_info, F_EXTRA_LCK);
+ if ((param->testflag & T_UNPACK) &&
+ share->data_file_type == COMPRESSED_RECORD)
+ {
+ (*new_info->s->once_end)(new_info->s);
+ (*new_info->s->end)(new_info);
+ restore_data_file_type(new_info->s);
+ _ma_setup_functions(new_info->s);
+ if ((*new_info->s->once_init)(new_info->s, new_file) ||
+ (*new_info->s->init)(new_info))
+ goto err;
+ }
+ _ma_reset_status(sort_info.new_info);
+ if (_ma_initialize_data_file(sort_info.new_info->s, new_file))
+ goto err;
+ block_record= 1;
+ }
+ }
+
+ if (org_data_file_type != BLOCK_RECORD)
+ {
+ /* We need a read buffer to read rows in big blocks */
+ if (init_io_cache(&param->read_cache, info->dfile.file,
+ (uint) param->read_buffer_length,
+ READ_CACHE, share->pack.header_length, 1, MYF(MY_WME)))
+ goto err;
+ }
+ if (sort_info.new_info->s->data_file_type != BLOCK_RECORD)
+ {
+ /* When writing to not block records, we need a write buffer */
+ if (!rep_quick)
+ if (init_io_cache(&info->rec_cache, new_file,
+ (uint) param->write_buffer_length,
+ WRITE_CACHE, new_header_length, 1,
+ MYF(MY_WME | MY_WAIT_IF_FULL)))
+ goto err;
+ info->opt_flag|=WRITE_CACHE_USED;
+ }
+ else
+ {
+ scan_inited= 1;
+ if (maria_scan_init(sort_info.info))
+ goto err;
+ }
+
+ if (!(sort_param.record=(uchar*) my_malloc((uint) share->base.pack_reclength,
+ MYF(0))) ||
+ _ma_alloc_buffer(&sort_param.rec_buff, &sort_param.rec_buff_size,
+ info->s->base.default_rec_buff_size))
+ {
+ _ma_check_print_error(param, "Not enough memory for extra record");
+ goto err;
+ }
+
+ sort_info.param = param;
+ sort_param.read_cache=param->read_cache;
+ sort_param.pos=sort_param.max_pos=share->pack.header_length;
+ sort_param.filepos=new_header_length;
+ param->read_cache.end_of_file=sort_info.filelength=
+ my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0));
+ sort_info.dupp=0;
+ sort_param.fix_datafile= (my_bool) (! rep_quick);
+ sort_param.master=1;
+ sort_info.max_records= ~(ha_rows) 0;
+
+ set_data_file_type(&sort_info, share);
+ del=info->state->del;
+ info->state->records=info->state->del=share->state.split=0;
+ info->state->empty=0;
+ param->glob_crc=0;
+ if (param->testflag & T_CALC_CHECKSUM)
+ sort_param.calc_checksum= 1;
+
+ info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+
+ /*
+ Clear all keys. Note that all key blocks allocated until now remain
+ "dead" parts of the key file. (Bug #4692)
+ */
+ for (i=0 ; i < info->s->base.keys ; i++)
+ share->state.key_root[i]= HA_OFFSET_ERROR;
+
+ /* Drop the delete chain. */
+ share->state.key_del= HA_OFFSET_ERROR;
+
+ /*
+ If requested, activate (enable) all keys in key_map. In this case,
+ all indexes will be (re-)built.
+ */
+ if (param->testflag & T_CREATE_MISSING_KEYS)
+ maria_set_all_keys_active(share->state.key_map, share->base.keys);
+
+ info->state->key_file_length=share->base.keystart;
+
+ maria_lock_memory(param); /* Everything is alloced */
+
+ sort_info.org_data_file_type= info->s->data_file_type;
+
+ /* Re-create all keys, which are set in key_map. */
+ while (!(error=sort_get_next_record(&sort_param)))
+ {
+ if (block_record && _ma_sort_write_record(&sort_param))
+ goto err;
+
+ if (writekeys(&sort_param))
+ {
+ if (my_errno != HA_ERR_FOUND_DUPP_KEY)
+ goto err;
+ DBUG_DUMP("record",(uchar*) sort_param.record,share->base.pack_reclength);
+ _ma_check_print_info(param,"Duplicate key %2d for record at %10s against new record at %10s",
+ info->errkey+1,
+ llstr(sort_param.start_recpos,llbuff),
+ llstr(info->dup_key_pos,llbuff2));
+ if (param->testflag & T_VERBOSE)
+ {
+ VOID(_ma_make_key(info,(uint) info->errkey,info->lastkey,
+ sort_param.record,0L));
+ _ma_print_key(stdout,share->keyinfo[info->errkey].seg,info->lastkey,
+ USE_WHOLE_KEY);
+ }
+ sort_info.dupp++;
+ if ((param->testflag & (T_FORCE_UNIQUENESS|T_QUICK)) == T_QUICK)
+ {
+ param->testflag|=T_RETRY_WITHOUT_QUICK;
+ param->error_printed=1;
+ goto err;
+ }
+ /* purecov: begin tested */
+ if (block_record)
+ {
+ sort_info.new_info->state->records--;
+ if ((*sort_info.new_info->s->write_record_abort)(sort_info.new_info))
+ {
+ _ma_check_print_error(param,"Couldn't delete duplicate row");
+ goto err;
+ }
+ continue;
+ }
+ /* purecov: end */
+ }
+ if (!block_record && _ma_sort_write_record(&sort_param))
+ goto err;
+ }
+ if (error > 0 || maria_write_data_suffix(&sort_info, (my_bool)!rep_quick) ||
+ flush_io_cache(&info->rec_cache) || param->read_cache.error < 0)
+ goto err;
+
+ if (param->testflag & T_WRITE_LOOP)
+ {
+ VOID(fputs(" \r",stdout)); VOID(fflush(stdout));
+ }
+ if (my_chsize(share->kfile.file, info->state->key_file_length, 0, MYF(0)))
+ {
+ _ma_check_print_warning(param,
+ "Can't change size of indexfile, error: %d",
+ my_errno);
+ goto err;
+ }
+
+ if (rep_quick && del+sort_info.dupp != info->state->del)
+ {
+ _ma_check_print_error(param,"Couldn't fix table with quick recovery: Found wrong number of deleted records");
+ _ma_check_print_error(param,"Run recovery again without -q");
+ param->retry_repair=1;
+ param->testflag|=T_RETRY_WITHOUT_QUICK;
+ goto err;
+ }
+
+ if (param->testflag & T_SAFE_REPAIR)
+ {
+ /* Don't repair if we loosed more than one row */
+ if (sort_info.new_info->state->records+1 < start_records)
+ {
+ info->state->records=start_records;
+ goto err;
+ }
+ }
+
+ if (!rep_quick)
+ {
+ if (sort_info.new_info != sort_info.info)
+ {
+ MARIA_STATE_INFO save_state= sort_info.new_info->s->state;
+ if (maria_close(sort_info.new_info))
+ {
+ _ma_check_print_error(param, "Got error %d on close", my_errno);
+ goto err;
+ }
+ copy_data_file_state(&info->s->state, &save_state);
+ new_file= -1;
+ }
+ else
+ info->state->data_file_length= sort_param.filepos;
+ share->state.version=(ulong) time((time_t*) 0); /* Force reopen */
+
+ /* Replace the actual file with the temporary file */
+ if (new_file >= 0)
+ my_close(new_file, MYF(MY_WME));
+ new_file= -1;
+ change_data_file_descriptor(info, -1);
+ if (maria_change_to_newfile(share->data_file_name,MARIA_NAME_DEXT,
+ DATA_TMP_EXT,
+ (param->testflag & T_BACKUP_DATA ?
+ MYF(MY_REDEL_MAKE_BACKUP): MYF(0)) |
+ sync_dir) ||
+ _ma_open_datafile(info, share, -1))
+ {
+ goto err;
+ }
+ }
+ else
+ {
+ info->state->data_file_length= sort_param.max_pos;
+ }
+ if (param->testflag & T_CALC_CHECKSUM)
+ info->state->checksum= param->glob_crc;
+
+ if (!(param->testflag & T_SILENT))
+ {
+ if (start_records != info->state->records)
+ printf("Data records: %s\n", llstr(info->state->records,llbuff));
+ if (sort_info.dupp)
+ _ma_check_print_warning(param,
+ "%s records have been removed",
+ llstr(sort_info.dupp,llbuff));
+ }
+
+ got_error= 0;
+ /* If invoked by external program that uses thr_lock */
+ if (&share->state.state != info->state)
+ memcpy( &share->state.state, info->state, sizeof(*info->state));
+
+err:
+ if (scan_inited)
+ maria_scan_end(sort_info.info);
+
+ VOID(end_io_cache(&param->read_cache));
+ info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+ /* this below could fail, shouldn't we detect error? */
+ VOID(end_io_cache(&info->rec_cache));
+ got_error|= _ma_flush_table_files_after_repair(param, info);
+ if (got_error)
+ {
+ if (! param->error_printed)
+ _ma_check_print_error(param,"%d for record at pos %s",my_errno,
+ llstr(sort_param.start_recpos,llbuff));
+ if (sort_info.new_info && sort_info.new_info != sort_info.info)
+ {
+ /**
+ @todo ASK_MONTY
+ grepping for "dfile.file="
+ shows several places (ma_check.c, ma_panic.c, ma_extra.c) where we
+ modify dfile.file without modifying share->bitmap.file.file; those
+ sound like bugs because the two variables are normally copies of each
+ other in BLOCK_RECORD (and in other record formats it does not hurt
+ to change the unused share->bitmap.file.file).
+ It does matter, because if we close dfile.file, set dfile.file to -1,
+ but leave bitmap.file.file to its positive value, maria_close() will
+ close a file which it is not allowed to (maybe even a file in another
+ engine or mysqld!).
+ */
+ sort_info.new_info->dfile.file= -1;
+ maria_close(sort_info.new_info);
+ }
+ if (new_file >= 0)
+ {
+ VOID(my_close(new_file,MYF(0)));
+ VOID(my_delete(param->temp_filename, MYF(MY_WME)));
+ info->rec_cache.file=-1; /* don't flush data to new_file, it's closed */
+ }
+ maria_mark_crashed_on_repair(info);
+ }
+ else if (sync_dir)
+ {
+ /*
+ Now that we have flushed and forced everything, we can bump
+ create_rename_lsn:
+ */
+ write_log_record_for_repair(param, info);
+ }
+ my_free(sort_param.rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+ my_free(sort_param.record,MYF(MY_ALLOW_ZERO_PTR));
+ my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR));
+ if (!got_error && (param->testflag & T_UNPACK))
+ restore_data_file_type(share);
+ share->state.changed|= (STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES |
+ STATE_NOT_ANALYZED);
+ share->state.changed&= ~STATE_NOT_OPTIMIZED_ROWS;
+ DBUG_RETURN(got_error);
+}
+
+
+/* Uppdate keyfile when doing repair */
+
+static int writekeys(MARIA_SORT_PARAM *sort_param)
+{
+ register uint i;
+ uchar *key;
+ MARIA_HA *info= sort_param->sort_info->info;
+ uchar *buff= sort_param->record;
+ my_off_t filepos= sort_param->filepos;
+ DBUG_ENTER("writekeys");
+
+ key= info->lastkey+info->s->base.max_key_length;
+ for (i=0 ; i < info->s->base.keys ; i++)
+ {
+ if (maria_is_key_active(info->s->state.key_map, i))
+ {
+ if (info->s->keyinfo[i].flag & HA_FULLTEXT )
+ {
+ if (_ma_ft_add(info,i,(char*) key,buff,filepos))
+ goto err;
+ }
+#ifdef HAVE_SPATIAL
+ else if (info->s->keyinfo[i].flag & HA_SPATIAL)
+ {
+ uint key_length= _ma_make_key(info,i,key,buff,filepos);
+ if (maria_rtree_insert(info, i, key, key_length))
+ goto err;
+ }
+#endif /*HAVE_SPATIAL*/
+ else
+ {
+ uint key_length= _ma_make_key(info,i,key,buff,filepos);
+ if (_ma_ck_write(info,i,key,key_length))
+ goto err;
+ }
+ }
+ }
+ DBUG_RETURN(0);
+
+ err:
+ if (my_errno == HA_ERR_FOUND_DUPP_KEY)
+ {
+ info->errkey=(int) i; /* This key was found */
+ while ( i-- > 0 )
+ {
+ if (maria_is_key_active(info->s->state.key_map, i))
+ {
+ if (info->s->keyinfo[i].flag & HA_FULLTEXT)
+ {
+ if (_ma_ft_del(info,i,(char*) key,buff,filepos))
+ break;
+ }
+ else
+ {
+ uint key_length= _ma_make_key(info,i,key,buff,filepos);
+ if (_ma_ck_delete(info,i,key,key_length))
+ break;
+ }
+ }
+ }
+ }
+ /* Remove checksum that was added to glob_crc in sort_get_next_record */
+ if (sort_param->calc_checksum)
+ sort_param->sort_info->param->glob_crc-= info->cur_row.checksum;
+ DBUG_PRINT("error",("errno: %d",my_errno));
+ DBUG_RETURN(-1);
+} /* writekeys */
+
+
+ /* Change all key-pointers that points to a records */
+
+int maria_movepoint(register MARIA_HA *info, uchar *record,
+ MARIA_RECORD_POS oldpos, MARIA_RECORD_POS newpos,
+ uint prot_key)
+{
+ register uint i;
+ uchar *key;
+ uint key_length;
+ DBUG_ENTER("maria_movepoint");
+
+ key= info->lastkey+info->s->base.max_key_length;
+ for (i=0 ; i < info->s->base.keys; i++)
+ {
+ if (i != prot_key && maria_is_key_active(info->s->state.key_map, i))
+ {
+ key_length= _ma_make_key(info,i,key,record,oldpos);
+ if (info->s->keyinfo[i].flag & HA_NOSAME)
+ { /* Change pointer direct */
+ uint nod_flag;
+ MARIA_KEYDEF *keyinfo;
+ keyinfo=info->s->keyinfo+i;
+ if (_ma_search(info,keyinfo,key,USE_WHOLE_KEY,
+ (uint) (SEARCH_SAME | SEARCH_SAVE_BUFF),
+ info->s->state.key_root[i]))
+ DBUG_RETURN(-1);
+ nod_flag=_ma_test_if_nod(info->buff);
+ _ma_dpointer(info,info->int_keypos-nod_flag-
+ info->s->rec_reflength,newpos);
+ if (_ma_write_keypage(info,keyinfo,info->last_keypage,
+ DFLT_INIT_HITS,info->buff))
+ DBUG_RETURN(-1);
+ }
+ else
+ { /* Change old key to new */
+ if (_ma_ck_delete(info,i,key,key_length))
+ DBUG_RETURN(-1);
+ key_length= _ma_make_key(info,i,key,record,newpos);
+ if (_ma_ck_write(info,i,key,key_length))
+ DBUG_RETURN(-1);
+ }
+ }
+ }
+ DBUG_RETURN(0);
+} /* maria_movepoint */
+
+
+ /* Tell system that we want all memory for our cache */
+
+void maria_lock_memory(HA_CHECK *param __attribute__((unused)))
+{
+#ifdef SUN_OS /* Key-cacheing thrases on sun 4.1 */
+ if (param->opt_maria_lock_memory)
+ {
+ int success = mlockall(MCL_CURRENT); /* or plock(DATLOCK); */
+ if (geteuid() == 0 && success != 0)
+ _ma_check_print_warning(param,
+ "Failed to lock memory. errno %d",my_errno);
+ }
+#endif
+} /* maria_lock_memory */
+
+
+/**
+ Flush all changed blocks to disk so that we can say "at the end of repair,
+ the table is fully ok on disk".
+
+ It is a requirement for transactional tables.
+ We release blocks as it's unlikely that they would all be needed soon.
+
+ @param param description of the repair operation
+ @param info table
+*/
+
+int _ma_flush_table_files_after_repair(HA_CHECK *param, MARIA_HA *info)
+{
+ MARIA_SHARE *share= info->s;
+ if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+ FLUSH_RELEASE, FLUSH_RELEASE) ||
+ _ma_state_info_write(share, 1|4) ||
+ (share->base.born_transactional && _ma_sync_table_files(info)))
+ {
+ _ma_check_print_error(param,"%d when trying to write bufferts",my_errno);
+ return 1;
+ }
+ return 0;
+} /* _ma_flush_table_files_after_repair */
+
+
+ /* Sort index for more efficent reads */
+
+int maria_sort_index(HA_CHECK *param, register MARIA_HA *info, char *name)
+{
+ reg2 uint key;
+ reg1 MARIA_KEYDEF *keyinfo;
+ File new_file;
+ my_off_t index_pos[HA_MAX_POSSIBLE_KEY];
+ uint r_locks,w_locks;
+ int old_lock;
+ MARIA_SHARE *share=info->s;
+ MARIA_STATE_INFO old_state;
+ myf sync_dir= (share->now_transactional && !share->temporary) ?
+ MY_SYNC_DIR : 0;
+ DBUG_ENTER("maria_sort_index");
+
+ /* cannot sort index files with R-tree indexes */
+ for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ;
+ key++,keyinfo++)
+ if (keyinfo->key_alg == HA_KEY_ALG_RTREE)
+ DBUG_RETURN(0);
+
+ if (!(param->testflag & T_SILENT))
+ printf("- Sorting index for MARIA-table '%s'\n",name);
+
+ /* Get real path for index file */
+ fn_format(param->temp_filename,name,"", MARIA_NAME_IEXT,2+4+32);
+ if ((new_file=my_create(fn_format(param->temp_filename,param->temp_filename,
+ "", INDEX_TMP_EXT,2+4),
+ 0,param->tmpfile_createflag,MYF(0))) <= 0)
+ {
+ _ma_check_print_error(param,"Can't create new tempfile: '%s'",
+ param->temp_filename);
+ DBUG_RETURN(-1);
+ }
+ if (maria_filecopy(param, new_file, share->kfile.file, 0L,
+ (ulong) share->base.keystart, "headerblock"))
+ goto err;
+
+ param->new_file_pos=share->base.keystart;
+ for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ;
+ key++,keyinfo++)
+ {
+ if (! maria_is_key_active(info->s->state.key_map, key))
+ continue;
+
+ if (share->state.key_root[key] != HA_OFFSET_ERROR)
+ {
+ index_pos[key]=param->new_file_pos; /* Write first block here */
+ if (sort_one_index(param,info,keyinfo,share->state.key_root[key],
+ new_file))
+ goto err;
+ }
+ else
+ index_pos[key]= HA_OFFSET_ERROR; /* No blocks */
+ }
+
+ /* Flush key cache for this file if we are calling this outside maria_chk */
+ flush_pagecache_blocks(share->pagecache, &share->kfile,
+ FLUSH_IGNORE_CHANGED);
+
+ share->state.version=(ulong) time((time_t*) 0);
+ old_state= share->state; /* save state if not stored */
+ r_locks= share->r_locks;
+ w_locks= share->w_locks;
+ old_lock= info->lock_type;
+
+ /* Put same locks as old file */
+ share->r_locks= share->w_locks= share->tot_locks= 0;
+ (void) _ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE);
+ pthread_mutex_lock(&share->intern_lock);
+ VOID(my_close(share->kfile.file, MYF(MY_WME)));
+ share->kfile.file = -1;
+ pthread_mutex_unlock(&share->intern_lock);
+ VOID(my_close(new_file,MYF(MY_WME)));
+ if (maria_change_to_newfile(share->index_file_name, MARIA_NAME_IEXT,
+ INDEX_TMP_EXT, sync_dir) ||
+ _ma_open_keyfile(share))
+ goto err2;
+ info->lock_type= F_UNLCK; /* Force maria_readinfo to lock */
+ _ma_readinfo(info,F_WRLCK,0); /* Will lock the table */
+ info->lock_type= old_lock;
+ share->r_locks= r_locks;
+ share->w_locks= w_locks;
+ share->tot_locks= r_locks+w_locks;
+ share->state= old_state; /* Restore old state */
+
+ info->state->key_file_length=param->new_file_pos;
+ info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+ for (key=0 ; key < info->s->base.keys ; key++)
+ info->s->state.key_root[key]=index_pos[key];
+ info->s->state.key_del= HA_OFFSET_ERROR;
+
+ info->s->state.changed&= ~STATE_NOT_SORTED_PAGES;
+ DBUG_RETURN(0);
+
+err:
+ VOID(my_close(new_file,MYF(MY_WME)));
+err2:
+ VOID(my_delete(param->temp_filename,MYF(MY_WME)));
+ DBUG_RETURN(-1);
+} /* maria_sort_index */
+
+
+ /* Sort records recursive using one index */
+
+static int sort_one_index(HA_CHECK *param, MARIA_HA *info,
+ MARIA_KEYDEF *keyinfo,
+ my_off_t pagepos, File new_file)
+{
+ uint length,nod_flag,used_length, key_length;
+ uchar *buff,*keypos,*endpos;
+ uchar key[HA_MAX_POSSIBLE_KEY_BUFF];
+ my_off_t new_page_pos,next_page;
+ char llbuff[22];
+ DBUG_ENTER("sort_one_index");
+
+ /* cannot walk over R-tree indices */
+ DBUG_ASSERT(keyinfo->key_alg != HA_KEY_ALG_RTREE);
+ new_page_pos=param->new_file_pos;
+ param->new_file_pos+=keyinfo->block_length;
+
+ if (!(buff= (uchar*) my_alloca((uint) keyinfo->block_length)))
+ {
+ _ma_check_print_error(param,"Not enough memory for key block");
+ DBUG_RETURN(-1);
+ }
+ if (!_ma_fetch_keypage(info,keyinfo,pagepos,DFLT_INIT_HITS,buff,0))
+ {
+ _ma_check_print_error(param,"Can't read key block from filepos: %s",
+ llstr(pagepos,llbuff));
+ goto err;
+ }
+ if ((nod_flag=_ma_test_if_nod(buff)) || keyinfo->flag & HA_FULLTEXT)
+ {
+ used_length= maria_data_on_page(buff);
+ keypos=buff+2+nod_flag;
+ endpos=buff+used_length;
+ for ( ;; )
+ {
+ if (nod_flag)
+ {
+ next_page= _ma_kpos(nod_flag,keypos);
+ /* Save new pos */
+ _ma_kpointer(info,keypos-nod_flag,param->new_file_pos);
+ if (sort_one_index(param,info,keyinfo,next_page,new_file))
+ {
+ DBUG_PRINT("error",
+ ("From page: %ld, keyoffset: %lu used_length: %d",
+ (ulong) pagepos, (ulong) (keypos - buff),
+ (int) used_length));
+ DBUG_DUMP("buff",(uchar*) buff,used_length);
+ goto err;
+ }
+ }
+ if (keypos >= endpos ||
+ (key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&keypos,key)) == 0)
+ break;
+ DBUG_ASSERT(keypos <= endpos);
+ if (keyinfo->flag & HA_FULLTEXT)
+ {
+ uint off;
+ int subkeys;
+ get_key_full_length_rdonly(off, key);
+ subkeys=ft_sintXkorr(key+off);
+ if (subkeys < 0)
+ {
+ next_page= _ma_dpos(info,0,key+key_length);
+ _ma_dpointer(info,keypos-nod_flag-info->s->rec_reflength,
+ param->new_file_pos); /* Save new pos */
+ if (sort_one_index(param,info,&info->s->ft2_keyinfo,
+ next_page,new_file))
+ goto err;
+ }
+ }
+ }
+ }
+
+ /* Fill block with zero and write it to the new index file */
+ length= maria_data_on_page(buff);
+ bzero((uchar*) buff+length,keyinfo->block_length-length);
+ if (my_pwrite(new_file,(uchar*) buff,(uint) keyinfo->block_length,
+ new_page_pos,MYF(MY_NABP | MY_WAIT_IF_FULL)))
+ {
+ _ma_check_print_error(param,"Can't write indexblock, error: %d",my_errno);
+ goto err;
+ }
+ my_afree((uchar*) buff);
+ DBUG_RETURN(0);
+err:
+ my_afree((uchar*) buff);
+ DBUG_RETURN(1);
+} /* sort_one_index */
+
+
+ /*
+ Let temporary file replace old file.
+ This assumes that the new file was created in the same
+ directory as given by realpath(filename).
+ This will ensure that any symlinks that are used will still work.
+ Copy stats from old file to new file, deletes orignal and
+ changes new file name to old file name
+ */
+
+int maria_change_to_newfile(const char * filename, const char * old_ext,
+ const char * new_ext, myf MyFlags)
+{
+ char old_filename[FN_REFLEN],new_filename[FN_REFLEN];
+#ifdef USE_RAID
+ if (raid_chunks)
+ return my_raid_redel(fn_format(old_filename,filename,"",old_ext,2+4),
+ fn_format(new_filename,filename,"",new_ext,2+4),
+ raid_chunks,
+ MYF(MY_WME | MY_LINK_WARNING | MyFlags));
+#endif
+ /* Get real path to filename */
+ (void) fn_format(old_filename,filename,"",old_ext,2+4+32);
+ return my_redel(old_filename,
+ fn_format(new_filename,old_filename,"",new_ext,2+4),
+ MYF(MY_WME | MY_LINK_WARNING | MyFlags));
+} /* maria_change_to_newfile */
+
+
+/* Copy a block between two files */
+
+int maria_filecopy(HA_CHECK *param, File to,File from,my_off_t start,
+ my_off_t length, const char *type)
+{
+ char tmp_buff[IO_SIZE],*buff;
+ ulong buff_length;
+ DBUG_ENTER("maria_filecopy");
+
+ buff_length=(ulong) min(param->write_buffer_length,length);
+ if (!(buff=my_malloc(buff_length,MYF(0))))
+ {
+ buff=tmp_buff; buff_length=IO_SIZE;
+ }
+
+ VOID(my_seek(from,start,MY_SEEK_SET,MYF(0)));
+ while (length > buff_length)
+ {
+ if (my_read(from,(uchar*) buff,buff_length,MYF(MY_NABP)) ||
+ my_write(to,(uchar*) buff,buff_length,param->myf_rw))
+ goto err;
+ length-= buff_length;
+ }
+ if (my_read(from,(uchar*) buff,(uint) length,MYF(MY_NABP)) ||
+ my_write(to,(uchar*) buff,(uint) length,param->myf_rw))
+ goto err;
+ if (buff != tmp_buff)
+ my_free(buff,MYF(0));
+ DBUG_RETURN(0);
+err:
+ if (buff != tmp_buff)
+ my_free(buff,MYF(0));
+ _ma_check_print_error(param,"Can't copy %s to tempfile, error %d",
+ type,my_errno);
+ DBUG_RETURN(1);
+}
+
+
+/*
+ Repair table or given index using sorting
+
+ SYNOPSIS
+ maria_repair_by_sort()
+ param Repair parameters
+ info MARIA handler to repair
+ name Name of table (for warnings)
+ rep_quick set to <> 0 if we should not change data file
+
+ RESULT
+ 0 ok
+ <>0 Error
+*/
+
+int maria_repair_by_sort(HA_CHECK *param, register MARIA_HA *info,
+ const char * name, int rep_quick)
+{
+ int got_error;
+ uint i;
+ ulong length;
+ ha_rows start_records;
+ my_off_t new_header_length, org_header_length, del;
+ File new_file;
+ MARIA_SORT_PARAM sort_param;
+ MARIA_SHARE *share=info->s;
+ HA_KEYSEG *keyseg;
+ ulong *rec_per_key_part;
+ char llbuff[22];
+ MARIA_SORT_INFO sort_info;
+ ulonglong key_map=share->state.key_map;
+ myf sync_dir= ((share->now_transactional && !share->temporary) ?
+ MY_SYNC_DIR : 0);
+ DBUG_ENTER("maria_repair_by_sort");
+
+ bzero((char*)&sort_info,sizeof(sort_info));
+ bzero((char *)&sort_param, sizeof(sort_param));
+
+ start_records=info->state->records;
+ got_error=1;
+ new_file= -1;
+ org_header_length= share->pack.header_length;
+ new_header_length= (param->testflag & T_UNPACK) ? 0 : org_header_length;
+
+ if (!(param->testflag & T_SILENT))
+ {
+ printf("- recovering (with sort) MARIA-table '%s'\n",name);
+ printf("Data records: %s\n", llstr(start_records,llbuff));
+ }
+ param->testflag|=T_REP; /* for easy checking */
+
+ if (info->s->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))
+ param->testflag|=T_CALC_CHECKSUM;
+
+ if (_ma_flush_table_files(info, MARIA_FLUSH_DATA, FLUSH_FORCE_WRITE,
+ FLUSH_KEEP))
+ goto err;
+
+ if (!(sort_info.key_block=
+ alloc_key_blocks(param,
+ (uint) param->sort_key_blocks,
+ share->base.max_key_block_length)) ||
+ init_io_cache(&param->read_cache, info->dfile.file,
+ (uint) param->read_buffer_length,
+ READ_CACHE, org_header_length, 1, MYF(MY_WME)) ||
+ (! rep_quick &&
+ init_io_cache(&info->rec_cache, info->dfile.file,
+ (uint) param->write_buffer_length,
+ WRITE_CACHE,new_header_length,1,
+ MYF(MY_WME | MY_WAIT_IF_FULL) & param->myf_rw)))
+ goto err;
+ sort_info.key_block_end=sort_info.key_block+param->sort_key_blocks;
+ info->opt_flag|=WRITE_CACHE_USED;
+ info->rec_cache.file= info->dfile.file; /* for sort_delete_record */
+ sort_info.org_data_file_type= info->s->data_file_type;
+
+ if (!(sort_param.record=(uchar*) my_malloc((uint) share->base.pack_reclength,
+ MYF(0))) ||
+ _ma_alloc_buffer(&sort_param.rec_buff, &sort_param.rec_buff_size,
+ info->s->base.default_rec_buff_size))
+ {
+ _ma_check_print_error(param, "Not enough memory for extra record");
+ goto err;
+ }
+ if (!rep_quick)
+ {
+ /* Get real path for data file */
+ if ((new_file=my_create(fn_format(param->temp_filename,
+ share->data_file_name, "",
+ DATA_TMP_EXT, 2+4),
+ 0,param->tmpfile_createflag,
+ MYF(0))) < 0)
+ {
+ _ma_check_print_error(param,"Can't create new tempfile: '%s'",
+ param->temp_filename);
+ goto err;
+ }
+ if (new_header_length &&
+ maria_filecopy(param, new_file, info->dfile.file, 0L,
+ new_header_length, "datafile-header"))
+ goto err;
+ if (param->testflag & T_UNPACK)
+ restore_data_file_type(share);
+ share->state.dellink= HA_OFFSET_ERROR;
+ info->rec_cache.file=new_file;
+ }
+
+ info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+ if (!(param->testflag & T_CREATE_MISSING_KEYS))
+ {
+ /*
+ Flush key cache for this file if we are calling this outside
+ maria_chk
+ */
+ flush_pagecache_blocks(share->pagecache, &share->kfile,
+ FLUSH_IGNORE_CHANGED);
+ /* Clear the pointers to the given rows */
+ for (i=0 ; i < share->base.keys ; i++)
+ share->state.key_root[i]= HA_OFFSET_ERROR;
+ share->state.key_del= HA_OFFSET_ERROR;
+ info->state->key_file_length=share->base.keystart;
+ }
+ else
+ {
+ if (flush_pagecache_blocks(share->pagecache, &share->kfile,
+ FLUSH_FORCE_WRITE))
+ goto err;
+ key_map= ~key_map; /* Create the missing keys */
+ }
+
+ sort_info.info= sort_info.new_info= info;
+ sort_info.param= param;
+
+ set_data_file_type(&sort_info, share);
+ sort_param.filepos=new_header_length;
+ sort_info.dupp=0;
+ sort_info.buff=0;
+ param->read_cache.end_of_file=sort_info.filelength=
+ my_seek(param->read_cache.file,0L,MY_SEEK_END,MYF(0));
+
+ sort_param.wordlist=NULL;
+ init_alloc_root(&sort_param.wordroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0);
+
+ if (sort_info.org_data_file_type == DYNAMIC_RECORD)
+ length=max(share->base.min_pack_length+1,share->base.min_block_length);
+ else if (sort_info.org_data_file_type == COMPRESSED_RECORD)
+ length=share->base.min_block_length;
+ else
+ length=share->base.pack_reclength;
+ sort_info.max_records=
+ ((param->testflag & T_CREATE_MISSING_KEYS) ? info->state->records :
+ (ha_rows) (sort_info.filelength/length+1));
+ sort_param.key_cmp=sort_key_cmp;
+ sort_param.lock_in_memory=maria_lock_memory;
+ sort_param.tmpdir=param->tmpdir;
+ sort_param.sort_info=&sort_info;
+ sort_param.fix_datafile= (my_bool) (! rep_quick);
+ sort_param.master =1;
+
+ del=info->state->del;
+ param->glob_crc=0;
+ if (param->testflag & T_CALC_CHECKSUM)
+ sort_param.calc_checksum= 1;
+
+ rec_per_key_part= param->rec_per_key_part;
+ for (sort_param.key=0 ; sort_param.key < share->base.keys ;
+ rec_per_key_part+=sort_param.keyinfo->keysegs, sort_param.key++)
+ {
+ sort_param.read_cache=param->read_cache;
+ sort_param.keyinfo=share->keyinfo+sort_param.key;
+ sort_param.seg=sort_param.keyinfo->seg;
+ if (! maria_is_key_active(key_map, sort_param.key))
+ {
+ /* Remember old statistics for key */
+ memcpy((char*) rec_per_key_part,
+ (char*) (share->state.rec_per_key_part +
+ (uint) (rec_per_key_part - param->rec_per_key_part)),
+ sort_param.keyinfo->keysegs*sizeof(*rec_per_key_part));
+ continue;
+ }
+
+ if ((!(param->testflag & T_SILENT)))
+ printf ("- Fixing index %d\n",sort_param.key+1);
+ sort_param.max_pos= sort_param.pos= org_header_length;
+ keyseg=sort_param.seg;
+ bzero((char*) sort_param.unique,sizeof(sort_param.unique));
+ sort_param.key_length=share->rec_reflength;
+ for (i=0 ; keyseg[i].type != HA_KEYTYPE_END; i++)
+ {
+ sort_param.key_length+=keyseg[i].length;
+ if (keyseg[i].flag & HA_SPACE_PACK)
+ sort_param.key_length+=get_pack_length(keyseg[i].length);
+ if (keyseg[i].flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART))
+ sort_param.key_length+=2 + test(keyseg[i].length >= 127);
+ if (keyseg[i].flag & HA_NULL_PART)
+ sort_param.key_length++;
+ }
+ info->state->records=info->state->del=share->state.split=0;
+ info->state->empty=0;
+
+ if (sort_param.keyinfo->flag & HA_FULLTEXT)
+ {
+ uint ft_max_word_len_for_sort=FT_MAX_WORD_LEN_FOR_SORT*
+ sort_param.keyinfo->seg->charset->mbmaxlen;
+ sort_param.key_length+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN;
+ /*
+ fulltext indexes may have much more entries than the
+ number of rows in the table. We estimate the number here.
+
+ Note, built-in parser is always nr. 0 - see ftparser_call_initializer()
+ */
+ if (sort_param.keyinfo->ftparser_nr == 0)
+ {
+ /*
+ for built-in parser the number of generated index entries
+ cannot be larger than the size of the data file divided
+ by the minimal word's length
+ */
+ sort_info.max_records=
+ (ha_rows) (sort_info.filelength/ft_min_word_len+1);
+ }
+ else
+ {
+ /*
+ for external plugin parser we cannot tell anything at all :(
+ so, we'll use all the sort memory and start from ~10 buffpeks.
+ (see _create_index_by_sort)
+ */
+ sort_info.max_records=
+ 10*param->sort_buffer_length/sort_param.key_length;
+ }
+
+ sort_param.key_read= sort_maria_ft_key_read;
+ sort_param.key_write= sort_maria_ft_key_write;
+ }
+ else
+ {
+ sort_param.key_read= sort_key_read;
+ sort_param.key_write= sort_key_write;
+ }
+
+ if (_ma_create_index_by_sort(&sort_param,
+ (my_bool) (!(param->testflag & T_VERBOSE)),
+ (uint) param->sort_buffer_length))
+ {
+ param->retry_repair=1;
+ goto err;
+ }
+ /* No need to calculate checksum again. */
+ sort_param.calc_checksum= 0;
+ free_root(&sort_param.wordroot, MYF(0));
+
+ /* Set for next loop */
+ sort_info.max_records= (ha_rows) info->state->records;
+
+ if (param->testflag & T_STATISTICS)
+ maria_update_key_parts(sort_param.keyinfo, rec_per_key_part, sort_param.unique,
+ param->stats_method == MI_STATS_METHOD_IGNORE_NULLS?
+ sort_param.notnull: NULL,(ulonglong) info->state->records);
+ maria_set_key_active(share->state.key_map, sort_param.key);
+
+ if (sort_param.fix_datafile)
+ {
+ param->read_cache.end_of_file=sort_param.filepos;
+ if (maria_write_data_suffix(&sort_info,1) || end_io_cache(&info->rec_cache))
+ goto err;
+ if (param->testflag & T_SAFE_REPAIR)
+ {
+ /* Don't repair if we loosed more than one row */
+ if (info->state->records+1 < start_records)
+ {
+ info->state->records=start_records;
+ goto err;
+ }
+ }
+ share->state.state.data_file_length = info->state->data_file_length=
+ sort_param.filepos;
+ /* Only whole records */
+ share->state.version=(ulong) time((time_t*) 0);
+ my_close(info->dfile.file, MYF(0));
+ info->dfile.file= new_file;
+ share->data_file_type= sort_info.new_data_file_type;
+ org_header_length= (ulong) new_header_length;
+ sort_info.org_data_file_type= info->s->data_file_type;
+ sort_param.fix_datafile=0;
+ }
+ else
+ info->state->data_file_length=sort_param.max_pos;
+
+ param->read_cache.file= info->dfile.file; /* re-init read cache */
+ reinit_io_cache(&param->read_cache,READ_CACHE,share->pack.header_length,
+ 1,1);
+ }
+
+ if (param->testflag & T_WRITE_LOOP)
+ {
+ VOID(fputs(" \r",stdout)); VOID(fflush(stdout));
+ }
+
+ if (rep_quick && del+sort_info.dupp != info->state->del)
+ {
+ _ma_check_print_error(param,"Couldn't fix table with quick recovery: Found wrong number of deleted records");
+ _ma_check_print_error(param,"Run recovery again without -q");
+ got_error=1;
+ param->retry_repair=1;
+ param->testflag|=T_RETRY_WITHOUT_QUICK;
+ goto err;
+ }
+
+ if (rep_quick & T_FORCE_UNIQUENESS)
+ {
+ my_off_t skr= (info->state->data_file_length +
+ (sort_info.org_data_file_type == COMPRESSED_RECORD) ?
+ MEMMAP_EXTRA_MARGIN : 0);
+#ifdef USE_RELOC
+ if (sort_info.org_data_file_type == STATIC_RECORD &&
+ skr < share->base.reloc*share->base.min_pack_length)
+ skr=share->base.reloc*share->base.min_pack_length;
+#endif
+ if (skr != sort_info.filelength)
+ if (my_chsize(info->dfile.file, skr, 0, MYF(0)))
+ _ma_check_print_warning(param,
+ "Can't change size of datafile, error: %d",
+ my_errno);
+ }
+ if (param->testflag & T_CALC_CHECKSUM)
+ info->state->checksum=param->glob_crc;
+
+ if (my_chsize(share->kfile.file, info->state->key_file_length, 0, MYF(0)))
+ _ma_check_print_warning(param,
+ "Can't change size of indexfile, error: %d",
+ my_errno);
+
+ if (!(param->testflag & T_SILENT))
+ {
+ if (start_records != info->state->records)
+ printf("Data records: %s\n", llstr(info->state->records,llbuff));
+ if (sort_info.dupp)
+ _ma_check_print_warning(param,
+ "%s records have been removed",
+ llstr(sort_info.dupp,llbuff));
+ }
+ got_error=0;
+
+ if (&share->state.state != info->state)
+ memcpy( &share->state.state, info->state, sizeof(*info->state));
+
+err:
+ VOID(end_io_cache(&info->rec_cache));
+ VOID(end_io_cache(&param->read_cache));
+ info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+ got_error|= _ma_flush_table_files_after_repair(param, info);
+ if (!got_error)
+ {
+ /* Replace the actual file with the temporary file */
+ if (new_file >= 0)
+ {
+ my_close(new_file,MYF(0));
+ info->dfile.file= new_file= -1;
+ if (maria_change_to_newfile(share->data_file_name,MARIA_NAME_DEXT,
+ DATA_TMP_EXT,
+ MYF((param->testflag & T_BACKUP_DATA ?
+ MY_REDEL_MAKE_BACKUP : 0) |
+ sync_dir)) ||
+ _ma_open_datafile(info,share,-1))
+ got_error=1;
+ }
+ }
+ if (got_error)
+ {
+ if (! param->error_printed)
+ _ma_check_print_error(param,"%d when fixing table",my_errno);
+ if (new_file >= 0)
+ {
+ VOID(my_close(new_file,MYF(0)));
+ VOID(my_delete(param->temp_filename, MYF(MY_WME)));
+ if (info->dfile.file == new_file)
+ info->dfile.file= -1;
+ }
+ maria_mark_crashed_on_repair(info);
+ }
+ else if (key_map == share->state.key_map)
+ share->state.changed&= ~STATE_NOT_OPTIMIZED_KEYS;
+ share->state.changed|= STATE_NOT_SORTED_PAGES;
+ share->state.changed&= ~STATE_NOT_OPTIMIZED_ROWS;
+
+ my_free(sort_param.rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+ my_free(sort_param.record,MYF(MY_ALLOW_ZERO_PTR));
+ my_free((uchar*) sort_info.key_block,MYF(MY_ALLOW_ZERO_PTR));
+ my_free((uchar*) sort_info.ft_buf, MYF(MY_ALLOW_ZERO_PTR));
+ my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR));
+ if (!got_error && (param->testflag & T_UNPACK))
+ restore_data_file_type(share);
+ DBUG_RETURN(got_error);
+}
+
+/*
+ Threaded repair of table using sorting
+
+ SYNOPSIS
+ maria_repair_parallel()
+ param Repair parameters
+ info MARIA handler to repair
+ name Name of table (for warnings)
+ rep_quick set to <> 0 if we should not change data file
+
+ DESCRIPTION
+ Same as maria_repair_by_sort but do it multithreaded
+ Each key is handled by a separate thread.
+ TODO: make a number of threads a parameter
+
+ In parallel repair we use one thread per index. There are two modes:
+
+ Quick
+
+ Only the indexes are rebuilt. All threads share a read buffer.
+ Every thread that needs fresh data in the buffer enters the shared
+ cache lock. The last thread joining the lock reads the buffer from
+ the data file and wakes all other threads.
+
+ Non-quick
+
+ The data file is rebuilt and all indexes are rebuilt to point to
+ the new record positions. One thread is the master thread. It
+ reads from the old data file and writes to the new data file. It
+ also creates one of the indexes. The other threads read from a
+ buffer which is filled by the master. If they need fresh data,
+ they enter the shared cache lock. If the masters write buffer is
+ full, it flushes it to the new data file and enters the shared
+ cache lock too. When all threads joined in the lock, the master
+ copies its write buffer to the read buffer for the other threads
+ and wakes them.
+
+ RESULT
+ 0 ok
+ <>0 Error
+*/
+
+int maria_repair_parallel(HA_CHECK *param, register MARIA_HA *info,
+ const char * name, int rep_quick)
+{
+#ifndef THREAD
+ return maria_repair_by_sort(param, info, name, rep_quick);
+#else
+ int got_error;
+ uint i,key, total_key_length, istep;
+ ulong rec_length;
+ ha_rows start_records;
+ my_off_t new_header_length,del;
+ File new_file;
+ MARIA_SORT_PARAM *sort_param=0;
+ MARIA_SHARE *share=info->s;
+ ulong *rec_per_key_part;
+ HA_KEYSEG *keyseg;
+ char llbuff[22];
+ IO_CACHE new_data_cache; /* For non-quick repair. */
+ IO_CACHE_SHARE io_share;
+ MARIA_SORT_INFO sort_info;
+ ulonglong key_map=share->state.key_map;
+ pthread_attr_t thr_attr;
+ myf sync_dir= (share->now_transactional && !share->temporary) ?
+ MY_SYNC_DIR : 0;
+ DBUG_ENTER("maria_repair_parallel");
+
+ start_records=info->state->records;
+ got_error=1;
+ new_file= -1;
+ new_header_length=(param->testflag & T_UNPACK) ? 0 :
+ share->pack.header_length;
+ if (!(param->testflag & T_SILENT))
+ {
+ printf("- parallel recovering (with sort) MARIA-table '%s'\n",name);
+ printf("Data records: %s\n", llstr(start_records,llbuff));
+ }
+ param->testflag|=T_REP; /* for easy checking */
+
+ if (info->s->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))
+ param->testflag|=T_CALC_CHECKSUM;
+
+ if (_ma_flush_table_files(info, MARIA_FLUSH_DATA, FLUSH_FORCE_WRITE,
+ FLUSH_KEEP))
+ goto err;
+
+ /*
+ Quick repair (not touching data file, rebuilding indexes):
+ {
+ Read cache is (MI_CHECK *param)->read_cache using info->dfile.file.
+ }
+
+ Non-quick repair (rebuilding data file and indexes):
+ {
+ Master thread:
+
+ Read cache is (MI_CHECK *param)->read_cache using info->dfile.file.
+ Write cache is (MI_INFO *info)->rec_cache using new_file.
+
+ Slave threads:
+
+ Read cache is new_data_cache synced to master rec_cache.
+
+ The final assignment of the filedescriptor for rec_cache is done
+ after the cache creation.
+
+ Don't check file size on new_data_cache, as the resulting file size
+ is not known yet.
+
+ As rec_cache and new_data_cache are synced, write_buffer_length is
+ used for the read cache 'new_data_cache'. Both start at the same
+ position 'new_header_length'.
+ }
+ */
+ DBUG_PRINT("info", ("is quick repair: %d", rep_quick));
+ bzero((char*)&sort_info,sizeof(sort_info));
+ /* Initialize pthread structures before goto err. */
+ pthread_mutex_init(&sort_info.mutex, MY_MUTEX_INIT_FAST);
+ pthread_cond_init(&sort_info.cond, 0);
+
+ sort_info.org_data_file_type= info->s->data_file_type;
+
+ if (!(sort_info.key_block=
+ alloc_key_blocks(param, (uint) param->sort_key_blocks,
+ share->base.max_key_block_length)) ||
+ init_io_cache(&param->read_cache, info->dfile.file,
+ (uint) param->read_buffer_length,
+ READ_CACHE, share->pack.header_length, 1, MYF(MY_WME)) ||
+ (!rep_quick &&
+ (init_io_cache(&info->rec_cache, info->dfile.file,
+ (uint) param->write_buffer_length,
+ WRITE_CACHE, new_header_length, 1,
+ MYF(MY_WME | MY_WAIT_IF_FULL) & param->myf_rw) ||
+ init_io_cache(&new_data_cache, -1,
+ (uint) param->write_buffer_length,
+ READ_CACHE, new_header_length, 1,
+ MYF(MY_WME | MY_DONT_CHECK_FILESIZE)))))
+ goto err;
+ sort_info.key_block_end=sort_info.key_block+param->sort_key_blocks;
+ info->opt_flag|=WRITE_CACHE_USED;
+ info->rec_cache.file= info->dfile.file; /* for sort_delete_record */
+
+ if (!rep_quick)
+ {
+ /* Get real path for data file */
+ if ((new_file= my_create(fn_format(param->temp_filename,
+ share->data_file_name, "",
+ DATA_TMP_EXT,
+ 2+4),
+ 0,param->tmpfile_createflag,
+ MYF(0))) < 0)
+ {
+ _ma_check_print_error(param,"Can't create new tempfile: '%s'",
+ param->temp_filename);
+ goto err;
+ }
+ if (new_header_length &&
+ maria_filecopy(param, new_file, info->dfile.file,0L,new_header_length,
+ "datafile-header"))
+ goto err;
+ if (param->testflag & T_UNPACK)
+ restore_data_file_type(share);
+ share->state.dellink= HA_OFFSET_ERROR;
+ info->rec_cache.file=new_file;
+ }
+
+ info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+ if (!(param->testflag & T_CREATE_MISSING_KEYS))
+ {
+ /*
+ Flush key cache for this file if we are calling this outside
+ maria_chk
+ */
+ flush_pagecache_blocks(share->pagecache, &share->kfile,
+ FLUSH_IGNORE_CHANGED);
+ /* Clear the pointers to the given rows */
+ for (i=0 ; i < share->base.keys ; i++)
+ share->state.key_root[i]= HA_OFFSET_ERROR;
+ share->state.key_del= HA_OFFSET_ERROR;
+ info->state->key_file_length=share->base.keystart;
+ }
+ else
+ {
+ if (flush_pagecache_blocks(share->pagecache, &share->kfile,
+ FLUSH_FORCE_WRITE))
+ goto err;
+ key_map= ~key_map; /* Create the missing keys */
+ }
+
+ sort_info.info= sort_info.new_info= info;
+ sort_info.param= param;
+
+ set_data_file_type(&sort_info, share);
+ sort_info.dupp=0;
+ sort_info.buff=0;
+ param->read_cache.end_of_file=sort_info.filelength=
+ my_seek(param->read_cache.file,0L,MY_SEEK_END,MYF(0));
+
+ if (sort_info.org_data_file_type == DYNAMIC_RECORD)
+ rec_length=max(share->base.min_pack_length+1,share->base.min_block_length);
+ else if (sort_info.org_data_file_type == COMPRESSED_RECORD)
+ rec_length=share->base.min_block_length;
+ else
+ rec_length=share->base.pack_reclength;
+ /*
+ +1 below is required hack for parallel repair mode.
+ The info->state->records value, that is compared later
+ to sort_info.max_records and cannot exceed it, is
+ increased in sort_key_write. In maria_repair_by_sort, sort_key_write
+ is called after sort_key_read, where the comparison is performed,
+ but in parallel mode master thread can call sort_key_write
+ before some other repair thread calls sort_key_read.
+ Furthermore I'm not even sure +1 would be enough.
+ May be sort_info.max_records shold be always set to max value in
+ parallel mode.
+ */
+ sort_info.max_records=
+ ((param->testflag & T_CREATE_MISSING_KEYS) ? info->state->records + 1:
+ (ha_rows) (sort_info.filelength/rec_length+1));
+
+ del=info->state->del;
+ param->glob_crc=0;
+
+ if (!(sort_param=(MARIA_SORT_PARAM *)
+ my_malloc((uint) share->base.keys *
+ (sizeof(MARIA_SORT_PARAM) + share->base.pack_reclength),
+ MYF(MY_ZEROFILL))))
+ {
+ _ma_check_print_error(param,"Not enough memory for key!");
+ goto err;
+ }
+ total_key_length=0;
+ rec_per_key_part= param->rec_per_key_part;
+ info->state->records=info->state->del=share->state.split=0;
+ info->state->empty=0;
+
+ for (i=key=0, istep=1 ; key < share->base.keys ;
+ rec_per_key_part+=sort_param[i].keyinfo->keysegs, i+=istep, key++)
+ {
+ sort_param[i].key=key;
+ sort_param[i].keyinfo=share->keyinfo+key;
+ sort_param[i].seg=sort_param[i].keyinfo->seg;
+ if (! maria_is_key_active(key_map, key))
+ {
+ /* Remember old statistics for key */
+ memcpy((char*) rec_per_key_part,
+ (char*) (share->state.rec_per_key_part+
+ (uint) (rec_per_key_part - param->rec_per_key_part)),
+ sort_param[i].keyinfo->keysegs*sizeof(*rec_per_key_part));
+ istep=0;
+ continue;
+ }
+ istep=1;
+ if ((!(param->testflag & T_SILENT)))
+ printf ("- Fixing index %d\n",key+1);
+ if (sort_param[i].keyinfo->flag & HA_FULLTEXT)
+ {
+ sort_param[i].key_read=sort_maria_ft_key_read;
+ sort_param[i].key_write=sort_maria_ft_key_write;
+ }
+ else
+ {
+ sort_param[i].key_read=sort_key_read;
+ sort_param[i].key_write=sort_key_write;
+ }
+ sort_param[i].key_cmp=sort_key_cmp;
+ sort_param[i].lock_in_memory=maria_lock_memory;
+ sort_param[i].tmpdir=param->tmpdir;
+ sort_param[i].sort_info=&sort_info;
+ sort_param[i].master=0;
+ sort_param[i].fix_datafile=0;
+ sort_param[i].calc_checksum= 0;
+
+ sort_param[i].filepos=new_header_length;
+ sort_param[i].max_pos=sort_param[i].pos=share->pack.header_length;
+
+ sort_param[i].record= (((char *)(sort_param+share->base.keys))+
+ (share->base.pack_reclength * i));
+ if (_ma_alloc_buffer(&sort_param[i].rec_buff, &sort_param[i].rec_buff_size,
+ share->base.default_rec_buff_size))
+ {
+ _ma_check_print_error(param,"Not enough memory!");
+ goto err;
+ }
+ sort_param[i].key_length=share->rec_reflength;
+ for (keyseg=sort_param[i].seg; keyseg->type != HA_KEYTYPE_END;
+ keyseg++)
+ {
+ sort_param[i].key_length+=keyseg->length;
+ if (keyseg->flag & HA_SPACE_PACK)
+ sort_param[i].key_length+=get_pack_length(keyseg->length);
+ if (keyseg->flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART))
+ sort_param[i].key_length+=2 + test(keyseg->length >= 127);
+ if (keyseg->flag & HA_NULL_PART)
+ sort_param[i].key_length++;
+ }
+ total_key_length+=sort_param[i].key_length;
+
+ if (sort_param[i].keyinfo->flag & HA_FULLTEXT)
+ {
+ uint ft_max_word_len_for_sort=FT_MAX_WORD_LEN_FOR_SORT*
+ sort_param[i].keyinfo->seg->charset->mbmaxlen;
+ sort_param[i].key_length+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN;
+ init_alloc_root(&sort_param[i].wordroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0);
+ }
+ }
+ sort_info.total_keys=i;
+ sort_param[0].master= 1;
+ sort_param[0].fix_datafile= (my_bool)(! rep_quick);
+ sort_param[0].calc_checksum= test(param->testflag & T_CALC_CHECKSUM);
+
+ sort_info.got_error=0;
+ pthread_mutex_lock(&sort_info.mutex);
+
+ /*
+ Initialize the I/O cache share for use with the read caches and, in
+ case of non-quick repair, the write cache. When all threads join on
+ the cache lock, the writer copies the write cache contents to the
+ read caches.
+ */
+ if (i > 1)
+ {
+ if (rep_quick)
+ init_io_cache_share(&param->read_cache, &io_share, NULL, i);
+ else
+ init_io_cache_share(&new_data_cache, &io_share, &info->rec_cache, i);
+ }
+ else
+ io_share.total_threads= 0; /* share not used */
+
+ (void) pthread_attr_init(&thr_attr);
+ (void) pthread_attr_setdetachstate(&thr_attr,PTHREAD_CREATE_DETACHED);
+
+ for (i=0 ; i < sort_info.total_keys ; i++)
+ {
+ /*
+ Copy the properly initialized IO_CACHE structure so that every
+ thread has its own copy. In quick mode param->read_cache is shared
+ for use by all threads. In non-quick mode all threads but the
+ first copy the shared new_data_cache, which is synchronized to the
+ write cache of the first thread. The first thread copies
+ param->read_cache, which is not shared.
+ */
+ sort_param[i].read_cache= ((rep_quick || !i) ? param->read_cache :
+ new_data_cache);
+ DBUG_PRINT("io_cache_share", ("thread: %u read_cache: 0x%lx",
+ i, (long) &sort_param[i].read_cache));
+
+ /*
+ two approaches: the same amount of memory for each thread
+ or the memory for the same number of keys for each thread...
+ In the second one all the threads will fill their sort_buffers
+ (and call write_keys) at the same time, putting more stress on i/o.
+ */
+ sort_param[i].sortbuff_size=
+#ifndef USING_SECOND_APPROACH
+ param->sort_buffer_length/sort_info.total_keys;
+#else
+ param->sort_buffer_length*sort_param[i].key_length/total_key_length;
+#endif
+ if (pthread_create(&sort_param[i].thr, &thr_attr,
+ _ma_thr_find_all_keys,
+ (void *) (sort_param+i)))
+ {
+ _ma_check_print_error(param,"Cannot start a repair thread");
+ /* Cleanup: Detach from the share. Avoid others to be blocked. */
+ if (io_share.total_threads)
+ remove_io_thread(&sort_param[i].read_cache);
+ DBUG_PRINT("error", ("Cannot start a repair thread"));
+ sort_info.got_error=1;
+ }
+ else
+ sort_info.threads_running++;
+ }
+ (void) pthread_attr_destroy(&thr_attr);
+
+ /* waiting for all threads to finish */
+ while (sort_info.threads_running)
+ pthread_cond_wait(&sort_info.cond, &sort_info.mutex);
+ pthread_mutex_unlock(&sort_info.mutex);
+
+ if ((got_error= _ma_thr_write_keys(sort_param)))
+ {
+ param->retry_repair=1;
+ goto err;
+ }
+ got_error=1; /* Assume the following may go wrong */
+
+ if (sort_param[0].fix_datafile)
+ {
+ /*
+ Append some nuls to the end of a memory mapped file. Destroy the
+ write cache. The master thread did already detach from the share
+ by remove_io_thread() in sort.c:thr_find_all_keys().
+ */
+ if (maria_write_data_suffix(&sort_info,1) || end_io_cache(&info->rec_cache))
+ goto err;
+ if (param->testflag & T_SAFE_REPAIR)
+ {
+ /* Don't repair if we loosed more than one row */
+ if (info->state->records+1 < start_records)
+ {
+ info->state->records=start_records;
+ goto err;
+ }
+ }
+ share->state.state.data_file_length= info->state->data_file_length=
+ sort_param->filepos;
+ /* Only whole records */
+ share->state.version=(ulong) time((time_t*) 0);
+ /*
+ Exchange the data file descriptor of the table, so that we use the
+ new file from now on.
+ */
+ my_close(info->dfile.file, MYF(0));
+ info->dfile.file= new_file;
+ share->pack.header_length=(ulong) new_header_length;
+ }
+ else
+ info->state->data_file_length=sort_param->max_pos;
+
+ if (rep_quick && del+sort_info.dupp != info->state->del)
+ {
+ _ma_check_print_error(param,"Couldn't fix table with quick recovery: Found wrong number of deleted records");
+ _ma_check_print_error(param,"Run recovery again without -q");
+ param->retry_repair=1;
+ param->testflag|=T_RETRY_WITHOUT_QUICK;
+ goto err;
+ }
+
+ if (rep_quick & T_FORCE_UNIQUENESS)
+ {
+ my_off_t skr= (info->state->data_file_length +
+ (sort_info.org_data_file_type == COMPRESSED_RECORD) ?
+ MEMMAP_EXTRA_MARGIN : 0);
+#ifdef USE_RELOC
+ if (sort_info.org_data_file_type == STATIC_RECORD &&
+ skr < share->base.reloc*share->base.min_pack_length)
+ skr=share->base.reloc*share->base.min_pack_length;
+#endif
+ if (skr != sort_info.filelength)
+ if (my_chsize(info->dfile.file, skr, 0, MYF(0)))
+ _ma_check_print_warning(param,
+ "Can't change size of datafile, error: %d",
+ my_errno);
+ }
+ if (param->testflag & T_CALC_CHECKSUM)
+ info->state->checksum=param->glob_crc;
+
+ if (my_chsize(share->kfile.file, info->state->key_file_length, 0, MYF(0)))
+ _ma_check_print_warning(param,
+ "Can't change size of indexfile, error: %d", my_errno);
+
+ if (!(param->testflag & T_SILENT))
+ {
+ if (start_records != info->state->records)
+ printf("Data records: %s\n", llstr(info->state->records,llbuff));
+ if (sort_info.dupp)
+ _ma_check_print_warning(param,
+ "%s records have been removed",
+ llstr(sort_info.dupp,llbuff));
+ }
+ got_error=0;
+
+ if (&share->state.state != info->state)
+ memcpy(&share->state.state, info->state, sizeof(*info->state));
+
+err:
+ /*
+ Destroy the write cache. The master thread did already detach from
+ the share by remove_io_thread() or it was not yet started (if the
+ error happend before creating the thread).
+ */
+ VOID(end_io_cache(&info->rec_cache));
+ VOID(end_io_cache(&param->read_cache));
+ info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+ /*
+ Destroy the new data cache in case of non-quick repair. All slave
+ threads did either detach from the share by remove_io_thread()
+ already or they were not yet started (if the error happend before
+ creating the threads).
+ */
+ if (!rep_quick)
+ VOID(end_io_cache(&new_data_cache));
+ got_error|= _ma_flush_table_files_after_repair(param, info);
+ if (!got_error)
+ {
+ /* Replace the actual file with the temporary file */
+ if (new_file >= 0)
+ {
+ my_close(new_file,MYF(0));
+ info->dfile.file= new_file= -1;
+ if (maria_change_to_newfile(share->data_file_name,MARIA_NAME_DEXT,
+ DATA_TMP_EXT,
+ MYF((param->testflag & T_BACKUP_DATA ?
+ MY_REDEL_MAKE_BACKUP : 0) |
+ sync_dir)) ||
+ _ma_open_datafile(info,share,-1))
+ got_error=1;
+ }
+ }
+ if (got_error)
+ {
+ if (! param->error_printed)
+ _ma_check_print_error(param,"%d when fixing table",my_errno);
+ if (new_file >= 0)
+ {
+ VOID(my_close(new_file,MYF(0)));
+ VOID(my_delete(param->temp_filename, MYF(MY_WME)));
+ if (info->dfile.file == new_file)
+ info->dfile.file= -1;
+ }
+ maria_mark_crashed_on_repair(info);
+ }
+ else if (key_map == share->state.key_map)
+ share->state.changed&= ~STATE_NOT_OPTIMIZED_KEYS;
+ share->state.changed|= STATE_NOT_SORTED_PAGES;
+ share->state.changed&= ~STATE_NOT_OPTIMIZED_ROWS;
+
+ pthread_cond_destroy (&sort_info.cond);
+ pthread_mutex_destroy(&sort_info.mutex);
+
+ my_free((uchar*) sort_info.ft_buf, MYF(MY_ALLOW_ZERO_PTR));
+ my_free((uchar*) sort_info.key_block,MYF(MY_ALLOW_ZERO_PTR));
+ my_free((uchar*) sort_param,MYF(MY_ALLOW_ZERO_PTR));
+ my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR));
+ if (!got_error && (param->testflag & T_UNPACK))
+ restore_data_file_type(share);
+ DBUG_RETURN(got_error);
+#endif /* THREAD */
+}
+
+ /* Read next record and return next key */
+
+static int sort_key_read(MARIA_SORT_PARAM *sort_param, uchar *key)
+{
+ int error;
+ MARIA_SORT_INFO *sort_info= sort_param->sort_info;
+ MARIA_HA *info= sort_info->info;
+ DBUG_ENTER("sort_key_read");
+
+ if ((error=sort_get_next_record(sort_param)))
+ DBUG_RETURN(error);
+ if (info->state->records == sort_info->max_records)
+ {
+ _ma_check_print_error(sort_info->param,
+ "Key %d - Found too many records; Can't continue",
+ sort_param->key+1);
+ DBUG_RETURN(1);
+ }
+ sort_param->real_key_length=
+ (info->s->rec_reflength+
+ _ma_make_key(info, sort_param->key, key,
+ sort_param->record, sort_param->filepos));
+#ifdef HAVE_purify
+ bzero(key+sort_param->real_key_length,
+ (sort_param->key_length-sort_param->real_key_length));
+#endif
+ DBUG_RETURN(_ma_sort_write_record(sort_param));
+} /* sort_key_read */
+
+
+static int sort_maria_ft_key_read(MARIA_SORT_PARAM *sort_param, uchar *key)
+{
+ int error;
+ MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+ MARIA_HA *info=sort_info->info;
+ FT_WORD *wptr=0;
+ DBUG_ENTER("sort_maria_ft_key_read");
+
+ if (!sort_param->wordlist)
+ {
+ for (;;)
+ {
+ free_root(&sort_param->wordroot, MYF(MY_MARK_BLOCKS_FREE));
+ if ((error=sort_get_next_record(sort_param)))
+ DBUG_RETURN(error);
+ if (!(wptr= _ma_ft_parserecord(info,sort_param->key,sort_param->record,
+ &sort_param->wordroot)))
+
+ DBUG_RETURN(1);
+ if (wptr->pos)
+ break;
+ error=_ma_sort_write_record(sort_param);
+ }
+ sort_param->wordptr=sort_param->wordlist=wptr;
+ }
+ else
+ {
+ error=0;
+ wptr=(FT_WORD*)(sort_param->wordptr);
+ }
+
+ sort_param->real_key_length=(info->s->rec_reflength+
+ _ma_ft_make_key(info, sort_param->key,
+ key, wptr++,
+ sort_param->filepos));
+#ifdef HAVE_purify
+ if (sort_param->key_length > sort_param->real_key_length)
+ bzero(key+sort_param->real_key_length,
+ (sort_param->key_length-sort_param->real_key_length));
+#endif
+ if (!wptr->pos)
+ {
+ free_root(&sort_param->wordroot, MYF(MY_MARK_BLOCKS_FREE));
+ sort_param->wordlist=0;
+ error=_ma_sort_write_record(sort_param);
+ }
+ else
+ sort_param->wordptr=(void*)wptr;
+
+ DBUG_RETURN(error);
+} /* sort_maria_ft_key_read */
+
+
+/*
+ Read next record from file using parameters in sort_info.
+
+ SYNOPSIS
+ sort_get_next_record()
+ sort_param Information about and for the sort process
+
+ NOTES
+ Dynamic Records With Non-Quick Parallel Repair
+
+ For non-quick parallel repair we use a synchronized read/write
+ cache. This means that one thread is the master who fixes the data
+ file by reading each record from the old data file and writing it
+ to the new data file. By doing this the records in the new data
+ file are written contiguously. Whenever the write buffer is full,
+ it is copied to the read buffer. The slaves read from the read
+ buffer, which is not associated with a file. Thus read_cache.file
+ is -1. When using _mi_read_cache(), the slaves must always set
+ flag to READING_NEXT so that the function never tries to read from
+ file. This is safe because the records are contiguous. There is no
+ need to read outside the cache. This condition is evaluated in the
+ variable 'parallel_flag' for quick reference. read_cache.file must
+ be >= 0 in every other case.
+
+ RETURN
+ -1 end of file
+ 0 ok
+ sort_param->filepos points to record position.
+ sort_param->record contains record
+ > 0 error
+*/
+
+static int sort_get_next_record(MARIA_SORT_PARAM *sort_param)
+{
+ int searching;
+ int parallel_flag;
+ uint found_record,b_type,left_length;
+ my_off_t pos;
+ MARIA_BLOCK_INFO block_info;
+ MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+ HA_CHECK *param=sort_info->param;
+ MARIA_HA *info=sort_info->info;
+ MARIA_SHARE *share=info->s;
+ char llbuff[22],llbuff2[22];
+ DBUG_ENTER("sort_get_next_record");
+
+ if (*_ma_killed_ptr(param))
+ DBUG_RETURN(1);
+
+ switch (sort_info->org_data_file_type) {
+ case BLOCK_RECORD:
+ {
+ for (;;)
+ {
+ int flag;
+
+ if (info != sort_info->new_info)
+ {
+ /* Safe scanning */
+ flag= _ma_safe_scan_block_record(sort_info, info,
+ sort_param->record);
+ }
+ else
+ {
+ /*
+ Scan on clean table.
+ It requires a reliable data_file_length so we set it.
+ */
+ info->state->data_file_length= sort_info->filelength;
+ flag= _ma_scan_block_record(info, sort_param->record,
+ info->cur_row.nextpos, 1);
+ }
+ if (!flag)
+ {
+ if (sort_param->calc_checksum)
+ {
+ ha_checksum checksum;
+ checksum= (*info->s->calc_check_checksum)(info, sort_param->record);
+ if (info->s->calc_checksum &&
+ info->cur_row.checksum != (checksum & 255))
+ {
+ if (param->testflag & T_VERBOSE)
+ {
+ char llbuff[22];
+ record_pos_to_txt(info, info->cur_row.lastpos, llbuff);
+ _ma_check_print_info(param,
+ "Found record with wrong checksum at %s",
+ llbuff);
+ }
+ continue;
+ }
+ info->cur_row.checksum= checksum;
+ param->glob_crc+= checksum;
+ }
+ sort_param->start_recpos= sort_param->filepos= info->cur_row.lastpos;
+ DBUG_RETURN(0);
+ }
+ if (flag == HA_ERR_END_OF_FILE)
+ {
+ sort_param->max_pos= sort_info->filelength;
+ DBUG_RETURN(-1);
+ }
+ /* Retry only if wrong record, not if disk error */
+ if (flag != HA_ERR_WRONG_IN_RECORD)
+ DBUG_RETURN(flag);
+ }
+ break;
+ }
+ case STATIC_RECORD:
+ for (;;)
+ {
+ if (my_b_read(&sort_param->read_cache,sort_param->record,
+ share->base.pack_reclength))
+ {
+ if (sort_param->read_cache.error)
+ param->out_flag |= O_DATA_LOST;
+ param->retry_repair=1;
+ param->testflag|=T_RETRY_WITHOUT_QUICK;
+ DBUG_RETURN(-1);
+ }
+ sort_param->start_recpos=sort_param->pos;
+ if (!sort_param->fix_datafile)
+ {
+ sort_param->filepos=sort_param->pos;
+ if (sort_param->master)
+ share->state.split++;
+ }
+ sort_param->max_pos=(sort_param->pos+=share->base.pack_reclength);
+ if (*sort_param->record)
+ {
+ if (sort_param->calc_checksum)
+ param->glob_crc+= (info->cur_row.checksum=
+ _ma_static_checksum(info,sort_param->record));
+ DBUG_RETURN(0);
+ }
+ if (!sort_param->fix_datafile && sort_param->master)
+ {
+ info->state->del++;
+ info->state->empty+=share->base.pack_reclength;
+ }
+ }
+ case DYNAMIC_RECORD:
+ {
+ uchar *to;
+ LINT_INIT(to);
+ ha_checksum checksum= 0;
+
+ pos=sort_param->pos;
+ searching=(sort_param->fix_datafile && (param->testflag & T_EXTEND));
+ parallel_flag= (sort_param->read_cache.file < 0) ? READING_NEXT : 0;
+ for (;;)
+ {
+ found_record=block_info.second_read= 0;
+ left_length=1;
+ if (searching)
+ {
+ pos=MY_ALIGN(pos,MARIA_DYN_ALIGN_SIZE);
+ param->testflag|=T_RETRY_WITHOUT_QUICK;
+ sort_param->start_recpos=pos;
+ }
+ do
+ {
+ if (pos > sort_param->max_pos)
+ sort_param->max_pos=pos;
+ if (pos & (MARIA_DYN_ALIGN_SIZE-1))
+ {
+ if ((param->testflag & T_VERBOSE) || searching == 0)
+ _ma_check_print_info(param,"Wrong aligned block at %s",
+ llstr(pos,llbuff));
+ if (searching)
+ goto try_next;
+ }
+ if (found_record && pos == param->search_after_block)
+ _ma_check_print_info(param,"Block: %s used by record at %s",
+ llstr(param->search_after_block,llbuff),
+ llstr(sort_param->start_recpos,llbuff2));
+ if (_ma_read_cache(&sort_param->read_cache,
+ (uchar*) block_info.header,pos,
+ MARIA_BLOCK_INFO_HEADER_LENGTH,
+ (! found_record ? READING_NEXT : 0) |
+ parallel_flag | READING_HEADER))
+ {
+ if (found_record)
+ {
+ _ma_check_print_info(param,
+ "Can't read whole record at %s (errno: %d)",
+ llstr(sort_param->start_recpos,llbuff),errno);
+ goto try_next;
+ }
+ DBUG_RETURN(-1);
+ }
+ if (searching && ! sort_param->fix_datafile)
+ {
+ param->error_printed=1;
+ param->retry_repair=1;
+ param->testflag|=T_RETRY_WITHOUT_QUICK;
+ DBUG_RETURN(1); /* Something wrong with data */
+ }
+ b_type= _ma_get_block_info(&block_info,-1,pos);
+ if ((b_type & (BLOCK_ERROR | BLOCK_FATAL_ERROR)) ||
+ ((b_type & BLOCK_FIRST) &&
+ (block_info.rec_len < (uint) share->base.min_pack_length ||
+ block_info.rec_len > (uint) share->base.max_pack_length)))
+ {
+ uint i;
+ if (param->testflag & T_VERBOSE || searching == 0)
+ _ma_check_print_info(param,
+ "Wrong bytesec: %3d-%3d-%3d at %10s; Skipped",
+ block_info.header[0],block_info.header[1],
+ block_info.header[2],llstr(pos,llbuff));
+ if (found_record)
+ goto try_next;
+ block_info.second_read=0;
+ searching=1;
+ /* Search after block in read header string */
+ for (i=MARIA_DYN_ALIGN_SIZE ;
+ i < MARIA_BLOCK_INFO_HEADER_LENGTH ;
+ i+= MARIA_DYN_ALIGN_SIZE)
+ if (block_info.header[i] >= 1 &&
+ block_info.header[i] <= MARIA_MAX_DYN_HEADER_BYTE)
+ break;
+ pos+=(ulong) i;
+ sort_param->start_recpos=pos;
+ continue;
+ }
+ if (b_type & BLOCK_DELETED)
+ {
+ bool error=0;
+ if (block_info.block_len+ (uint) (block_info.filepos-pos) <
+ share->base.min_block_length)
+ {
+ if (!searching)
+ _ma_check_print_info(param,
+ "Deleted block with impossible length %u at %s",
+ block_info.block_len,llstr(pos,llbuff));
+ error=1;
+ }
+ else
+ {
+ if ((block_info.next_filepos != HA_OFFSET_ERROR &&
+ block_info.next_filepos >=
+ info->state->data_file_length) ||
+ (block_info.prev_filepos != HA_OFFSET_ERROR &&
+ block_info.prev_filepos >= info->state->data_file_length))
+ {
+ if (!searching)
+ _ma_check_print_info(param,
+ "Delete link points outside datafile at %s",
+ llstr(pos,llbuff));
+ error=1;
+ }
+ }
+ if (error)
+ {
+ if (found_record)
+ goto try_next;
+ searching=1;
+ pos+= MARIA_DYN_ALIGN_SIZE;
+ sort_param->start_recpos=pos;
+ block_info.second_read=0;
+ continue;
+ }
+ }
+ else
+ {
+ if (block_info.block_len+ (uint) (block_info.filepos-pos) <
+ share->base.min_block_length ||
+ block_info.block_len > (uint) share->base.max_pack_length+
+ MARIA_SPLIT_LENGTH)
+ {
+ if (!searching)
+ _ma_check_print_info(param,
+ "Found block with impossible length %u at %s; Skipped",
+ block_info.block_len+ (uint) (block_info.filepos-pos),
+ llstr(pos,llbuff));
+ if (found_record)
+ goto try_next;
+ searching=1;
+ pos+= MARIA_DYN_ALIGN_SIZE;
+ sort_param->start_recpos=pos;
+ block_info.second_read=0;
+ continue;
+ }
+ }
+ if (b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR))
+ {
+ if (!sort_param->fix_datafile && sort_param->master &&
+ (b_type & BLOCK_DELETED))
+ {
+ info->state->empty+=block_info.block_len;
+ info->state->del++;
+ share->state.split++;
+ }
+ if (found_record)
+ goto try_next;
+ if (searching)
+ {
+ pos+=MARIA_DYN_ALIGN_SIZE;
+ sort_param->start_recpos=pos;
+ }
+ else
+ pos=block_info.filepos+block_info.block_len;
+ block_info.second_read=0;
+ continue;
+ }
+
+ if (!sort_param->fix_datafile && sort_param->master)
+ share->state.split++;
+ if (! found_record++)
+ {
+ sort_param->find_length=left_length=block_info.rec_len;
+ sort_param->start_recpos=pos;
+ if (!sort_param->fix_datafile)
+ sort_param->filepos=sort_param->start_recpos;
+ if (sort_param->fix_datafile && (param->testflag & T_EXTEND))
+ sort_param->pos=block_info.filepos+1;
+ else
+ sort_param->pos=block_info.filepos+block_info.block_len;
+ if (share->base.blobs)
+ {
+ if (_ma_alloc_buffer(&sort_param->rec_buff,
+ &sort_param->rec_buff_size,
+ block_info.rec_len +
+ info->s->base.extra_rec_buff_size))
+
+ {
+ if (param->max_record_length >= block_info.rec_len)
+ {
+ _ma_check_print_error(param,"Not enough memory for blob at %s (need %lu)",
+ llstr(sort_param->start_recpos,llbuff),
+ (ulong) block_info.rec_len);
+ DBUG_RETURN(1);
+ }
+ else
+ {
+ _ma_check_print_info(param,"Not enough memory for blob at %s (need %lu); Row skipped",
+ llstr(sort_param->start_recpos,llbuff),
+ (ulong) block_info.rec_len);
+ goto try_next;
+ }
+ }
+ }
+ to= sort_param->rec_buff;
+ }
+ if (left_length < block_info.data_len || ! block_info.data_len)
+ {
+ _ma_check_print_info(param,
+ "Found block with too small length at %s; "
+ "Skipped",
+ llstr(sort_param->start_recpos,llbuff));
+ goto try_next;
+ }
+ if (block_info.filepos + block_info.data_len >
+ sort_param->read_cache.end_of_file)
+ {
+ _ma_check_print_info(param,
+ "Found block that points outside data file "
+ "at %s",
+ llstr(sort_param->start_recpos,llbuff));
+ goto try_next;
+ }
+ /*
+ Copy information that is already read. Avoid accessing data
+ below the cache start. This could happen if the header
+ streched over the end of the previous buffer contents.
+ */
+ {
+ uint header_len= (uint) (block_info.filepos - pos);
+ uint prefetch_len= (MARIA_BLOCK_INFO_HEADER_LENGTH - header_len);
+
+ if (prefetch_len > block_info.data_len)
+ prefetch_len= block_info.data_len;
+ if (prefetch_len)
+ {
+ memcpy(to, block_info.header + header_len, prefetch_len);
+ block_info.filepos+= prefetch_len;
+ block_info.data_len-= prefetch_len;
+ left_length-= prefetch_len;
+ to+= prefetch_len;
+ }
+ }
+ if (block_info.data_len &&
+ _ma_read_cache(&sort_param->read_cache,to,block_info.filepos,
+ block_info.data_len,
+ (found_record == 1 ? READING_NEXT : 0) |
+ parallel_flag))
+ {
+ _ma_check_print_info(param,
+ "Read error for block at: %s (error: %d); Skipped",
+ llstr(block_info.filepos,llbuff),my_errno);
+ goto try_next;
+ }
+ left_length-=block_info.data_len;
+ to+=block_info.data_len;
+ pos=block_info.next_filepos;
+ if (pos == HA_OFFSET_ERROR && left_length)
+ {
+ _ma_check_print_info(param,"Wrong block with wrong total length starting at %s",
+ llstr(sort_param->start_recpos,llbuff));
+ goto try_next;
+ }
+ if (pos + MARIA_BLOCK_INFO_HEADER_LENGTH > sort_param->read_cache.end_of_file)
+ {
+ _ma_check_print_info(param,"Found link that points at %s (outside data file) at %s",
+ llstr(pos,llbuff2),
+ llstr(sort_param->start_recpos,llbuff));
+ goto try_next;
+ }
+ } while (left_length);
+
+ if (_ma_rec_unpack(info,sort_param->record,sort_param->rec_buff,
+ sort_param->find_length) != MY_FILE_ERROR)
+ {
+ if (sort_param->read_cache.error < 0)
+ DBUG_RETURN(1);
+ if (sort_param->calc_checksum)
+ checksum= (info->s->calc_check_checksum)(info, sort_param->record);
+ if ((param->testflag & (T_EXTEND | T_REP)) || searching)
+ {
+ if (_ma_rec_check(info, sort_param->record, sort_param->rec_buff,
+ sort_param->find_length,
+ (param->testflag & T_QUICK) &&
+ sort_param->calc_checksum &&
+ test(info->s->calc_checksum), checksum))
+ {
+ _ma_check_print_info(param,"Found wrong packed record at %s",
+ llstr(sort_param->start_recpos,llbuff));
+ goto try_next;
+ }
+ }
+ if (sort_param->calc_checksum)
+ param->glob_crc+= checksum;
+ DBUG_RETURN(0);
+ }
+ if (!searching)
+ _ma_check_print_info(param,"Key %d - Found wrong stored record at %s",
+ sort_param->key+1,
+ llstr(sort_param->start_recpos,llbuff));
+ try_next:
+ pos=(sort_param->start_recpos+=MARIA_DYN_ALIGN_SIZE);
+ searching=1;
+ }
+ }
+ case COMPRESSED_RECORD:
+ for (searching=0 ;; searching=1, sort_param->pos++)
+ {
+ if (_ma_read_cache(&sort_param->read_cache,(uchar*) block_info.header,
+ sort_param->pos,
+ share->pack.ref_length,READING_NEXT))
+ DBUG_RETURN(-1);
+ if (searching && ! sort_param->fix_datafile)
+ {
+ param->error_printed=1;
+ param->retry_repair=1;
+ param->testflag|=T_RETRY_WITHOUT_QUICK;
+ DBUG_RETURN(1); /* Something wrong with data */
+ }
+ sort_param->start_recpos=sort_param->pos;
+ if (_ma_pack_get_block_info(info, &sort_param->bit_buff, &block_info,
+ &sort_param->rec_buff,
+ &sort_param->rec_buff_size, -1,
+ sort_param->pos))
+ DBUG_RETURN(-1);
+ if (!block_info.rec_len &&
+ sort_param->pos + MEMMAP_EXTRA_MARGIN ==
+ sort_param->read_cache.end_of_file)
+ DBUG_RETURN(-1);
+ if (block_info.rec_len < (uint) share->min_pack_length ||
+ block_info.rec_len > (uint) share->max_pack_length)
+ {
+ if (! searching)
+ _ma_check_print_info(param,"Found block with wrong recordlength: %d at %s\n",
+ block_info.rec_len,
+ llstr(sort_param->pos,llbuff));
+ continue;
+ }
+ if (_ma_read_cache(&sort_param->read_cache,(uchar*) sort_param->rec_buff,
+ block_info.filepos, block_info.rec_len,
+ READING_NEXT))
+ {
+ if (! searching)
+ _ma_check_print_info(param,"Couldn't read whole record from %s",
+ llstr(sort_param->pos,llbuff));
+ continue;
+ }
+ if (_ma_pack_rec_unpack(info, &sort_param->bit_buff, sort_param->record,
+ sort_param->rec_buff, block_info.rec_len))
+ {
+ if (! searching)
+ _ma_check_print_info(param,"Found wrong record at %s",
+ llstr(sort_param->pos,llbuff));
+ continue;
+ }
+ if (!sort_param->fix_datafile)
+ {
+ sort_param->filepos=sort_param->pos;
+ if (sort_param->master)
+ share->state.split++;
+ }
+ sort_param->max_pos=(sort_param->pos=block_info.filepos+
+ block_info.rec_len);
+ info->packed_length=block_info.rec_len;
+
+ if (sort_param->calc_checksum)
+ {
+ info->cur_row.checksum= (*info->s->calc_check_checksum)(info,
+ sort_param->
+ record);
+ param->glob_crc+= info->cur_row.checksum;
+ }
+ DBUG_RETURN(0);
+ }
+ }
+ DBUG_RETURN(1); /* Impossible */
+}
+
+
+/*
+ Write record to new file.
+
+ SYNOPSIS
+ _ma_sort_write_record()
+ sort_param Sort parameters.
+
+ NOTE
+ This is only called by a master thread if parallel repair is used.
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+int _ma_sort_write_record(MARIA_SORT_PARAM *sort_param)
+{
+ int flag;
+ uint length;
+ ulong block_length,reclength;
+ uchar *from;
+ uchar block_buff[8];
+ MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+ HA_CHECK *param= sort_info->param;
+ MARIA_HA *info= sort_info->new_info;
+ MARIA_SHARE *share=info->s;
+ DBUG_ENTER("_ma_sort_write_record");
+
+ if (sort_param->fix_datafile)
+ {
+ switch (sort_info->new_data_file_type) {
+ case BLOCK_RECORD:
+ if ((sort_param->filepos= (*share->write_record_init)(info,
+ sort_param->
+ record)) ==
+ HA_OFFSET_ERROR)
+ DBUG_RETURN(1);
+ break;
+ case STATIC_RECORD:
+ if (my_b_write(&info->rec_cache,sort_param->record,
+ share->base.pack_reclength))
+ {
+ _ma_check_print_error(param,"%d when writing to datafile",my_errno);
+ DBUG_RETURN(1);
+ }
+ sort_param->filepos+=share->base.pack_reclength;
+ info->s->state.split++;
+ break;
+ case DYNAMIC_RECORD:
+ if (! info->blobs)
+ from=sort_param->rec_buff;
+ else
+ {
+ /* must be sure that local buffer is big enough */
+ reclength=info->s->base.pack_reclength+
+ _ma_calc_total_blob_length(info,sort_param->record)+
+ ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+
+ MARIA_DYN_DELETE_BLOCK_HEADER;
+ if (sort_info->buff_length < reclength)
+ {
+ if (!(sort_info->buff=my_realloc(sort_info->buff, (uint) reclength,
+ MYF(MY_FREE_ON_ERROR |
+ MY_ALLOW_ZERO_PTR))))
+ DBUG_RETURN(1);
+ sort_info->buff_length=reclength;
+ }
+ from=sort_info->buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER);
+ }
+ /* We can use info->checksum here as only one thread calls this */
+ info->cur_row.checksum= (*info->s->calc_check_checksum)(info,
+ sort_param->
+ record);
+ reclength= _ma_rec_pack(info,from,sort_param->record);
+ flag=0;
+
+ do
+ {
+ block_length=reclength+ 3 + test(reclength >= (65520-3));
+ if (block_length < share->base.min_block_length)
+ block_length=share->base.min_block_length;
+ info->update|=HA_STATE_WRITE_AT_END;
+ block_length=MY_ALIGN(block_length,MARIA_DYN_ALIGN_SIZE);
+ if (block_length > MARIA_MAX_BLOCK_LENGTH)
+ block_length=MARIA_MAX_BLOCK_LENGTH;
+ if (_ma_write_part_record(info,0L,block_length,
+ sort_param->filepos+block_length,
+ &from,&reclength,&flag))
+ {
+ _ma_check_print_error(param,"%d when writing to datafile",my_errno);
+ DBUG_RETURN(1);
+ }
+ sort_param->filepos+=block_length;
+ info->s->state.split++;
+ } while (reclength);
+ break;
+ case COMPRESSED_RECORD:
+ reclength=info->packed_length;
+ length= _ma_save_pack_length((uint) share->pack.version, block_buff,
+ reclength);
+ if (info->s->base.blobs)
+ length+= _ma_save_pack_length((uint) share->pack.version,
+ block_buff + length, info->blob_length);
+ if (my_b_write(&info->rec_cache,block_buff,length) ||
+ my_b_write(&info->rec_cache,(uchar*) sort_param->rec_buff,reclength))
+ {
+ _ma_check_print_error(param,"%d when writing to datafile",my_errno);
+ DBUG_RETURN(1);
+ }
+ sort_param->filepos+=reclength+length;
+ info->s->state.split++;
+ break;
+ }
+ }
+ if (sort_param->master)
+ {
+ info->state->records++;
+ if ((param->testflag & T_WRITE_LOOP) &&
+ (info->state->records % WRITE_COUNT) == 0)
+ {
+ char llbuff[22];
+ printf("%s\r", llstr(info->state->records,llbuff));
+ VOID(fflush(stdout));
+ }
+ }
+ DBUG_RETURN(0);
+} /* _ma_sort_write_record */
+
+
+/* Compare two keys from _ma_create_index_by_sort */
+
+static int sort_key_cmp(MARIA_SORT_PARAM *sort_param, const void *a,
+ const void *b)
+{
+ uint not_used[2];
+ return (ha_key_cmp(sort_param->seg, *((uchar**) a), *((uchar**) b),
+ USE_WHOLE_KEY, SEARCH_SAME, not_used));
+} /* sort_key_cmp */
+
+
+static int sort_key_write(MARIA_SORT_PARAM *sort_param, const uchar *a)
+{
+ uint diff_pos[2];
+ char llbuff[22],llbuff2[22];
+ MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+ HA_CHECK *param= sort_info->param;
+ int cmp;
+
+ if (sort_info->key_block->inited)
+ {
+ cmp=ha_key_cmp(sort_param->seg, (uchar*) sort_info->key_block->lastkey,
+ (uchar*) a, USE_WHOLE_KEY,SEARCH_FIND | SEARCH_UPDATE,
+ diff_pos);
+ if (param->stats_method == MI_STATS_METHOD_NULLS_NOT_EQUAL)
+ ha_key_cmp(sort_param->seg, (uchar*) sort_info->key_block->lastkey,
+ (uchar*) a, USE_WHOLE_KEY,
+ SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL, diff_pos);
+ else if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS)
+ {
+ diff_pos[0]= maria_collect_stats_nonulls_next(sort_param->seg,
+ sort_param->notnull,
+ sort_info->key_block->lastkey,
+ a);
+ }
+ sort_param->unique[diff_pos[0]-1]++;
+ }
+ else
+ {
+ cmp= -1;
+ if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS)
+ maria_collect_stats_nonulls_first(sort_param->seg, sort_param->notnull,
+ a);
+ }
+ if ((sort_param->keyinfo->flag & HA_NOSAME) && cmp == 0)
+ {
+ sort_info->dupp++;
+ sort_info->info->cur_row.lastpos= get_record_for_key(sort_info->info,
+ sort_param->keyinfo,
+ a);
+ _ma_check_print_warning(param,
+ "Duplicate key for record at %10s against record at %10s",
+ llstr(sort_info->info->cur_row.lastpos, llbuff),
+ llstr(get_record_for_key(sort_info->info,
+ sort_param->keyinfo,
+ sort_info->key_block->
+ lastkey),
+ llbuff2));
+ param->testflag|=T_RETRY_WITHOUT_QUICK;
+ if (sort_info->param->testflag & T_VERBOSE)
+ _ma_print_key(stdout,sort_param->seg, a, USE_WHOLE_KEY);
+ return (sort_delete_record(sort_param));
+ }
+#ifndef DBUG_OFF
+ if (cmp > 0)
+ {
+ _ma_check_print_error(param,
+ "Internal error: Keys are not in order from sort");
+ return(1);
+ }
+#endif
+ return (sort_insert_key(sort_param, sort_info->key_block,
+ a, HA_OFFSET_ERROR));
+} /* sort_key_write */
+
+
+int _ma_sort_ft_buf_flush(MARIA_SORT_PARAM *sort_param)
+{
+ MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+ SORT_KEY_BLOCKS *key_block=sort_info->key_block;
+ MARIA_SHARE *share=sort_info->info->s;
+ uint val_off, val_len;
+ int error;
+ SORT_FT_BUF *maria_ft_buf=sort_info->ft_buf;
+ uchar *from, *to;
+
+ val_len=share->ft2_keyinfo.keylength;
+ get_key_full_length_rdonly(val_off, maria_ft_buf->lastkey);
+ to= maria_ft_buf->lastkey+val_off;
+
+ if (maria_ft_buf->buf)
+ {
+ /* flushing first-level tree */
+ error= sort_insert_key(sort_param,key_block,maria_ft_buf->lastkey,
+ HA_OFFSET_ERROR);
+ for (from=to+val_len;
+ !error && from < maria_ft_buf->buf;
+ from+= val_len)
+ {
+ memcpy(to, from, val_len);
+ error= sort_insert_key(sort_param,key_block,maria_ft_buf->lastkey,
+ HA_OFFSET_ERROR);
+ }
+ return error;
+ }
+ /* flushing second-level tree keyblocks */
+ error=_ma_flush_pending_blocks(sort_param);
+ /* updating lastkey with second-level tree info */
+ ft_intXstore(maria_ft_buf->lastkey+val_off, -maria_ft_buf->count);
+ _ma_dpointer(sort_info->info, maria_ft_buf->lastkey+val_off+HA_FT_WLEN,
+ share->state.key_root[sort_param->key]);
+ /* restoring first level tree data in sort_info/sort_param */
+ sort_info->key_block=sort_info->key_block_end- sort_info->param->sort_key_blocks;
+ sort_param->keyinfo=share->keyinfo+sort_param->key;
+ share->state.key_root[sort_param->key]=HA_OFFSET_ERROR;
+ /* writing lastkey in first-level tree */
+ return error ? error :
+ sort_insert_key(sort_param,sort_info->key_block,
+ maria_ft_buf->lastkey,HA_OFFSET_ERROR);
+}
+
+
+static int sort_maria_ft_key_write(MARIA_SORT_PARAM *sort_param,
+ const uchar *a)
+{
+ uint a_len, val_off, val_len, error;
+ MARIA_SORT_INFO *sort_info= sort_param->sort_info;
+ SORT_FT_BUF *ft_buf= sort_info->ft_buf;
+ SORT_KEY_BLOCKS *key_block= sort_info->key_block;
+
+ val_len=HA_FT_WLEN+sort_info->info->s->base.rec_reflength;
+ get_key_full_length_rdonly(a_len, (uchar *)a);
+
+ if (!ft_buf)
+ {
+ /*
+ use two-level tree only if key_reflength fits in rec_reflength place
+ and row format is NOT static - for _ma_dpointer not to garble offsets
+ */
+ if ((sort_info->info->s->base.key_reflength <=
+ sort_info->info->s->base.rec_reflength) &&
+ (sort_info->info->s->options &
+ (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)))
+ ft_buf= (SORT_FT_BUF *)my_malloc(sort_param->keyinfo->block_length +
+ sizeof(SORT_FT_BUF), MYF(MY_WME));
+
+ if (!ft_buf)
+ {
+ sort_param->key_write=sort_key_write;
+ return sort_key_write(sort_param, a);
+ }
+ sort_info->ft_buf= ft_buf;
+ goto word_init_ft_buf; /* no need to duplicate the code */
+ }
+ get_key_full_length_rdonly(val_off, ft_buf->lastkey);
+
+ if (ha_compare_text(sort_param->seg->charset,
+ ((uchar *)a)+1,a_len-1,
+ (uchar*) ft_buf->lastkey+1,val_off-1, 0, 0)==0)
+ {
+ uchar *p;
+ if (!ft_buf->buf) /* store in second-level tree */
+ {
+ ft_buf->count++;
+ return sort_insert_key(sort_param,key_block,
+ a + a_len, HA_OFFSET_ERROR);
+ }
+
+ /* storing the key in the buffer. */
+ memcpy (ft_buf->buf, (char *)a+a_len, val_len);
+ ft_buf->buf+=val_len;
+ if (ft_buf->buf < ft_buf->end)
+ return 0;
+
+ /* converting to two-level tree */
+ p=ft_buf->lastkey+val_off;
+
+ while (key_block->inited)
+ key_block++;
+ sort_info->key_block=key_block;
+ sort_param->keyinfo=& sort_info->info->s->ft2_keyinfo;
+ ft_buf->count=(ft_buf->buf - p)/val_len;
+
+ /* flushing buffer to second-level tree */
+ for (error=0; !error && p < ft_buf->buf; p+= val_len)
+ error=sort_insert_key(sort_param,key_block,p,HA_OFFSET_ERROR);
+ ft_buf->buf=0;
+ return error;
+ }
+
+ /* flushing buffer */
+ if ((error=_ma_sort_ft_buf_flush(sort_param)))
+ return error;
+
+word_init_ft_buf:
+ a_len+=val_len;
+ memcpy(ft_buf->lastkey, a, a_len);
+ ft_buf->buf=ft_buf->lastkey+a_len;
+ /*
+ 32 is just a safety margin here
+ (at least max(val_len, sizeof(nod_flag)) should be there).
+ May be better performance could be achieved if we'd put
+ (sort_info->keyinfo->block_length-32)/XXX
+ instead.
+ TODO: benchmark the best value for XXX.
+ */
+ ft_buf->end= ft_buf->lastkey+ (sort_param->keyinfo->block_length-32);
+ return 0;
+} /* sort_maria_ft_key_write */
+
+
+ /* get pointer to record from a key */
+
+static my_off_t get_record_for_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ const uchar *key)
+{
+ return _ma_dpos(info,0, key + _ma_keylength(keyinfo, key));
+} /* get_record_for_key */
+
+
+ /* Insert a key in sort-key-blocks */
+
+static int sort_insert_key(MARIA_SORT_PARAM *sort_param,
+ register SORT_KEY_BLOCKS *key_block,
+ const uchar *key,
+ my_off_t prev_block)
+{
+ uint a_length,t_length,nod_flag;
+ my_off_t filepos,key_file_length;
+ uchar *anc_buff,*lastkey;
+ MARIA_KEY_PARAM s_temp;
+ MARIA_HA *info;
+ MARIA_KEYDEF *keyinfo=sort_param->keyinfo;
+ MARIA_SORT_INFO *sort_info= sort_param->sort_info;
+ HA_CHECK *param=sort_info->param;
+ DBUG_ENTER("sort_insert_key");
+
+ anc_buff= key_block->buff;
+ info=sort_info->info;
+ lastkey=key_block->lastkey;
+ nod_flag= (key_block == sort_info->key_block ? 0 :
+ info->s->base.key_reflength);
+
+ if (!key_block->inited)
+ {
+ key_block->inited=1;
+ if (key_block == sort_info->key_block_end)
+ {
+ _ma_check_print_error(param,"To many key-block-levels; Try increasing sort_key_blocks");
+ DBUG_RETURN(1);
+ }
+ a_length=2+nod_flag;
+ key_block->end_pos=anc_buff+2;
+ lastkey=0; /* No previous key in block */
+ }
+ else
+ a_length= maria_data_on_page(anc_buff);
+
+ /* Save pointer to previous block */
+ if (nod_flag)
+ _ma_kpointer(info,key_block->end_pos,prev_block);
+
+ t_length=(*keyinfo->pack_key)(keyinfo,nod_flag,
+ (uchar*) 0,lastkey,lastkey,key,
+ &s_temp);
+ (*keyinfo->store_key)(keyinfo, key_block->end_pos+nod_flag,&s_temp);
+ a_length+=t_length;
+ maria_putint(anc_buff,a_length,nod_flag);
+ key_block->end_pos+=t_length;
+ if (a_length <= keyinfo->block_length)
+ {
+ VOID(_ma_move_key(keyinfo, key_block->lastkey, key));
+ key_block->last_length=a_length-t_length;
+ DBUG_RETURN(0);
+ }
+
+ /* Fill block with end-zero and write filled block */
+ maria_putint(anc_buff,key_block->last_length,nod_flag);
+ bzero(anc_buff+key_block->last_length,
+ keyinfo->block_length- key_block->last_length);
+ key_file_length=info->state->key_file_length;
+ if ((filepos= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR)
+ DBUG_RETURN(1);
+
+ /* If we read the page from the key cache, we have to write it back to it */
+ if (key_file_length == info->state->key_file_length)
+ {
+ if (_ma_write_keypage(info, keyinfo, filepos, DFLT_INIT_HITS, anc_buff))
+ DBUG_RETURN(1);
+ }
+ else if (my_pwrite(info->s->kfile.file, anc_buff,
+ (uint) keyinfo->block_length,filepos, param->myf_rw))
+ DBUG_RETURN(1);
+ DBUG_DUMP("buff",anc_buff,maria_data_on_page(anc_buff));
+
+ /* Write separator-key to block in next level */
+ if (sort_insert_key(sort_param,key_block+1,key_block->lastkey,filepos))
+ DBUG_RETURN(1);
+
+ /* clear old block and write new key in it */
+ key_block->inited=0;
+ DBUG_RETURN(sort_insert_key(sort_param, key_block,key,prev_block));
+} /* sort_insert_key */
+
+
+ /* Delete record when we found a duplicated key */
+
+static int sort_delete_record(MARIA_SORT_PARAM *sort_param)
+{
+ uint i;
+ int old_file,error;
+ uchar *key;
+ MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+ HA_CHECK *param=sort_info->param;
+ MARIA_HA *info=sort_info->info;
+ DBUG_ENTER("sort_delete_record");
+
+ if ((param->testflag & (T_FORCE_UNIQUENESS|T_QUICK)) == T_QUICK)
+ {
+ _ma_check_print_error(param,
+ "Quick-recover aborted; Run recovery without switch -q or with switch -qq");
+ DBUG_RETURN(1);
+ }
+ if (info->s->options & HA_OPTION_COMPRESS_RECORD)
+ {
+ _ma_check_print_error(param,
+ "Recover aborted; Can't run standard recovery on compressed tables with errors in data-file. Use switch 'maria_chk --safe-recover' to fix it\n",stderr);;
+ DBUG_RETURN(1);
+ }
+
+ old_file= info->dfile.file;
+ info->dfile.file= info->rec_cache.file;
+ if (sort_info->current_key)
+ {
+ key= info->lastkey+info->s->base.max_key_length;
+ if ((error=(*info->s->read_record)(info,sort_param->record,
+ info->cur_row.lastpos)) &&
+ error != HA_ERR_RECORD_DELETED)
+ {
+ _ma_check_print_error(param,"Can't read record to be removed");
+ info->dfile.file= old_file;
+ DBUG_RETURN(1);
+ }
+
+ for (i=0 ; i < sort_info->current_key ; i++)
+ {
+ uint key_length= _ma_make_key(info, i, key, sort_param->record,
+ info->cur_row.lastpos);
+ if (_ma_ck_delete(info, i, key, key_length))
+ {
+ _ma_check_print_error(param,
+ "Can't delete key %d from record to be removed",
+ i+1);
+ info->dfile.file= old_file;
+ DBUG_RETURN(1);
+ }
+ }
+ if (sort_param->calc_checksum)
+ param->glob_crc-=(*info->s->calc_check_checksum)(info,
+ sort_param->record);
+ }
+ error= (flush_io_cache(&info->rec_cache) ||
+ (*info->s->delete_record)(info, sort_param->record));
+ info->dfile.file= old_file; /* restore actual value */
+ info->state->records--;
+ DBUG_RETURN(error);
+} /* sort_delete_record */
+
+
+/* Fix all pending blocks and flush everything to disk */
+
+int _ma_flush_pending_blocks(MARIA_SORT_PARAM *sort_param)
+{
+ uint nod_flag,length;
+ my_off_t filepos,key_file_length;
+ SORT_KEY_BLOCKS *key_block;
+ MARIA_SORT_INFO *sort_info= sort_param->sort_info;
+ myf myf_rw=sort_info->param->myf_rw;
+ MARIA_HA *info=sort_info->info;
+ MARIA_KEYDEF *keyinfo=sort_param->keyinfo;
+ DBUG_ENTER("_ma_flush_pending_blocks");
+
+ filepos= HA_OFFSET_ERROR; /* if empty file */
+ nod_flag=0;
+ for (key_block=sort_info->key_block ; key_block->inited ; key_block++)
+ {
+ key_block->inited=0;
+ length= maria_data_on_page(key_block->buff);
+ if (nod_flag)
+ _ma_kpointer(info,key_block->end_pos,filepos);
+ key_file_length=info->state->key_file_length;
+ bzero(key_block->buff+length, keyinfo->block_length-length);
+ if ((filepos= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR)
+ DBUG_RETURN(1);
+
+ /* If we read the page from the key cache, we have to write it back */
+ if (key_file_length == info->state->key_file_length)
+ {
+ if (_ma_write_keypage(info, keyinfo, filepos,
+ DFLT_INIT_HITS, key_block->buff))
+ DBUG_RETURN(1);
+ }
+ else if (my_pwrite(info->s->kfile.file, key_block->buff,
+ (uint) keyinfo->block_length,filepos, myf_rw))
+ DBUG_RETURN(1);
+ DBUG_DUMP("buff",key_block->buff,length);
+ nod_flag=1;
+ }
+ info->s->state.key_root[sort_param->key]=filepos; /* Last is root for tree */
+ DBUG_RETURN(0);
+} /* _ma_flush_pending_blocks */
+
+ /* alloc space and pointers for key_blocks */
+
+static SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks,
+ uint buffer_length)
+{
+ reg1 uint i;
+ SORT_KEY_BLOCKS *block;
+ DBUG_ENTER("alloc_key_blocks");
+
+ if (!(block= (SORT_KEY_BLOCKS*) my_malloc((sizeof(SORT_KEY_BLOCKS)+
+ buffer_length+IO_SIZE)*blocks,
+ MYF(0))))
+ {
+ _ma_check_print_error(param,"Not enough memory for sort-key-blocks");
+ return(0);
+ }
+ for (i=0 ; i < blocks ; i++)
+ {
+ block[i].inited=0;
+ block[i].buff= (uchar*) (block+blocks)+(buffer_length+IO_SIZE)*i;
+ }
+ DBUG_RETURN(block);
+} /* alloc_key_blocks */
+
+
+ /* Check if file is almost full */
+
+int maria_test_if_almost_full(MARIA_HA *info)
+{
+ if (info->s->options & HA_OPTION_COMPRESS_RECORD)
+ return 0;
+ return my_seek(info->s->kfile.file, 0L, MY_SEEK_END,
+ MYF(MY_THREADSAFE))/10*9 >
+ (my_off_t) info->s->base.max_key_file_length ||
+ my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)) / 10 * 9 >
+ (my_off_t) info->s->base.max_data_file_length;
+}
+
+ /* Recreate table with bigger more alloced record-data */
+
+int maria_recreate_table(HA_CHECK *param, MARIA_HA **org_info, char *filename)
+{
+ int error;
+ MARIA_HA info;
+ MARIA_SHARE share;
+ MARIA_KEYDEF *keyinfo,*key,*key_end;
+ HA_KEYSEG *keysegs,*keyseg;
+ MARIA_COLUMNDEF *columndef,*column,*end;
+ MARIA_UNIQUEDEF *uniquedef,*u_ptr,*u_end;
+ MARIA_STATUS_INFO status_info;
+ uint unpack,key_parts;
+ ha_rows max_records;
+ ulonglong file_length,tmp_length;
+ MARIA_CREATE_INFO create_info;
+ DBUG_ENTER("maria_recreate_table");
+
+ error=1; /* Default error */
+ info= **org_info;
+ status_info= (*org_info)->state[0];
+ info.state= &status_info;
+ share= *(*org_info)->s;
+ unpack= (share.options & HA_OPTION_COMPRESS_RECORD) &&
+ (param->testflag & T_UNPACK);
+ if (!(keyinfo=(MARIA_KEYDEF*) my_alloca(sizeof(MARIA_KEYDEF) *
+ share.base.keys)))
+ DBUG_RETURN(0);
+ memcpy((uchar*) keyinfo,(uchar*) share.keyinfo,
+ (size_t) (sizeof(MARIA_KEYDEF)*share.base.keys));
+
+ key_parts= share.base.all_key_parts;
+ if (!(keysegs=(HA_KEYSEG*) my_alloca(sizeof(HA_KEYSEG)*
+ (key_parts+share.base.keys))))
+ {
+ my_afree((uchar*) keyinfo);
+ DBUG_RETURN(1);
+ }
+ if (!(columndef=(MARIA_COLUMNDEF*)
+ my_alloca(sizeof(MARIA_COLUMNDEF)*(share.base.fields+1))))
+ {
+ my_afree((uchar*) keyinfo);
+ my_afree((uchar*) keysegs);
+ DBUG_RETURN(1);
+ }
+ if (!(uniquedef=(MARIA_UNIQUEDEF*)
+ my_alloca(sizeof(MARIA_UNIQUEDEF)*(share.state.header.uniques+1))))
+ {
+ my_afree((uchar*) columndef);
+ my_afree((uchar*) keyinfo);
+ my_afree((uchar*) keysegs);
+ DBUG_RETURN(1);
+ }
+
+ /* Copy the column definitions */
+ memcpy((uchar*) columndef,(uchar*) share.columndef,
+ (size_t) (sizeof(MARIA_COLUMNDEF)*(share.base.fields+1)));
+ for (column=columndef, end= columndef+share.base.fields;
+ column != end ;
+ column++)
+ {
+ if (unpack && !(share.options & HA_OPTION_PACK_RECORD) &&
+ column->type != FIELD_BLOB &&
+ column->type != FIELD_VARCHAR &&
+ column->type != FIELD_CHECK)
+ column->type=(int) FIELD_NORMAL;
+ }
+
+ /* Change the new key to point at the saved key segments */
+ memcpy((uchar*) keysegs,(uchar*) share.keyparts,
+ (size_t) (sizeof(HA_KEYSEG)*(key_parts+share.base.keys+
+ share.state.header.uniques)));
+ keyseg=keysegs;
+ for (key=keyinfo,key_end=keyinfo+share.base.keys; key != key_end ; key++)
+ {
+ key->seg=keyseg;
+ for (; keyseg->type ; keyseg++)
+ {
+ if (param->language)
+ keyseg->language=param->language; /* change language */
+ }
+ keyseg++; /* Skip end pointer */
+ }
+
+ /*
+ Copy the unique definitions and change them to point at the new key
+ segments
+ */
+ memcpy((uchar*) uniquedef,(uchar*) share.uniqueinfo,
+ (size_t) (sizeof(MARIA_UNIQUEDEF)*(share.state.header.uniques)));
+ for (u_ptr=uniquedef,u_end=uniquedef+share.state.header.uniques;
+ u_ptr != u_end ; u_ptr++)
+ {
+ u_ptr->seg=keyseg;
+ keyseg+=u_ptr->keysegs+1;
+ }
+ if (share.options & HA_OPTION_COMPRESS_RECORD)
+ share.base.records=max_records=info.state->records;
+ else if (share.base.min_pack_length)
+ max_records=(ha_rows) (my_seek(info.dfile.file, 0L, MY_SEEK_END,
+ MYF(0)) /
+ (ulong) share.base.min_pack_length);
+ else
+ max_records=0;
+ unpack= (share.data_file_type == COMPRESSED_RECORD) &&
+ (param->testflag & T_UNPACK);
+ share.options&= ~HA_OPTION_TEMP_COMPRESS_RECORD;
+
+ file_length=(ulonglong) my_seek(info.dfile.file, 0L, MY_SEEK_END, MYF(0));
+ tmp_length= file_length+file_length/10;
+ set_if_bigger(file_length,param->max_data_file_length);
+ set_if_bigger(file_length,tmp_length);
+ set_if_bigger(file_length,(ulonglong) share.base.max_data_file_length);
+
+ VOID(maria_close(*org_info));
+ bzero((char*) &create_info,sizeof(create_info));
+ create_info.max_rows=max(max_records,share.base.records);
+ create_info.reloc_rows=share.base.reloc;
+ create_info.old_options=(share.options |
+ (unpack ? HA_OPTION_TEMP_COMPRESS_RECORD : 0));
+
+ create_info.data_file_length=file_length;
+ create_info.auto_increment=share.state.auto_increment;
+ create_info.language = (param->language ? param->language :
+ share.state.header.language);
+ create_info.key_file_length= status_info.key_file_length;
+ create_info.org_data_file_type= ((enum data_file_type)
+ share.state.header.org_data_file_type);
+
+ /*
+ Allow for creating an auto_increment key. This has an effect only if
+ an auto_increment key exists in the original table.
+ */
+ create_info.with_auto_increment= TRUE;
+ create_info.null_bytes= share.base.null_bytes;
+ /*
+ We don't have to handle symlinks here because we are using
+ HA_DONT_TOUCH_DATA
+ */
+ if (maria_create(filename, share.data_file_type,
+ share.base.keys - share.state.header.uniques,
+ keyinfo, share.base.fields, columndef,
+ share.state.header.uniques, uniquedef,
+ &create_info,
+ HA_DONT_TOUCH_DATA))
+ {
+ _ma_check_print_error(param,
+ "Got error %d when trying to recreate indexfile",
+ my_errno);
+ goto end;
+ }
+ *org_info=maria_open(filename,O_RDWR,
+ (param->testflag & T_WAIT_FOREVER) ? HA_OPEN_WAIT_IF_LOCKED :
+ (param->testflag & T_DESCRIPT) ? HA_OPEN_IGNORE_IF_LOCKED :
+ HA_OPEN_ABORT_IF_LOCKED);
+ if (!*org_info)
+ {
+ _ma_check_print_error(param,
+ "Got error %d when trying to open re-created indexfile",
+ my_errno);
+ goto end;
+ }
+ /* We are modifing */
+ (*org_info)->s->options&= ~HA_OPTION_READ_ONLY_DATA;
+ VOID(_ma_readinfo(*org_info,F_WRLCK,0));
+ (*org_info)->state->records=info.state->records;
+ if (share.state.create_time)
+ (*org_info)->s->state.create_time=share.state.create_time;
+ (*org_info)->s->state.unique=(*org_info)->this_unique=
+ share.state.unique;
+ (*org_info)->state->checksum=info.state->checksum;
+ (*org_info)->state->del=info.state->del;
+ (*org_info)->s->state.dellink=share.state.dellink;
+ (*org_info)->state->empty=info.state->empty;
+ (*org_info)->state->data_file_length=info.state->data_file_length;
+ if (maria_update_state_info(param,*org_info,UPDATE_TIME | UPDATE_STAT |
+ UPDATE_OPEN_COUNT))
+ goto end;
+ error=0;
+end:
+ my_afree((uchar*) uniquedef);
+ my_afree((uchar*) keyinfo);
+ my_afree((uchar*) columndef);
+ my_afree((uchar*) keysegs);
+ DBUG_RETURN(error);
+}
+
+
+ /* write suffix to data file if neaded */
+
+int maria_write_data_suffix(MARIA_SORT_INFO *sort_info, my_bool fix_datafile)
+{
+ MARIA_HA *info=sort_info->new_info;
+
+ if (info->s->data_file_type == COMPRESSED_RECORD && fix_datafile)
+ {
+ char buff[MEMMAP_EXTRA_MARGIN];
+ bzero(buff,sizeof(buff));
+ if (my_b_write(&info->rec_cache,buff,sizeof(buff)))
+ {
+ _ma_check_print_error(sort_info->param,
+ "%d when writing to datafile",my_errno);
+ return 1;
+ }
+ sort_info->param->read_cache.end_of_file+=sizeof(buff);
+ }
+ return 0;
+}
+
+
+/* Update state and maria_chk time of indexfile */
+
+int maria_update_state_info(HA_CHECK *param, MARIA_HA *info,uint update)
+{
+ MARIA_SHARE *share=info->s;
+
+ if (update & UPDATE_OPEN_COUNT)
+ {
+ share->state.open_count=0;
+ share->global_changed=0;
+ }
+ if (update & UPDATE_STAT)
+ {
+ uint i, key_parts= mi_uint2korr(share->state.header.key_parts);
+ share->state.rec_per_key_rows=info->state->records;
+ share->state.changed&= ~STATE_NOT_ANALYZED;
+ if (info->state->records)
+ {
+ for (i=0; i<key_parts; i++)
+ {
+ if (!(share->state.rec_per_key_part[i]=param->rec_per_key_part[i]))
+ share->state.changed|= STATE_NOT_ANALYZED;
+ }
+ }
+ }
+ if (update & (UPDATE_STAT | UPDATE_SORT | UPDATE_TIME | UPDATE_AUTO_INC))
+ {
+ if (update & UPDATE_TIME)
+ {
+ share->state.check_time= (long) time((time_t*) 0);
+ if (!share->state.create_time)
+ share->state.create_time=share->state.check_time;
+ }
+ /*
+ When tables are locked we haven't synched the share state and the
+ real state for a while so we better do it here before synching
+ the share state to disk. Only when table is write locked is it
+ necessary to perform this synch.
+ */
+ if (info->lock_type == F_WRLCK)
+ share->state.state= *info->state;
+ if (_ma_state_info_write(share, 1|2))
+ goto err;
+ share->changed=0;
+ }
+ { /* Force update of status */
+ int error;
+ uint r_locks=share->r_locks,w_locks=share->w_locks;
+ share->r_locks= share->w_locks= share->tot_locks= 0;
+ error= _ma_writeinfo(info,WRITEINFO_NO_UNLOCK);
+ share->r_locks=r_locks;
+ share->w_locks=w_locks;
+ share->tot_locks=r_locks+w_locks;
+ if (!error)
+ return 0;
+ }
+err:
+ _ma_check_print_error(param,"%d when updating keyfile",my_errno);
+ return 1;
+}
+
+ /*
+ Update auto increment value for a table
+ When setting the 'repair_only' flag we only want to change the
+ old auto_increment value if its wrong (smaller than some given key).
+ The reason is that we shouldn't change the auto_increment value
+ for a table without good reason when only doing a repair; If the
+ user have inserted and deleted rows, the auto_increment value
+ may be bigger than the biggest current row and this is ok.
+
+ If repair_only is not set, we will update the flag to the value in
+ param->auto_increment is bigger than the biggest key.
+ */
+
+void _ma_update_auto_increment_key(HA_CHECK *param, MARIA_HA *info,
+ my_bool repair_only)
+{
+ uchar *record;
+ DBUG_ENTER("update_auto_increment_key");
+
+ if (!info->s->base.auto_key ||
+ ! maria_is_key_active(info->s->state.key_map, info->s->base.auto_key - 1))
+ {
+ if (!(param->testflag & T_VERY_SILENT))
+ _ma_check_print_info(param,
+ "Table: %s doesn't have an auto increment key\n",
+ param->isam_file_name);
+ DBUG_VOID_RETURN;
+ }
+ if (!(param->testflag & T_SILENT) &&
+ !(param->testflag & T_REP))
+ printf("Updating MARIA file: %s\n", param->isam_file_name);
+ /*
+ We have to use an allocated buffer instead of info->rec_buff as
+ _ma_put_key_in_record() may use info->rec_buff
+ */
+ if (!(record= (uchar*) my_malloc((uint) info->s->base.pack_reclength,
+ MYF(0))))
+ {
+ _ma_check_print_error(param,"Not enough memory for extra record");
+ DBUG_VOID_RETURN;
+ }
+
+ maria_extra(info,HA_EXTRA_KEYREAD,0);
+ if (maria_rlast(info, record, info->s->base.auto_key-1))
+ {
+ if (my_errno != HA_ERR_END_OF_FILE)
+ {
+ maria_extra(info,HA_EXTRA_NO_KEYREAD,0);
+ my_free((char*) record, MYF(0));
+ _ma_check_print_error(param,"%d when reading last record",my_errno);
+ DBUG_VOID_RETURN;
+ }
+ if (!repair_only)
+ info->s->state.auto_increment=param->auto_increment_value;
+ }
+ else
+ {
+ ulonglong auto_increment= ma_retrieve_auto_increment(info, record);
+ set_if_bigger(info->s->state.auto_increment,auto_increment);
+ if (!repair_only)
+ set_if_bigger(info->s->state.auto_increment, param->auto_increment_value);
+ }
+ maria_extra(info,HA_EXTRA_NO_KEYREAD,0);
+ my_free((char*) record, MYF(0));
+ maria_update_state_info(param, info, UPDATE_AUTO_INC);
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Update statistics for each part of an index
+
+ SYNOPSIS
+ maria_update_key_parts()
+ keyinfo IN Index information (only key->keysegs used)
+ rec_per_key_part OUT Store statistics here
+ unique IN Array of (#distinct tuples)
+ notnull_tuples IN Array of (#tuples), or NULL
+ records Number of records in the table
+
+ DESCRIPTION
+ This function is called produce index statistics values from unique and
+ notnull_tuples arrays after these arrays were produced with sequential
+ index scan (the scan is done in two places: chk_index() and
+ sort_key_write()).
+
+ This function handles all 3 index statistics collection methods.
+
+ Unique is an array:
+ unique[0]= (#different values of {keypart1}) - 1
+ unique[1]= (#different values of {keypart1,keypart2} tuple)-unique[0]-1
+ ...
+
+ For MI_STATS_METHOD_IGNORE_NULLS method, notnull_tuples is an array too:
+ notnull_tuples[0]= (#of {keypart1} tuples such that keypart1 is not NULL)
+ notnull_tuples[1]= (#of {keypart1,keypart2} tuples such that all
+ keypart{i} are not NULL)
+ ...
+ For all other statistics collection methods notnull_tuples==NULL.
+
+ Output is an array:
+ rec_per_key_part[k] =
+ = E(#records in the table such that keypart_1=c_1 AND ... AND
+ keypart_k=c_k for arbitrary constants c_1 ... c_k)
+
+ = {assuming that values have uniform distribution and index contains all
+ tuples from the domain (or that {c_1, ..., c_k} tuple is choosen from
+ index tuples}
+
+ = #tuples-in-the-index / #distinct-tuples-in-the-index.
+
+ The #tuples-in-the-index and #distinct-tuples-in-the-index have different
+ meaning depending on which statistics collection method is used:
+
+ MI_STATS_METHOD_* how are nulls compared? which tuples are counted?
+ NULLS_EQUAL NULL == NULL all tuples in table
+ NULLS_NOT_EQUAL NULL != NULL all tuples in table
+ IGNORE_NULLS n/a tuples that don't have NULLs
+*/
+
+void maria_update_key_parts(MARIA_KEYDEF *keyinfo, ulong *rec_per_key_part,
+ ulonglong *unique, ulonglong *notnull,
+ ulonglong records)
+{
+ ulonglong count=0,tmp, unique_tuples;
+ ulonglong tuples= records;
+ uint parts;
+ for (parts=0 ; parts < keyinfo->keysegs ; parts++)
+ {
+ count+=unique[parts];
+ unique_tuples= count + 1;
+ if (notnull)
+ {
+ tuples= notnull[parts];
+ /*
+ #(unique_tuples not counting tuples with NULLs) =
+ #(unique_tuples counting tuples with NULLs as different) -
+ #(tuples with NULLs)
+ */
+ unique_tuples -= (records - notnull[parts]);
+ }
+
+ if (unique_tuples == 0)
+ tmp= 1;
+ else if (count == 0)
+ tmp= tuples; /* 1 unique tuple */
+ else
+ tmp= (tuples + unique_tuples/2) / unique_tuples;
+
+ /*
+ for some weird keys (e.g. FULLTEXT) tmp can be <1 here.
+ let's ensure it is not
+ */
+ set_if_bigger(tmp,1);
+ if (tmp >= (ulonglong) ~(ulong) 0)
+ tmp=(ulonglong) ~(ulong) 0;
+
+ *rec_per_key_part=(ulong) tmp;
+ rec_per_key_part++;
+ }
+}
+
+
+static ha_checksum maria_byte_checksum(const uchar *buf, uint length)
+{
+ ha_checksum crc;
+ const uchar *end=buf+length;
+ for (crc=0; buf != end; buf++)
+ crc=((crc << 1) + *((uchar*) buf)) +
+ test(crc & (((ha_checksum) 1) << (8*sizeof(ha_checksum)-1)));
+ return crc;
+}
+
+static my_bool maria_too_big_key_for_sort(MARIA_KEYDEF *key, ha_rows rows)
+{
+ uint key_maxlength=key->maxlength;
+ if (key->flag & HA_FULLTEXT)
+ {
+ uint ft_max_word_len_for_sort=FT_MAX_WORD_LEN_FOR_SORT*
+ key->seg->charset->mbmaxlen;
+ key_maxlength+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN;
+ }
+ return (key->flag & HA_SPATIAL) ||
+ (key->flag & (HA_BINARY_PACK_KEY | HA_VAR_LENGTH_KEY | HA_FULLTEXT) &&
+ ((ulonglong) rows * key_maxlength >
+ (ulonglong) maria_max_temp_length));
+}
+
+/*
+ Deactivate all not unique index that can be recreated fast
+ These include packed keys on which sorting will use more temporary
+ space than the max allowed file length or for which the unpacked keys
+ will take much more space than packed keys.
+ Note that 'rows' may be zero for the case when we don't know how many
+ rows we will put into the file.
+ */
+
+void maria_disable_non_unique_index(MARIA_HA *info, ha_rows rows)
+{
+ MARIA_SHARE *share=info->s;
+ MARIA_KEYDEF *key=share->keyinfo;
+ uint i;
+
+ DBUG_ASSERT(info->state->records == 0 &&
+ (!rows || rows >= MARIA_MIN_ROWS_TO_DISABLE_INDEXES));
+ for (i=0 ; i < share->base.keys ; i++,key++)
+ {
+ if (!(key->flag & (HA_NOSAME | HA_SPATIAL | HA_AUTO_KEY)) &&
+ ! maria_too_big_key_for_sort(key,rows) && info->s->base.auto_key != i+1)
+ {
+ maria_clear_key_active(share->state.key_map, i);
+ info->update|= HA_STATE_CHANGED;
+ }
+ }
+}
+
+
+/*
+ Return TRUE if we can use repair by sorting
+ One can set the force argument to force to use sorting
+ even if the temporary file would be quite big!
+*/
+
+my_bool maria_test_if_sort_rep(MARIA_HA *info, ha_rows rows,
+ ulonglong key_map, my_bool force)
+{
+ MARIA_SHARE *share=info->s;
+ MARIA_KEYDEF *key=share->keyinfo;
+ uint i;
+
+ /*
+ maria_repair_by_sort only works if we have at least one key. If we don't
+ have any keys, we should use the normal repair.
+ */
+ if (! maria_is_any_key_active(key_map))
+ return FALSE; /* Can't use sort */
+ /* QQ: Remove this when maria_repair_by_sort() works with block format */
+ if (info->s->data_file_type == BLOCK_RECORD)
+ return FALSE;
+ for (i=0 ; i < share->base.keys ; i++,key++)
+ {
+ if (!force && maria_too_big_key_for_sort(key,rows))
+ return FALSE;
+ }
+ return TRUE;
+}
+
+
+static void
+set_data_file_type(MARIA_SORT_INFO *sort_info, MARIA_SHARE *share)
+{
+ if ((sort_info->new_data_file_type=share->data_file_type) ==
+ COMPRESSED_RECORD && sort_info->param->testflag & T_UNPACK)
+ {
+ MARIA_SHARE tmp;
+ sort_info->new_data_file_type= share->state.header.org_data_file_type;
+ /* Set delete_function for sort_delete_record() */
+ tmp= *share;
+ tmp.state.header.data_file_type= tmp.state.header.org_data_file_type;
+ tmp.options= ~HA_OPTION_COMPRESS_RECORD;
+ _ma_setup_functions(&tmp);
+ share->delete_record=tmp.delete_record;
+ }
+}
+
+static void restore_data_file_type(MARIA_SHARE *share)
+{
+ share->options&= ~HA_OPTION_COMPRESS_RECORD;
+ mi_int2store(share->state.header.options,share->options);
+ share->state.header.data_file_type=
+ share->state.header.org_data_file_type;
+ share->data_file_type= share->state.header.data_file_type;
+ share->pack.header_length= 0;
+}
+
+
+static void change_data_file_descriptor(MARIA_HA *info, File new_file)
+{
+ my_close(info->dfile.file, MYF(MY_WME));
+ info->dfile.file= info->s->bitmap.file.file= new_file;
+}
+
+
+/*
+ Copy all states that has to do with the data file
+
+ NOTES
+ This is done to copy the state from the data file generated from
+ repair to the original handler
+*/
+
+static void copy_data_file_state(MARIA_STATE_INFO *to,
+ MARIA_STATE_INFO *from)
+{
+ to->state.records= from->state.records;
+ to->state.del= from->state.del;
+ to->state.empty= from->state.empty;
+ to->state.data_file_length= from->state.data_file_length;
+ to->split= from->split;
+ to->dellink= from->dellink;
+ to->first_bitmap_with_space= from->first_bitmap_with_space;
+}
+
+
+/*
+ Read 'safely' next record while scanning table.
+
+ SYNOPSIS
+ _ma_safe_scan_block_record()
+ info Maria handler
+ record Store found here
+
+ NOTES
+ - One must have called mi_scan() before this
+
+ Differences compared to _ma_scan_block_records() are:
+ - We read all blocks, not only blocks marked by the bitmap to be safe
+ - In case of errors, next read will read next record.
+ - More sanity checks
+
+ RETURN
+ 0 ok
+ HA_ERR_END_OF_FILE End of file
+ # error number
+*/
+
+
+static int _ma_safe_scan_block_record(MARIA_SORT_INFO *sort_info,
+ MARIA_HA *info, uchar *record)
+{
+ uint record_pos= info->cur_row.nextpos;
+ ulonglong page= sort_info->page;
+ DBUG_ENTER("_ma_safe_scan_block_record");
+
+ for (;;)
+ {
+ /* Find next row in current page */
+ if (likely(record_pos < info->scan.number_of_rows))
+ {
+ uint length, offset;
+ uchar *data, *end_of_data;
+ char llbuff[22];
+
+ while (!(offset= uint2korr(info->scan.dir)))
+ {
+ info->scan.dir-= DIR_ENTRY_SIZE;
+ record_pos++;
+ if (info->scan.dir < info->scan.dir_end)
+ {
+ _ma_check_print_info(sort_info->param,
+ "Wrong directory on page: %s",
+ llstr(page, llbuff));
+ goto read_next_page;
+ }
+ }
+ /* found row */
+ info->cur_row.lastpos= info->scan.row_base_page + record_pos;
+ info->cur_row.nextpos= record_pos + 1;
+ data= info->scan.page_buff + offset;
+ length= uint2korr(info->scan.dir + 2);
+ end_of_data= data + length;
+ info->scan.dir-= DIR_ENTRY_SIZE; /* Point to previous row */
+
+ if (end_of_data > info->scan.dir_end ||
+ offset < PAGE_HEADER_SIZE || length < info->s->base.min_block_length)
+ {
+ _ma_check_print_info(sort_info->param,
+ "Wrong directory entry %3u at page %s",
+ record_pos, llstr(page, llbuff));
+ record_pos++;
+ continue;
+ }
+ else
+ {
+ DBUG_PRINT("info", ("rowid: %lu", (ulong) info->cur_row.lastpos));
+ DBUG_RETURN(_ma_read_block_record2(info, record, data, end_of_data));
+ }
+ }
+
+read_next_page:
+ /* Read until we find next head page */
+ for (;;)
+ {
+ uint page_type;
+ char llbuff[22];
+
+ sort_info->page++; /* In case of errors */
+ page++;
+ if (!(page % info->s->bitmap.pages_covered))
+ page++; /* Skip bitmap */
+ if ((page + 1) * info->s->block_size > sort_info->filelength)
+ DBUG_RETURN(HA_ERR_END_OF_FILE);
+ if (!(pagecache_read(info->s->pagecache,
+ &info->dfile,
+ page, 0, info->scan.page_buff,
+ PAGECACHE_READ_UNKNOWN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+ DBUG_RETURN(my_errno);
+
+ page_type= (info->scan.page_buff[PAGE_TYPE_OFFSET] &
+ PAGE_TYPE_MASK);
+ if (page_type == HEAD_PAGE)
+ {
+ if ((info->scan.number_of_rows=
+ (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) != 0)
+ break;
+ _ma_check_print_info(sort_info->param,
+ "Wrong head page at %s",
+ llstr(page * info->s->block_size, llbuff));
+ }
+ else if (page_type >= MAX_PAGE_TYPE)
+ {
+ _ma_check_print_info(sort_info->param,
+ "Found wrong page type: %d at %s",
+ page_type, llstr(page * info->s->block_size,
+ llbuff));
+ }
+ }
+
+ /* New head page */
+ info->scan.dir= (info->scan.page_buff + info->s->block_size -
+ PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE);
+ info->scan.dir_end= (info->scan.dir -
+ (info->scan.number_of_rows - 1) *
+ DIR_ENTRY_SIZE);
+ info->scan.row_base_page= ma_recordpos(page, 0);
+ record_pos= 0;
+ }
+}
+
+
+/**
+ @brief Writes a LOGREC_REPAIR_TABLE record and updates create_rename_lsn
+ and is_of_horizon
+
+ REPAIR/OPTIMIZE have replaced the data/index file with a new file
+ and so, in this scenario:
+ @verbatim
+ CHECKPOINT - REDO_INSERT - COMMIT - ... - REPAIR - ... - crash
+ @endverbatim
+ we do not want Recovery to apply the REDO_INSERT to the table, as it would
+ then possibly wrongly extend the table. By updating create_rename_lsn at
+ the end of REPAIR, we know that REDO_INSERT will be skipped.
+
+ @param param description of the REPAIR operation
+ @param info table
+
+ @return Operation status
+ @retval 0 ok
+ @retval 1 error (disk problem)
+*/
+
+static int write_log_record_for_repair(const HA_CHECK *param, MARIA_HA *info)
+{
+ MARIA_SHARE *share= info->s;
+ /* in case this is maria_chk or recovery... */
+ if (translog_inited && !maria_in_recovery)
+ {
+ /*
+ For now this record is only informative. It could serve when applying
+ logs to a backup, but that needs more thought. Assume table became
+ corrupted. It is repaired, then some writes happen to it.
+ Later we restore an old backup, and want to apply this REDO_REPAIR_TABLE
+ record. For it to give the same result as originally, the table should
+ be corrupted the same way, so applying previous REDOs should produce the
+ same corruption; that's really not guaranteed (different execution paths
+ in execution of REDOs vs runtime code so not same bugs hit, temporary
+ hardware issues not repeatable etc). Corruption may not be repeatable.
+ A reasonable solution is to execute the REDO_REPAIR_TABLE record and
+ check if the checksum of the resulting table matches what it was at the
+ end of the original repair (should be stored in log record); or execute
+ the REDO_REPAIR_TABLE if the checksum of the table-before-repair matches
+ was it was at the start of the original repair (should be stored in log
+ record).
+ */
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+ uchar log_data[LSN_STORE_SIZE];
+ LSN lsn;
+ compile_time_assert(LSN_STORE_SIZE >= (FILEID_STORE_SIZE + 4));
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= FILEID_STORE_SIZE + 4;
+ /*
+ testflag gives an idea of what REPAIR did (in particular T_QUICK
+ or not: did it touch the data file or not?).
+ */
+ int4store(log_data + FILEID_STORE_SIZE, param->testflag);
+ if (unlikely(translog_write_record(&lsn, LOGREC_REDO_REPAIR_TABLE,
+ &dummy_transaction_object, info,
+ log_array[TRANSLOG_INTERNAL_PARTS +
+ 0].length,
+ sizeof(log_array)/sizeof(log_array[0]),
+ log_array, log_data) ||
+ translog_flush(lsn)))
+ return 1;
+ /*
+ The table's existence was made durable earlier (MY_SYNC_DIR passed to
+ maria_change_to_newfile()). _ma_flush_table_files_after_repair() was
+ called earlier, flushed and forced data+index+state. Old REDOs should
+ not be applied to the table:
+ */
+ if (_ma_update_create_rename_lsn(share, lsn, TRUE))
+ return 1;
+ }
+ return 0;
+}
diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c
new file mode 100644
index 00000000000..4446285fce9
--- /dev/null
+++ b/storage/maria/ma_checkpoint.c
@@ -0,0 +1,1108 @@
+/* Copyright (C) 2006,2007 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ WL#3071 Maria checkpoint
+ First version written by Guilhem Bichot on 2006-04-27.
+*/
+
+/* Here is the implementation of this module */
+
+/**
+ @todo RECOVERY BUG this is unreviewed code, but used in safe conditions:
+ ha_maria takes a checkpoint at end of recovery and one at clean shutdown,
+ that's all. So there never are open tables, dirty pages, transactions.
+*/
+/*
+ Summary:
+ checkpoints are done either by a background thread (checkpoint every Nth
+ second) or by a client.
+ In ha_maria, it's not made available to clients, and will soon be done by a
+ background thread (periodically taking checkpoints and flushing dirty
+ pages).
+*/
+
+#include "maria_def.h"
+#include "ma_pagecache.h"
+#include "trnman.h"
+#include "ma_blockrec.h"
+#include "ma_checkpoint.h"
+#include "ma_loghandler_lsn.h"
+
+
+/*
+ Checkpoints currently happen only at ha_maria's startup (after recovery) and
+ at shutdown, always when there is no open tables.
+ Background page flushing is not used.
+ So, needed pagecache functions for doing this flushing are not yet pushed.
+*/
+#define flush_pagecache_blocks_with_filter(A,B,C,D,E) (int)(((ulong)D) * 0)
+/**
+ filter has to return 0, 1 or 2: 0 means "don't flush this page", 1 means
+ "flush it", 2 means "don't flush this page and following pages".
+ Will move to ma_pagecache.h
+*/
+typedef int (*PAGECACHE_FILTER)(enum pagecache_page_type type,
+ pgcache_page_no_t page,
+ LSN rec_lsn, void *arg);
+
+
+/** @brief type of checkpoint currently running */
+static CHECKPOINT_LEVEL checkpoint_in_progress= CHECKPOINT_NONE;
+/** @brief protects checkpoint_in_progress */
+static pthread_mutex_t LOCK_checkpoint;
+/** @brief for killing the background checkpoint thread */
+static pthread_cond_t COND_checkpoint;
+/** @brief if checkpoint module was inited or not */
+static my_bool checkpoint_inited= FALSE;
+/** @brief 'kill' flag for the background checkpoint thread */
+static int checkpoint_thread_die;
+/* is ulong like pagecache->blocks_changed */
+static ulong pages_to_flush_before_next_checkpoint;
+static PAGECACHE_FILE *dfiles, /**< data files to flush in background */
+ *dfiles_end; /**< list of data files ends here */
+static PAGECACHE_FILE *kfiles, /**< index files to flush in background */
+ *kfiles_end; /**< list of index files ends here */
+/* those two statistics below could serve in SHOW GLOBAL STATUS */
+static uint checkpoints_total= 0, /**< all checkpoint requests made */
+ checkpoints_ok_total= 0; /**< all checkpoints which succeeded */
+
+struct st_filter_param
+{
+ my_bool is_data_file; /**< is the file about data or index */
+ LSN up_to_lsn; /**< only pages with rec_lsn < this LSN */
+ ulong pages_covered_by_bitmap; /**< to know which page is a bitmap page */
+ uint max_pages; /**< stop after flushing this number pages */
+}; /**< information to determine which dirty pages should be flushed */
+
+static int filter_flush_data_file_medium(enum pagecache_page_type type,
+ pgcache_page_no_t page,
+ LSN rec_lsn, void *arg);
+static int filter_flush_data_file_full(enum pagecache_page_type type,
+ pgcache_page_no_t page,
+ LSN rec_lsn, void *arg);
+static int filter_flush_data_file_indirect(enum pagecache_page_type type,
+ pgcache_page_no_t page,
+ LSN rec_lsn, void *arg);
+static int filter_flush_data_file_evenly(enum pagecache_page_type type,
+ pgcache_page_no_t pageno,
+ LSN rec_lsn, void *arg);
+static int really_execute_checkpoint(void);
+pthread_handler_t ma_checkpoint_background(void *arg);
+static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon);
+
+/**
+ @brief Does a checkpoint
+
+ @param level what level of checkpoint to do
+ @param no_wait if another checkpoint of same or stronger level
+ is already running, consider our job done
+
+ @note In ha_maria, there can never be two threads trying a checkpoint at
+ the same time.
+
+ @return Operation status
+ @retval 0 ok
+ @retval !=0 error
+*/
+
+int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait)
+{
+ int result= 0;
+ DBUG_ENTER("ma_checkpoint_execute");
+
+ if (!checkpoint_inited)
+ {
+ /*
+ If ha_maria failed to start, maria_panic_hton is called, we come here.
+ */
+ DBUG_RETURN(0);
+ }
+ DBUG_ASSERT(level > CHECKPOINT_NONE);
+
+ /* look for already running checkpoints */
+ pthread_mutex_lock(&LOCK_checkpoint);
+ while (checkpoint_in_progress != CHECKPOINT_NONE)
+ {
+ if (no_wait && (checkpoint_in_progress >= level))
+ {
+ /*
+ If we are the checkpoint background thread, we don't wait (it's
+ smarter to flush pages instead of waiting here while the other thread
+ finishes its checkpoint).
+ */
+ pthread_mutex_unlock(&LOCK_checkpoint);
+ goto end;
+ }
+ pthread_cond_wait(&COND_checkpoint, &LOCK_checkpoint);
+ }
+
+ checkpoint_in_progress= level;
+ pthread_mutex_unlock(&LOCK_checkpoint);
+ /* from then on, we are sure to be and stay the only checkpointer */
+
+ result= really_execute_checkpoint();
+ pthread_cond_broadcast(&COND_checkpoint);
+end:
+ DBUG_RETURN(result);
+}
+
+
+/**
+ @brief Does a checkpoint, really; expects no other checkpoints
+ running.
+
+ Checkpoint level requested is read from checkpoint_in_progress.
+
+ @return Operation status
+ @retval 0 ok
+ @retval !=0 error
+*/
+
+static int really_execute_checkpoint(void)
+{
+ uint i, error= 0;
+ /** @brief checkpoint_start_log_horizon will be stored there */
+ char *ptr;
+ LEX_STRING record_pieces[4]; /**< only malloc-ed pieces */
+ LSN min_page_rec_lsn, min_trn_rec_lsn, min_first_undo_lsn;
+ TRANSLOG_ADDRESS checkpoint_start_log_horizon;
+ uchar checkpoint_start_log_horizon_char[LSN_STORE_SIZE];
+ DBUG_ENTER("really_execute_checkpoint");
+ bzero(&record_pieces, sizeof(record_pieces));
+
+ /*
+ STEP 1: record current end-of-log position using log's lock. It is
+ critical for the correctness of Checkpoint (related to memory visibility
+ rules, the log's lock is a mutex).
+ "Horizon" is a lower bound of the LSN of the next log record.
+ */
+ /**
+ @todo RECOVERY BUG
+ this is an horizon, but it is used as a LSN (REDO phase may start from
+ there! probably log handler would refuse to read then;
+ Sanja proposed to make a loghandler's function which finds the LSN after
+ this horizon.
+ */
+ checkpoint_start_log_horizon= translog_get_horizon();
+ DBUG_PRINT("info",("checkpoint_start_log_horizon (%lu,0x%lx)",
+ LSN_IN_PARTS(checkpoint_start_log_horizon)));
+ lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon);
+
+
+ /*
+ STEP 2: fetch information about transactions.
+ We must fetch transactions before dirty pages. Indeed, a transaction
+ first sets its rec_lsn then sets the page's rec_lsn then sets its rec_lsn
+ to 0. If we fetched pages first, we may see no dirty page yet, then we
+ fetch transactions but the transaction has already reset its rec_lsn to 0
+ so we miss rec_lsn again.
+ For a similar reason (over-allocated bitmap pages) we have to fetch
+ transactions before flushing bitmap pages.
+
+ min_trn_rec_lsn will serve to lower the starting point of the REDO phase
+ (down from checkpoint_start_log_horizon).
+ */
+ if (unlikely(trnman_collect_transactions(&record_pieces[0],
+ &record_pieces[1],
+ &min_trn_rec_lsn,
+ &min_first_undo_lsn)))
+ goto err;
+
+
+ /* STEP 3: fetch information about table files */
+ if (unlikely(collect_tables(&record_pieces[2],
+ checkpoint_start_log_horizon)))
+ goto err;
+
+
+ /* STEP 4: fetch information about dirty pages */
+ /*
+ It's better to do it _after_ having flushed some data pages (which
+ collect_tables() may have done), because those are now non-dirty and so we
+ have a more up-to-date dirty pages list to put into the checkpoint record,
+ and thus we will have less work at Recovery.
+ */
+ /* Using default pagecache for now */
+ if (unlikely(pagecache_collect_changed_blocks_with_lsn(maria_pagecache,
+ &record_pieces[3],
+ &min_page_rec_lsn)))
+ goto err;
+
+
+ /* LAST STEP: now write the checkpoint log record */
+ {
+ LSN lsn;
+ uint total_rec_length;
+ /*
+ the log handler is allowed to modify "str" and "length" (but not "*str")
+ of its argument, so we must not pass it record_pieces directly,
+ otherwise we would later not know what memory pieces to my_free().
+ */
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 5];
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str=
+ checkpoint_start_log_horizon_char;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= total_rec_length=
+ sizeof(checkpoint_start_log_horizon_char);
+ for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
+ {
+ log_array[TRANSLOG_INTERNAL_PARTS + 1 + i]= record_pieces[i];
+ total_rec_length+= record_pieces[i].length;
+ }
+
+ if (unlikely(translog_write_record(&lsn, LOGREC_CHECKPOINT,
+ &dummy_transaction_object, NULL,
+ total_rec_length,
+ sizeof(log_array)/sizeof(log_array[0]),
+ log_array, NULL) ||
+ translog_flush(lsn)))
+ goto err;
+
+ translog_lock();
+ /*
+ This cannot be done as a inwrite_rec_hook of LOGREC_CHECKPOINT, because
+ such hook would be called before translog_flush (and we must be sure
+ that log was flushed before we write to the control file).
+ */
+ if (unlikely(ma_control_file_write_and_force(lsn, FILENO_IMPOSSIBLE,
+ CONTROL_FILE_UPDATE_ONLY_LSN)))
+ {
+ translog_unlock();
+ goto err;
+ }
+ translog_unlock();
+ }
+
+ /*
+ Note that we should not alter memory structures until we have successfully
+ written the checkpoint record and control file.
+ */
+ /* checkpoint succeeded */
+ ptr= record_pieces[3].str;
+ pages_to_flush_before_next_checkpoint= uint4korr(ptr);
+ DBUG_PRINT("info",("%u pages to flush before next checkpoint",
+ (uint)pages_to_flush_before_next_checkpoint));
+
+ /* compute log's low-water mark */
+ TRANSLOG_ADDRESS log_low_water_mark= min_page_rec_lsn;
+ set_if_smaller(log_low_water_mark, min_trn_rec_lsn);
+ set_if_smaller(log_low_water_mark, min_first_undo_lsn);
+ set_if_smaller(log_low_water_mark, checkpoint_start_log_horizon);
+ /**
+ Now purge unneeded logs.
+ As some systems have an unreliable fsync (drive lying), we could try to
+ be robust against that: remember a few previous checkpoints in the
+ control file, and not purge logs immediately... Think about it.
+ */
+#if 0 /* purging/keeping will be an option */
+ if (translog_purge(log_low_water_mark))
+ fprintf(stderr, "Maria engine: log purge failed\n"); /* not deadly */
+#endif
+
+ goto end;
+
+err:
+ error= 1;
+ fprintf(stderr, "Maria engine: checkpoint failed\n"); /* TODO: improve ;) */
+ /* we were possibly not able to determine what pages to flush */
+ pages_to_flush_before_next_checkpoint= 0;
+
+end:
+ for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
+ my_free(record_pieces[i].str, MYF(MY_ALLOW_ZERO_PTR));
+ pthread_mutex_lock(&LOCK_checkpoint);
+ checkpoint_in_progress= CHECKPOINT_NONE;
+ checkpoints_total++;
+ checkpoints_ok_total+= !error;
+ pthread_mutex_unlock(&LOCK_checkpoint);
+ DBUG_RETURN(error);
+}
+
+
+/**
+ @brief Initializes the checkpoint module
+
+ @param create_background_thread If one wants the module to now create a
+ thread which will periodically do
+ checkpoints, and flush dirty pages, in the
+ background.
+
+ @return Operation status
+ @retval 0 ok
+ @retval !=0 error
+*/
+
+int ma_checkpoint_init(my_bool create_background_thread)
+{
+ pthread_t th;
+ int res= 0;
+ DBUG_ENTER("ma_checkpoint_init");
+ checkpoint_inited= TRUE;
+ checkpoint_thread_die= 2; /* not yet born == dead */
+ if (pthread_mutex_init(&LOCK_checkpoint, MY_MUTEX_INIT_SLOW) ||
+ pthread_cond_init(&COND_checkpoint, 0))
+ res= 1;
+ else if (create_background_thread)
+ {
+ if (!(res= pthread_create(&th, NULL, ma_checkpoint_background, NULL)))
+ checkpoint_thread_die= 0; /* thread lives, will have to be killed */
+ }
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Destroys the checkpoint module
+*/
+
+void ma_checkpoint_end(void)
+{
+ DBUG_ENTER("ma_checkpoint_end");
+ if (checkpoint_inited)
+ {
+ pthread_mutex_lock(&LOCK_checkpoint);
+ if (checkpoint_thread_die != 2) /* thread was started ok */
+ {
+ DBUG_PRINT("info",("killing Maria background checkpoint thread"));
+ checkpoint_thread_die= 1; /* kill it */
+ do /* and wait for it to be dead */
+ {
+ /* wake it up if it was in a sleep */
+ pthread_cond_broadcast(&COND_checkpoint);
+ DBUG_PRINT("info",("waiting for Maria background checkpoint thread"
+ " to die"));
+ pthread_cond_wait(&COND_checkpoint, &LOCK_checkpoint);
+ }
+ while (checkpoint_thread_die != 2);
+ }
+ pthread_mutex_unlock(&LOCK_checkpoint);
+ my_free((uchar *)dfiles, MYF(MY_ALLOW_ZERO_PTR));
+ my_free((uchar *)kfiles, MYF(MY_ALLOW_ZERO_PTR));
+ pthread_mutex_destroy(&LOCK_checkpoint);
+ pthread_cond_destroy(&COND_checkpoint);
+ checkpoint_inited= FALSE;
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief dirty-page filtering criteria for MEDIUM checkpoint.
+
+ We flush data/index pages which have been dirty since the previous
+ checkpoint (this is the two-checkpoint rule: the REDO phase will not have
+ to start from earlier than the next-to-last checkpoint), and all dirty
+ bitmap pages.
+
+ @param type Page's type
+ @param pageno Page's number
+ @param rec_lsn Page's rec_lsn
+ @param arg filter_param
+
+ @return Operation status
+ @retval 0 don't flush the page
+ @retval 1 flush the page
+*/
+
+static int filter_flush_data_file_medium(enum pagecache_page_type type,
+ pgcache_page_no_t pageno,
+ LSN rec_lsn, void *arg)
+{
+ struct st_filter_param *param= (struct st_filter_param *)arg;
+ return ((type == PAGECACHE_LSN_PAGE) &&
+ (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0)) ||
+ (param->is_data_file &&
+ ((pageno % param->pages_covered_by_bitmap) == 0));
+}
+
+
+/**
+ @brief dirty-page filtering criteria for FULL checkpoint.
+
+ We flush all dirty data/index pages and all dirty bitmap pages.
+
+ @param type Page's type
+ @param pageno Page's number
+ @param rec_lsn Page's rec_lsn
+ @param arg filter_param
+
+ @return Operation status
+ @retval 0 don't flush the page
+ @retval 1 flush the page
+*/
+
+static int filter_flush_data_file_full(enum pagecache_page_type type,
+ pgcache_page_no_t pageno,
+ LSN rec_lsn
+ __attribute__ ((unused)),
+ void *arg)
+{
+ struct st_filter_param *param= (struct st_filter_param *)arg;
+ return (type == PAGECACHE_LSN_PAGE) ||
+ (param->is_data_file &&
+ ((pageno % param->pages_covered_by_bitmap) == 0));
+}
+
+
+/**
+ @brief dirty-page filtering criteria for INDIRECT checkpoint.
+
+ We flush all dirty bitmap pages.
+
+ @param type Page's type
+ @param pageno Page's number
+ @param rec_lsn Page's rec_lsn
+ @param arg filter_param
+
+ @return Operation status
+ @retval 0 don't flush the page
+ @retval 1 flush the page
+*/
+
+static int filter_flush_data_file_indirect(enum pagecache_page_type type
+ __attribute__ ((unused)),
+ pgcache_page_no_t pageno,
+ LSN rec_lsn
+ __attribute__ ((unused)),
+ void *arg)
+{
+ struct st_filter_param *param= (struct st_filter_param *)arg;
+ return
+ (param->is_data_file &&
+ ((pageno % param->pages_covered_by_bitmap) == 0));
+}
+
+
+/**
+ @brief dirty-page filtering criteria for background flushing thread.
+
+ We flush data pages which have been dirty since the previous checkpoint
+ (this is the two-checkpoint rule: the REDO phase will not have to start
+ from earlier than the next-to-last checkpoint), and all dirty bitmap
+ pages. But we flush no more than a certain number of pages (to have an
+ even flushing, no write burst).
+
+ @param type Page's type
+ @param pageno Page's number
+ @param rec_lsn Page's rec_lsn
+ @param arg filter_param
+
+ @return Operation status
+ @retval 0 don't flush the page
+ @retval 1 flush the page
+ @retval 2 don't flush the page and following pages
+*/
+
+static int filter_flush_data_file_evenly(enum pagecache_page_type type,
+ pgcache_page_no_t pageno
+ __attribute__ ((unused)),
+ LSN rec_lsn, void *arg)
+{
+ struct st_filter_param *param= (struct st_filter_param *)arg;
+ if (unlikely(param->max_pages == 0)) /* all flushed already */
+ return 2;
+ if ((type == PAGECACHE_LSN_PAGE) &&
+ (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0))
+ {
+ param->max_pages--;
+ return 1;
+ }
+ return 0;
+}
+
+
+/**
+ @brief Background thread which does checkpoints and flushes periodically.
+
+ Takes a checkpoint every 30th second. After taking a checkpoint, all pages
+ dirty at the time of that checkpoint are flushed evenly until it is time to
+ take another checkpoint (30 seconds later). This ensures that the REDO
+ phase starts at earliest (in LSN time) at the next-to-last checkpoint
+ record ("two-checkpoint rule").
+
+ @note MikaelR questioned why the same thread does two different jobs, the
+ risk could be that while a checkpoint happens no LRD flushing happens.
+
+ @note MikaelR noted that he observed that Linux's file cache may never
+ fsync to disk until this cache is full, at which point it decides to empty
+ the cache, making the machine very slow. A solution was to fsync after
+ writing 2 MB.
+*/
+
+pthread_handler_t ma_checkpoint_background(void *arg __attribute__((unused)))
+{
+ const uint sleep_unit= 1 /* 1 second */,
+ time_between_checkpoints= 30, /* 30 sleep units */
+ /** @brief At least this of log/page bytes written between checkpoints */
+ checkpoint_min_activity= 2*1024*1024;
+ uint sleeps= 0;
+
+ my_thread_init();
+ DBUG_PRINT("info",("Maria background checkpoint thread starts"));
+ for(;;)
+ {
+#if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */
+ sleeps=0;
+#endif
+ uint pages_bunch_size;
+ struct st_filter_param filter_param;
+ PAGECACHE_FILE *dfile; /**< data file currently being flushed */
+ PAGECACHE_FILE *kfile; /**< index file currently being flushed */
+ TRANSLOG_ADDRESS log_horizon_at_last_checkpoint= LSN_IMPOSSIBLE;
+ ulonglong pagecache_flushes_at_last_checkpoint= 0;
+ struct timespec abstime;
+ switch((sleeps++) % time_between_checkpoints)
+ {
+ case 0:
+ /*
+ With background flushing evenly distributed over the time
+ between two checkpoints, we should have only little flushing to do
+ in the checkpoint.
+ */
+ /*
+ No checkpoint if little work of interest for recovery was done
+ since last checkpoint. Such work includes log writing (lengthens
+ recovery, checkpoint would shorten it), page flushing (checkpoint
+ would decrease the amount of read pages in recovery).
+ */
+ if (((translog_get_horizon() - log_horizon_at_last_checkpoint) +
+ (maria_pagecache->global_cache_write -
+ pagecache_flushes_at_last_checkpoint) *
+ maria_pagecache->block_size) < checkpoint_min_activity)
+ {
+ /* don't take checkpoint, so don't know what to flush */
+ pages_to_flush_before_next_checkpoint= 0;
+ break;
+ }
+ ma_checkpoint_execute(CHECKPOINT_MEDIUM, TRUE);
+ /*
+ Snapshot this kind of "state" of the engine. Note that the value below
+ is possibly greater than last_checkpoint_lsn.
+ */
+ log_horizon_at_last_checkpoint= translog_get_horizon();
+ pagecache_flushes_at_last_checkpoint=
+ maria_pagecache->global_cache_write;
+ /*
+ If the checkpoint above succeeded it has set d|kfiles and
+ d|kfiles_end. If is has failed, it has set
+ pages_to_flush_before_next_checkpoint to 0 so we will skip flushing
+ and sleep until the next checkpoint.
+ */
+ break;
+ case 1:
+ /* set up parameters for background page flushing */
+ filter_param.up_to_lsn= last_checkpoint_lsn;
+ pages_bunch_size= pages_to_flush_before_next_checkpoint /
+ time_between_checkpoints;
+ dfile= dfiles;
+ kfile= kfiles;
+ /* fall through */
+ default:
+ if (pages_bunch_size > 0)
+ {
+ /* flush a bunch of dirty pages */
+ filter_param.max_pages= pages_bunch_size;
+ filter_param.is_data_file= TRUE;
+ while (dfile != dfiles_end)
+ {
+ int res=
+ flush_pagecache_blocks_with_filter(maria_pagecache,
+ dfile, FLUSH_KEEP,
+ filter_flush_data_file_evenly,
+ &filter_param);
+ /* note that it may just be a pinned page */
+ if (unlikely(res))
+ fprintf(stderr, "Maria engine: warning - background page flush"
+ " failed\n");
+ if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
+ break; /* and we will continue with the same file */
+ dfile++; /* otherwise all this file is flushed, move to next file */
+ }
+ filter_param.is_data_file= FALSE;
+ while (kfile != kfiles_end)
+ {
+ int res=
+ flush_pagecache_blocks_with_filter(maria_pagecache,
+ dfile, FLUSH_KEEP,
+ filter_flush_data_file_evenly,
+ &filter_param);
+ if (unlikely(res))
+ fprintf(stderr, "Maria engine: warning - background page flush"
+ " failed\n");
+ if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
+ break; /* and we will continue with the same file */
+ kfile++; /* otherwise all this file is flushed, move to next file */
+ }
+ }
+ }
+ pthread_mutex_lock(&LOCK_checkpoint);
+ if (checkpoint_thread_die == 1)
+ break;
+#if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */
+ pthread_mutex_unlock(&LOCK_checkpoint);
+ my_sleep(100000); // a tenth of a second
+ pthread_mutex_lock(&LOCK_checkpoint);
+#else
+ /* To have a killable sleep, we use timedwait like our SQL GET_LOCK() */
+ set_timespec(abstime, sleep_unit);
+ pthread_cond_timedwait(&COND_checkpoint, &LOCK_checkpoint, &abstime);
+#endif
+ if (checkpoint_thread_die == 1)
+ break;
+ pthread_mutex_unlock(&LOCK_checkpoint);
+ }
+ pthread_mutex_unlock(&LOCK_checkpoint);
+ DBUG_PRINT("info",("Maria background checkpoint thread ends"));
+ /*
+ A last checkpoint, now that all tables should be closed; to have instant
+ recovery later. We always do it, because the test above about number of
+ log records or flushed pages is only approximative. For example, some log
+ records may have been written while ma_checkpoint_execute() above was
+ running, or some pages may have been flushed during this time. Thus it
+ could be that, while nothing has changed since that checkpoint's *end*, if
+ we recovered from that checkpoint we would have a non-empty dirty pages
+ list, REDOs to execute, and we don't want that, we want a clean shutdown
+ to have an empty recovery (simplifies upgrade/backups: one can just do a
+ clean shutdown, copy its tables to another system without copying the log
+ or control file and it will work because recovery will not need those).
+ Another reason why it's approximative is that a log record may have been
+ written above between ma_checkpoint_execute() and the
+ tranlog_get_horizon() which follows.
+ So, we have at least two checkpoints per start/stop of the engine, and
+ only two if the engine stays idle.
+ */
+ ma_checkpoint_execute(CHECKPOINT_FULL, FALSE);
+ pthread_mutex_lock(&LOCK_checkpoint);
+ checkpoint_thread_die= 2; /* indicate that we are dead */
+ /* wake up ma_checkpoint_end() which may be waiting for our death */
+ pthread_cond_broadcast(&COND_checkpoint);
+ /* broadcast was inside unlock because ma_checkpoint_end() destroys mutex */
+ pthread_mutex_unlock(&LOCK_checkpoint);
+ my_thread_end();
+ return 0;
+}
+
+
+/**
+ @brief Allocates buffer and stores in it some info about open tables,
+ does some flushing on those.
+
+ Does the allocation because the caller cannot know the size itself.
+ Memory freeing is to be done by the caller (if the "str" member of the
+ LEX_STRING is not NULL).
+ The caller is taking a checkpoint.
+
+ @param[out] str pointer to where the allocated buffer,
+ and its size, will be put; buffer will be filled
+ with info about open tables
+ @param checkpoint_start_log_horizon Of the in-progress checkpoint
+ record.
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
+{
+ MARIA_SHARE **distinct_shares= NULL;
+ char *ptr;
+ uint error= 1, sync_error= 0, nb, nb_stored, i;
+ my_bool unmark_tables= TRUE;
+ uint total_names_length;
+ LIST *pos; /**< to iterate over open tables */
+ struct st_state_copy {
+ uint index;
+ MARIA_STATE_INFO state;
+ };
+ struct st_state_copy *state_copies= NULL, /**< fixed-size cache of states */
+ *state_copies_end, /**< cache ends here */
+ *state_copy; /**< iterator in cache */
+ TRANSLOG_ADDRESS state_copies_horizon; /**< horizon of states' _copies_ */
+ DBUG_ENTER("collect_tables");
+
+ /* let's make a list of distinct shares */
+ pthread_mutex_lock(&THR_LOCK_maria);
+ for (nb= 0, pos= maria_open_list; pos; pos= pos->next)
+ {
+ MARIA_HA *info= (MARIA_HA*)pos->data;
+ MARIA_SHARE *share= info->s;
+ /* the first three variables below can never change */
+ if (share->base.born_transactional && !share->temporary &&
+ share->mode != O_RDONLY &&
+ !(share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP))
+ {
+ /*
+ Why we didn't take intern_lock above: table had in_checkpoint==0 so no
+ thread could set in_checkpoint. And no thread needs to know that we
+ are setting in_checkpoint, because only maria_close() needs it and
+ cannot run now as we hold THR_LOCK_maria.
+ */
+ /*
+ This table is relevant for checkpoint and not already seen. Mark it,
+ so that it is not seen again in the loop.
+ */
+ nb++;
+ DBUG_ASSERT(share->in_checkpoint == 0);
+ /* This flag ensures that we count only _distinct_ shares. */
+ share->in_checkpoint= MARIA_CHECKPOINT_SEEN_IN_LOOP;
+ }
+ }
+ if (unlikely((distinct_shares=
+ (MARIA_SHARE **)my_malloc(nb * sizeof(MARIA_SHARE *),
+ MYF(MY_WME))) == NULL))
+ goto err;
+ for (total_names_length= 0, i= 0, pos= maria_open_list; pos; pos= pos->next)
+ {
+ MARIA_HA *info= (MARIA_HA*)pos->data;
+ MARIA_SHARE *share= info->s;
+ if (share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP)
+ {
+ distinct_shares[i++]= share;
+ /*
+ With this we prevent the share from going away while we later flush
+ and force it without holding THR_LOCK_maria. For example if the share
+ could be my_free()d by maria_close() we would have a problem when we
+ access it to flush the table. We "pin" the share pointer.
+ And we also take down MARIA_CHECKPOINT_SEEN_IN_LOOP, so that it is
+ not seen again in the loop.
+ */
+ share->in_checkpoint= MARIA_CHECKPOINT_LOOKS_AT_ME;
+ /** @todo avoid strlen() */
+ total_names_length+= strlen(share->open_file_name);
+ }
+ }
+
+ DBUG_ASSERT(i == nb);
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ DBUG_PRINT("info",("found %u table shares", nb));
+
+ str->length=
+ 4 + /* number of tables */
+ (2 + /* short id */
+ 4 + /* kfile */
+ 4 + /* dfile */
+ LSN_STORE_SIZE + /* first_log_write_at_lsn */
+ 1 /* end-of-name 0 */
+ ) * nb + total_names_length;
+ if (unlikely((str->str= my_malloc(str->length, MYF(MY_WME))) == NULL))
+ goto err;
+
+ ptr= str->str;
+ ptr+= 4; /* real number of stored tables is not yet know */
+
+ struct st_filter_param filter_param;
+ /* only possible checkpointer, so can do the read below without mutex */
+ filter_param.up_to_lsn= last_checkpoint_lsn;
+ PAGECACHE_FILTER filter;
+ switch(checkpoint_in_progress)
+ {
+ case CHECKPOINT_MEDIUM:
+ filter= &filter_flush_data_file_medium;
+ break;
+ case CHECKPOINT_FULL:
+ filter= &filter_flush_data_file_full;
+ break;
+ case CHECKPOINT_INDIRECT:
+ filter= &filter_flush_data_file_indirect;
+ break;
+ default:
+ DBUG_ASSERT(0);
+ goto err;
+ }
+
+ /*
+ The principle of reading/writing the state below is explained in
+ ma_recovery.c, look for "Recovery of the state".
+ */
+#define STATE_COPIES 1024
+ state_copies= (struct st_state_copy *)
+ my_malloc(STATE_COPIES * sizeof(struct st_state_copy), MYF(MY_WME));
+ dfiles= (PAGECACHE_FILE *)my_realloc((uchar *)dfiles,
+ /* avoid size of 0 for my_realloc */
+ max(1, nb) * sizeof(PAGECACHE_FILE),
+ MYF(MY_WME | MY_ALLOW_ZERO_PTR));
+ kfiles= (PAGECACHE_FILE *)my_realloc((uchar *)kfiles,
+ /* avoid size of 0 for my_realloc */
+ max(1, nb) * sizeof(PAGECACHE_FILE),
+ MYF(MY_WME | MY_ALLOW_ZERO_PTR));
+ if (unlikely((state_copies == NULL) ||
+ (dfiles == NULL) || (kfiles == NULL)))
+ goto err;
+ state_copy= state_copies_end= NULL;
+ dfiles_end= dfiles;
+ kfiles_end= kfiles;
+
+ for (nb_stored= 0, i= 0; i < nb; i++)
+ {
+ MARIA_SHARE *share= distinct_shares[i];
+ PAGECACHE_FILE kfile, dfile;
+ if (!(share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
+ {
+ /* No need for a mutex to read the above, only us can write this flag */
+ continue;
+ }
+ DBUG_PRINT("info",("looking at table '%s'", share->open_file_name));
+ if (state_copy == state_copies_end) /* we have no more cached states */
+ {
+ /*
+ Collect and cache a bunch of states. We do this for many states at a
+ time, to not lock/unlock the log's lock too often.
+ */
+ uint j, bound= min(nb, i + STATE_COPIES);
+ state_copy= state_copies;
+ /* part of the state is protected by log's lock */
+ translog_lock();
+ state_copies_horizon= translog_get_horizon_no_lock();
+ for (j= i; j < bound; j++)
+ {
+ MARIA_SHARE *share2= distinct_shares[j];
+ if (!(share2->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
+ continue;
+ state_copy->index= j;
+ state_copy->state= share2->state; /* we copy the state */
+ state_copy++;
+ /*
+ data_file_length is not updated under log's lock by the bitmap
+ code, but writing a wrong data_file_length is ok: a next
+ maria_close() will correct it; if we crash before, Recovery will
+ set it to the true physical size.
+ */
+ }
+ translog_unlock();
+ state_copies_end= state_copy;
+ state_copy= state_copies;
+ /* so now we have cached states */
+ }
+
+ /* locate our state among these cached ones */
+ for ( ; state_copy->index != i; state_copy++)
+ DBUG_ASSERT(state_copy < state_copies_end);
+
+ filter_param.pages_covered_by_bitmap= share->bitmap.pages_covered;
+ /* OS file descriptors are ints which we stored in 4 bytes */
+ compile_time_assert(sizeof(int) == 4);
+ pthread_mutex_lock(&share->intern_lock);
+ /*
+ Tables in a normal state have their two file descriptors open.
+ In some rare cases like REPAIR, some descriptor may be closed or even
+ -1. If that happened, the _ma_state_info_write() may fail. This is
+ prevented by enclosing all all places which close/change kfile.file with
+ intern_lock.
+ */
+ kfile= share->kfile;
+ dfile= share->bitmap.file;
+ /*
+ Ignore table which has no logged writes (all its future log records will
+ be found naturally by Recovery). Ignore obsolete shares (_before_
+ setting themselves to last_version=0 they already did all flush and
+ sync; if we flush their state now we may be flushing an obsolete state
+ onto a newer one (assuming the table has been reopened with a different
+ share but of course same physical index file).
+ */
+ if ((share->id != 0) && (share->last_version != 0))
+ {
+ /** @todo avoid strlen */
+ uint open_file_name_len= strlen(share->open_file_name) + 1;
+ /* remember the descriptors for background flush */
+ *(dfiles_end++)= dfile;
+ *(kfiles_end++)= kfile;
+ /* we will store this table in the record */
+ nb_stored++;
+ int2store(ptr, share->id);
+ ptr+= 2;
+ /*
+ We must store the OS file descriptors, because the pagecache, which
+ tells us the list of dirty pages, refers to these pages by OS file
+ descriptors. An alternative is to make the page cache aware of the
+ 2-byte id and of the location of a page ("is it a data file page or an
+ index file page?").
+ If one descriptor is -1, normally there should be no dirty pages
+ collected for this file, it's ok to store -1, it will not be used.
+ */
+ int4store(ptr, kfile.file);
+ ptr+= 4;
+ int4store(ptr, dfile.file);
+ ptr+= 4;
+ lsn_store(ptr, share->lsn_of_file_id);
+ ptr+= LSN_STORE_SIZE;
+ /*
+ first_bitmap_with_space is not updated under log's lock, and is
+ important. We would need the bitmap's lock to get it right. Recovery
+ of this is not clear, so we just play safe: write it out as
+ unknown: if crash, _ma_bitmap_init() at next open (for example in
+ Recovery) will convert it to 0 and thus the first insertion will
+ search for free space from the file's first bitmap (0) -
+ under-optimal but safe.
+ If no crash, maria_close() will write the exact value.
+ */
+ state_copy->state.first_bitmap_with_space= ~(ulonglong)0;
+ memcpy(ptr, share->open_file_name, open_file_name_len);
+ ptr+= open_file_name_len;
+ if (cmp_translog_addr(share->state.is_of_horizon,
+ checkpoint_start_log_horizon) >= 0)
+ {
+ /*
+ State was flushed recently, it does not hold down the log's
+ low-water mark and will not give avoidable work to Recovery. So we
+ needn't flush it. Also, it is possible that while we copied the
+ state above (under log's lock, without intern_lock) it was being
+ modified in memory or flushed to disk (without log's lock, under
+ intern_lock, like in maria_extra()), so our copy may be incorrect
+ and we should not flush it.
+ It may also be a share which got last_version==0 since we checked
+ last_version; in this case, it flushed its state and the LSN test
+ above will catch it.
+ */
+ }
+ else
+ {
+ /*
+ We could do the state flush only if share->changed, but it's
+ tricky.
+ Consider a maria_write() which has written REDO,UNDO, and before it
+ calls _ma_writeinfo() (setting share->changed=1), checkpoint
+ happens and sees share->changed=0, does not flush state. It is
+ possible that Recovery does not start from before the REDO and thus
+ the state is not recovered. A solution may be to set
+ share->changed=1 under log mutex when writing log records.
+ But as anyway we have another problem below, this optimization would
+ be of little use.
+ */
+ /** @todo flush state only if changed since last checkpoint */
+ DBUG_ASSERT(share->last_version != 0);
+ state_copy->state.is_of_horizon= share->state.is_of_horizon=
+ state_copies_horizon;
+ if (kfile.file >= 0)
+ sync_error|=
+ _ma_state_info_write_sub(kfile.file, &state_copy->state, 1);
+ /*
+ We don't set share->changed=0 because it may interfere with a
+ concurrent _ma_writeinfo() doing share->changed=1 (cancel its
+ effect). The sad consequence is that we will flush the same state at
+ each checkpoint if the table was once written and then not anymore.
+ */
+ }
+ sync_error|=
+ _ma_flush_bitmap(share); /* after that, all is in page cache */
+ DBUG_ASSERT(share->pagecache == maria_pagecache);
+ }
+ if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
+ {
+ /* maria_close() left us to free the share */
+ pthread_mutex_unlock(&share->intern_lock);
+ pthread_mutex_destroy(&share->intern_lock);
+ my_free((uchar *)share, MYF(0));
+ }
+ else
+ {
+ /* share goes back to normal state */
+ share->in_checkpoint= 0;
+ pthread_mutex_unlock(&share->intern_lock);
+ }
+
+ /*
+ We do the big disk writes out of intern_lock to not block other
+ users of this table (intern_lock is taken at the start and end of
+ every statement). This means that file descriptors may be invalid
+ (files may have been closed for example by HA_EXTRA_PREPARE_FOR_*
+ under Windows, or REPAIR). This should not be a problem as we use
+ MY_IGNORE_BADFD. Descriptors may even point to other files but then
+ the old blocks (of before the close) must have been flushed for sure,
+ so our flush will flush new blocks (of after the latest open) and that
+ should do no harm.
+ */
+ /*
+ If CHECKPOINT_MEDIUM, this big flush below may result in a
+ serious write burst. Realize that all pages dirtied between the
+ last checkpoint and the one we are doing now, will be flushed at
+ next checkpoint, except those evicted by LRU eviction (depending on
+ the size of the page cache compared to the size of the working data
+ set, eviction may be rare or frequent).
+ We avoid that burst by anticipating: those pages are flushed
+ in bunches spanned regularly over the time interval between now and
+ the next checkpoint, by a background thread. Thus the next checkpoint
+ will have only little flushing to do (CHECKPOINT_MEDIUM should thus be
+ only a little slower than CHECKPOINT_INDIRECT).
+ */
+
+ /**
+ @todo we ignore the error because it may be just due a pinned page;
+ we should rather fix the function below to distinguish between
+ pinned page and write error. Then we can turn the warning into an
+ error.
+ */
+ if (((filter_param.is_data_file= TRUE),
+ flush_pagecache_blocks_with_filter(maria_pagecache,
+ &dfile, FLUSH_KEEP,
+ filter, &filter_param)) ||
+ ((filter_param.is_data_file= FALSE),
+ flush_pagecache_blocks_with_filter(maria_pagecache,
+ &kfile, FLUSH_KEEP,
+ filter, &filter_param)))
+ fprintf(stderr, "Maria engine: warning - checkpoint page flush"
+ " failed\n"); /** @todo improve */
+ /*
+ fsyncs the fd, that's the loooong operation (e.g. max 150 fsync
+ per second, so if you have touched 1000 files it's 7 seconds).
+ */
+ sync_error|=
+ my_sync(dfile.file, MYF(MY_WME | MY_IGNORE_BADFD)) |
+ my_sync(kfile.file, MYF(MY_WME | MY_IGNORE_BADFD));
+ /*
+ in case of error, we continue because writing other tables to disk is
+ still useful.
+ */
+ }
+
+ if (sync_error)
+ goto err;
+ /* We maybe over-estimated (due to share->id==0 or last_version==0) */
+ DBUG_ASSERT(str->length >= (uint)(ptr - str->str));
+ str->length= (uint)(ptr - str->str);
+ /*
+ As we support max 65k tables open at a time (2-byte short id), we
+ assume uint is enough for the cumulated length of table names; and
+ LEX_STRING::length is uint.
+ */
+ int4store(str->str, nb_stored);
+ error= unmark_tables= 0;
+
+err:
+ if (unlikely(unmark_tables))
+ {
+ /* maria_close() uses THR_LOCK_maria from start to end */
+ pthread_mutex_lock(&THR_LOCK_maria);
+ for (i= 0; i < nb; i++)
+ {
+ MARIA_SHARE *share= distinct_shares[i];
+ if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
+ {
+ /* maria_close() left us to free the share */
+ pthread_mutex_destroy(&share->intern_lock);
+ my_free((uchar *)share, MYF(0));
+ }
+ else
+ {
+ /* share goes back to normal state */
+ share->in_checkpoint= 0;
+ }
+ }
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ }
+ my_free((uchar *)distinct_shares, MYF(MY_ALLOW_ZERO_PTR));
+ my_free((uchar *)state_copies, MYF(MY_ALLOW_ZERO_PTR));
+ DBUG_RETURN(error);
+}
diff --git a/storage/maria/ma_checkpoint.h b/storage/maria/ma_checkpoint.h
new file mode 100644
index 00000000000..86f3779ca7a
--- /dev/null
+++ b/storage/maria/ma_checkpoint.h
@@ -0,0 +1,81 @@
+/* Copyright (C) 2006,2007 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ WL#3071 Maria checkpoint
+ First version written by Guilhem Bichot on 2006-04-27.
+ Does not compile yet.
+*/
+
+/* This is the interface of this module. */
+
+typedef enum enum_ma_checkpoint_level {
+ CHECKPOINT_NONE= 0,
+ /* just write dirty_pages, transactions table and sync files */
+ CHECKPOINT_INDIRECT,
+ /* also flush all dirty pages which were already dirty at prev checkpoint */
+ CHECKPOINT_MEDIUM,
+ /* also flush all dirty pages */
+ CHECKPOINT_FULL
+} CHECKPOINT_LEVEL;
+
+C_MODE_START
+int ma_checkpoint_init(my_bool create_background_thread);
+void ma_checkpoint_end(void);
+int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait);
+C_MODE_END
+
+/**
+ @brief reads some LSNs with special trickery
+
+ If a 64-bit variable transitions between both halves being zero to both
+ halves being non-zero, and back, this function can be used to do a read of
+ it (without mutex, without atomic load) which always produces a correct
+ (though maybe slightly old) value (even on 32-bit CPUs). The value is at
+ least as new as the latest mutex unlock done by the calling thread.
+ The assumption is that the system sets both 4-byte halves either at the
+ same time, or one after the other (in any order), but NOT some bytes of the
+ first half then some bytes of the second half then the rest of bytes of the
+ first half. With this assumption, the function can detect when it is
+ seeing an inconsistent value.
+
+ @param LSN pointer to the LSN variable to read
+
+ @return LSN part (most significant byte always 0)
+*/
+#if ( SIZEOF_CHARP >= 8 )
+/* 64-bit CPU, 64-bit reads are atomic */
+#define lsn_read_non_atomic LSN_WITH_FLAGS_TO_LSN
+#else
+static inline LSN lsn_read_non_atomic_32(const volatile LSN *x)
+{
+ /*
+ 32-bit CPU, 64-bit reads may give a mixed of old half and new half (old
+ low bits and new high bits, or the contrary).
+ */
+ for (;;) /* loop until no atomicity problems */
+ {
+ /*
+ Remove most significant byte in case this is a LSN_WITH_FLAGS object.
+ Those flags in TRN::first_undo_lsn break the condition on transitions so
+ they must be removed below.
+ */
+ LSN y= LSN_WITH_FLAGS_TO_LSN(*x);
+ if (likely((y == LSN_IMPOSSIBLE) || LSN_VALID(y)))
+ return y;
+ }
+}
+#define lsn_read_non_atomic(x) lsn_read_non_atomic_32(&x)
+#endif
diff --git a/storage/maria/ma_checksum.c b/storage/maria/ma_checksum.c
new file mode 100644
index 00000000000..9076b3ebb86
--- /dev/null
+++ b/storage/maria/ma_checksum.c
@@ -0,0 +1,72 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Calculate a checksum for a row */
+
+#include "maria_def.h"
+
+ha_checksum _ma_checksum(MARIA_HA *info, const uchar *record)
+{
+ ha_checksum crc=0;
+ MARIA_COLUMNDEF *column= info->s->columndef;
+ MARIA_COLUMNDEF *column_end= column+ info->s->base.fields;
+
+ if (info->s->base.null_bytes)
+ crc= my_checksum(crc, record, info->s->base.null_bytes);
+
+ for ( ; column != column_end ; column++)
+ {
+ const uchar *pos= record + column->offset;
+ ulong length;
+
+ if (record[column->null_pos] & column->null_bit)
+ continue; /* Null field */
+
+ switch (column->type) {
+ case FIELD_BLOB:
+ {
+ uint blob_size_length= column->length- portable_sizeof_char_ptr;
+ length= _ma_calc_blob_length(blob_size_length, pos);
+ if (length)
+ {
+ memcpy((char*) &pos, pos + blob_size_length, sizeof(char*));
+ crc= my_checksum(crc, pos, length);
+ }
+ continue;
+ }
+ case FIELD_VARCHAR:
+ {
+ uint pack_length= column->fill_length;
+ if (pack_length == 1)
+ length= (ulong) *(uchar*) pos;
+ else
+ length= uint2korr(pos);
+ pos+= pack_length; /* Skip length information */
+ break;
+ }
+ default:
+ length= column->length;
+ break;
+ }
+ crc= my_checksum(crc, pos, length);
+ }
+ return crc;
+}
+
+
+ha_checksum _ma_static_checksum(MARIA_HA *info, const uchar *pos)
+{
+ return my_checksum(0, pos, info->s->base.reclength);
+}
diff --git a/storage/maria/ma_close.c b/storage/maria/ma_close.c
new file mode 100644
index 00000000000..9b654803945
--- /dev/null
+++ b/storage/maria/ma_close.c
@@ -0,0 +1,156 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* close a isam-database */
+/*
+ TODO:
+ We need to have a separate mutex on the closed file to allow other threads
+ to open other files during the time we flush the cache and close this file
+*/
+
+#include "maria_def.h"
+
+int maria_close(register MARIA_HA *info)
+{
+ int error=0,flag;
+ my_bool share_can_be_freed= FALSE;
+ MARIA_SHARE *share=info->s;
+ DBUG_ENTER("maria_close");
+ DBUG_PRINT("enter",("base: 0x%lx reopen: %u locks: %u",
+ (long) info, (uint) share->reopen,
+ (uint) share->tot_locks));
+
+ pthread_mutex_lock(&THR_LOCK_maria);
+ if (info->lock_type == F_EXTRA_LCK)
+ info->lock_type=F_UNLCK; /* HA_EXTRA_NO_USER_CHANGE */
+
+ if (share->reopen == 1 && share->kfile.file >= 0)
+ _ma_decrement_open_count(info);
+
+ if (info->lock_type != F_UNLCK)
+ {
+ if (maria_lock_database(info,F_UNLCK))
+ error=my_errno;
+ }
+ pthread_mutex_lock(&share->intern_lock);
+
+ if (share->options & HA_OPTION_READ_ONLY_DATA)
+ {
+ share->r_locks--;
+ share->tot_locks--;
+ }
+ if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
+ {
+ if (end_io_cache(&info->rec_cache))
+ error=my_errno;
+ info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+ }
+ flag= !--share->reopen;
+ maria_open_list=list_delete(maria_open_list,&info->open_list);
+
+ my_free(info->rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+ (*share->end)(info);
+
+ if (flag)
+ {
+ /* Last close of file; Flush everything */
+ if (share->kfile.file >= 0)
+ {
+ if ((*share->once_end)(share))
+ error= my_errno;
+ if (flush_pagecache_blocks(share->pagecache, &share->kfile,
+ (share->temporary ?
+ FLUSH_IGNORE_CHANGED :
+ FLUSH_RELEASE)))
+ error= my_errno;
+#ifdef HAVE_MMAP
+ if (share->file_map)
+ _ma_unmap_file(info);
+#endif
+ /*
+ If we are crashed, we can safely flush the current state as it will
+ not change the crashed state.
+ We can NOT write the state in other cases as other threads
+ may be using the file at this point
+ IF using --external-locking, which does not apply to Maria.
+ */
+ if ((share->changed && share->base.born_transactional) ||
+ (share->mode != O_RDONLY && maria_is_crashed(info)))
+ {
+ /*
+ State must be written to file as it was not done at table's
+ unlocking.
+ */
+ if (_ma_state_info_write(share, 1))
+ error= my_errno;
+ }
+ /*
+ File must be synced as it is going out of the maria_open_list and so
+ becoming unknown to future Checkpoints.
+ */
+ if (my_sync(share->kfile.file, MYF(MY_WME)))
+ error= my_errno;
+ if (my_close(share->kfile.file, MYF(0)))
+ error= my_errno;
+ }
+#ifdef THREAD
+ thr_lock_delete(&share->lock);
+ {
+ int i,keys;
+ keys = share->state.header.keys;
+ VOID(rwlock_destroy(&share->mmap_lock));
+ for(i=0; i<keys; i++) {
+ VOID(rwlock_destroy(&share->key_root_lock[i]));
+ }
+ }
+#endif
+ DBUG_ASSERT(share->now_transactional == share->base.born_transactional);
+ if (share->in_checkpoint == MARIA_CHECKPOINT_LOOKS_AT_ME)
+ {
+ share->kfile.file= -1; /* because Checkpoint does not need to flush */
+ /* we cannot my_free() the share, Checkpoint would see a bad pointer */
+ share->in_checkpoint|= MARIA_CHECKPOINT_SHOULD_FREE_ME;
+ }
+ else
+ share_can_be_freed= TRUE;
+ }
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ pthread_mutex_unlock(&share->intern_lock);
+ if (share_can_be_freed)
+ {
+ VOID(pthread_mutex_destroy(&share->intern_lock));
+ my_free((uchar *)share, MYF(0));
+ }
+ if (info->ftparser_param)
+ {
+ my_free((uchar*)info->ftparser_param, MYF(0));
+ info->ftparser_param= 0;
+ }
+ if (info->dfile.file >= 0)
+ {
+ /*
+ This is outside of mutex so would confuse a concurrent
+ Checkpoint. Fortunately in BLOCK_RECORD we close earlier under mutex.
+ */
+ if (my_close(info->dfile.file, MYF(0)))
+ error = my_errno;
+ }
+
+ my_free((uchar*) info,MYF(0));
+
+ if (error)
+ DBUG_RETURN(my_errno= error);
+ DBUG_RETURN(0);
+} /* maria_close */
diff --git a/storage/maria/ma_commit.c b/storage/maria/ma_commit.c
new file mode 100644
index 00000000000..36ea2f6e6e4
--- /dev/null
+++ b/storage/maria/ma_commit.c
@@ -0,0 +1,124 @@
+/* Copyright (C) 2007 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+#include "trnman.h"
+
+/**
+ @brief writes a COMMIT record to log and commits transaction in memory
+
+ @param trn transaction
+
+ @return Operation status
+ @retval 0 ok
+ @retval 1 error (disk error or out of memory)
+*/
+
+int ma_commit(TRN *trn)
+{
+ int res;
+ LSN commit_lsn;
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS];
+ DBUG_ENTER("ma_commit");
+
+ if (trn->undo_lsn == 0) /* no work done, rollback (cheaper than commit) */
+ DBUG_RETURN(trnman_rollback_trn(trn));
+ /*
+ - if COMMIT record is written before trnman_commit_trn():
+ if Checkpoint comes in the middle it will see trn is not committed,
+ then if crash, Recovery might roll back trn (if min(rec_lsn) is after
+ COMMIT record) and this is not an issue as
+ * transaction's updates were not made visible to other transactions
+ * "commit ok" was not sent to client
+ Alternatively, Recovery might commit trn (if min(rec_lsn) is before COMMIT
+ record), which is ok too. All in all it means that "trn committed" is not
+ 100% equal to "COMMIT record written".
+ - if COMMIT record is written after trnman_commit_trn():
+ if crash happens between the two, trn will be rolled back which is an
+ issue (transaction's updates were made visible to other transactions).
+ So we need to go the first way.
+ */
+
+ /**
+ @todo RECOVERY share's state is written to disk only in
+ maria_lock_database(), so COMMIT record is not the last record of the
+ transaction! It is probably an issue. Recovery of the state is a problem
+ not yet solved.
+ */
+ /*
+ We do not store "thd->transaction.xid_state.xid" for now, it will be
+ needed only when we support XA.
+ */
+ res= (translog_write_record(&commit_lsn, LOGREC_COMMIT,
+ trn, NULL, 0,
+ sizeof(log_array)/sizeof(log_array[0]),
+ log_array, NULL) ||
+ translog_flush(commit_lsn) ||
+ trnman_commit_trn(trn));
+ /*
+ Note: if trnman_commit_trn() fails above, we have already
+ written the COMMIT record, so Checkpoint and Recovery will see the
+ transaction as committed.
+ */
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Writes a COMMIT record for a transaciton associated with a file
+
+ @param info Maria handler
+
+ @return Operation status
+ @retval 0 ok
+ @retval # error (disk error or out of memory)
+*/
+
+int maria_commit(MARIA_HA *info)
+{
+ return info->s->now_transactional ? ma_commit(info->trn) : 0;
+}
+
+
+/**
+ @brief Starts a transaction on a file handle
+
+ @param info Maria handler
+
+ @return Operation status
+ @retval 0 ok
+ @retval # Error code.
+*/
+
+
+int maria_begin(MARIA_HA *info)
+{
+ DBUG_ENTER("maria_begin");
+
+ if (info->s->now_transactional)
+ {
+ TRN *trn;
+ struct st_my_thread_var *mysys_var= my_thread_var;
+ trn= trnman_new_trn(&mysys_var->mutex,
+ &mysys_var->suspend,
+ (char*) &mysys_var + STACK_DIRECTION *1024*128);
+ if (unlikely(!trn))
+ DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+
+ DBUG_PRINT("info", ("TRN set to 0x%lx", (ulong) trn));
+ info->trn= trn;
+ }
+ DBUG_RETURN(0);
+}
diff --git a/storage/maria/ma_commit.h b/storage/maria/ma_commit.h
new file mode 100644
index 00000000000..2c57c73fd7a
--- /dev/null
+++ b/storage/maria/ma_commit.h
@@ -0,0 +1,18 @@
+/* Copyright (C) 2007 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+C_MODE_START
+int ma_commit(TRN *trn);
+C_MODE_END
diff --git a/storage/maria/ma_control_file.c b/storage/maria/ma_control_file.c
new file mode 100644
index 00000000000..3816830d9e1
--- /dev/null
+++ b/storage/maria/ma_control_file.c
@@ -0,0 +1,325 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ WL#3234 Maria control file
+ First version written by Guilhem Bichot on 2006-04-27.
+ Does not compile yet.
+*/
+
+#include "maria_def.h"
+
+/* Here is the implementation of this module */
+
+/*
+ a control file contains 3 objects: magic string, LSN of last checkpoint,
+ number of last log.
+*/
+
+/* total size should be < sector size for atomic write operation */
+#define CONTROL_FILE_MAGIC_STRING "\xfe\xfe\xc\1MACF"
+#define CONTROL_FILE_MAGIC_STRING_OFFSET 0
+#define CONTROL_FILE_MAGIC_STRING_SIZE (sizeof(CONTROL_FILE_MAGIC_STRING)-1)
+#define CONTROL_FILE_CHECKSUM_OFFSET (CONTROL_FILE_MAGIC_STRING_OFFSET + CONTROL_FILE_MAGIC_STRING_SIZE)
+#define CONTROL_FILE_CHECKSUM_SIZE 4
+#define CONTROL_FILE_LSN_OFFSET (CONTROL_FILE_CHECKSUM_OFFSET + CONTROL_FILE_CHECKSUM_SIZE)
+#define CONTROL_FILE_LSN_SIZE LSN_STORE_SIZE
+#define CONTROL_FILE_FILENO_OFFSET (CONTROL_FILE_LSN_OFFSET + CONTROL_FILE_LSN_SIZE)
+#define CONTROL_FILE_FILENO_SIZE 4
+#define CONTROL_FILE_SIZE (CONTROL_FILE_FILENO_OFFSET + CONTROL_FILE_FILENO_SIZE)
+
+/* This module owns these two vars. */
+/**
+ This LSN serves for the two-checkpoint rule, and also to find the
+ checkpoint record when doing a recovery.
+*/
+LSN last_checkpoint_lsn= LSN_IMPOSSIBLE;
+uint32 last_logno= FILENO_IMPOSSIBLE;
+
+/**
+ @brief If log's lock should be asserted when writing to control file.
+
+ Can be re-used by any function which needs to be thread-safe except when
+ it is called at startup.
+*/
+my_bool maria_multi_threaded= FALSE;
+/** @brief if currently doing a recovery */
+my_bool maria_in_recovery= FALSE;
+
+/*
+ Control file is less then 512 bytes (a disk sector),
+ to be as atomic as possible
+*/
+static int control_file_fd= -1;
+
+/*
+ @brief Initialize control file subsystem
+
+ Looks for the control file. If none and creation is requested, creates file.
+ If present, reads it to find out last checkpoint's LSN and last log, updates
+ the last_checkpoint_lsn and last_logno global variables.
+ Called at engine's start.
+
+ @note
+ The format of the control file is:
+ 4 bytes: magic string
+ 4 bytes: checksum of the following bytes
+ 4 bytes: number of log where last checkpoint is
+ 4 bytes: offset in log where last checkpoint is
+ 4 bytes: number of last log
+
+ @note If in recovery, file is not created
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error (in which case the file is left closed)
+*/
+CONTROL_FILE_ERROR ma_control_file_create_or_open()
+{
+ char buffer[CONTROL_FILE_SIZE];
+ char name[FN_REFLEN];
+ MY_STAT stat_buff;
+ my_bool create_file;
+ int open_flags= O_BINARY | /*O_DIRECT |*/ O_RDWR;
+ int error= CONTROL_FILE_UNKNOWN_ERROR;
+ DBUG_ENTER("ma_control_file_create_or_open");
+
+ /*
+ If you change sizes in the #defines, you at least have to change the
+ "*store" and "*korr" calls in this file, and can even create backward
+ compatibility problems. Beware!
+ */
+ DBUG_ASSERT(CONTROL_FILE_LSN_SIZE == (3+4));
+ DBUG_ASSERT(CONTROL_FILE_FILENO_SIZE == 4);
+
+ if (control_file_fd >= 0) /* already open */
+ DBUG_RETURN(0);
+
+ if (fn_format(name, CONTROL_FILE_BASE_NAME,
+ maria_data_root, "", MYF(MY_WME)) == NullS)
+ DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR);
+
+ create_file= test(my_access(name,F_OK));
+
+ if (create_file)
+ {
+ /* in a recovery, we expect to find a control file */
+ if (maria_in_recovery)
+ DBUG_RETURN(CONTROL_FILE_MISSING);
+ if ((control_file_fd= my_create(name, 0,
+ open_flags, MYF(MY_SYNC_DIR))) < 0)
+ DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR);
+
+ /*
+ To be safer we should make sure that there are no logs or data/index
+ files around (indeed it could be that the control file alone was deleted
+ or not restored, and we should not go on with life at this point).
+
+ TODO: For now we trust (this is alpha version), but for beta if would
+ be great to verify.
+
+ We could have a tool which can rebuild the control file, by reading the
+ directory of logs, finding the newest log, reading it to find last
+ checkpoint... Slow but can save your db. For this to be possible, we
+ must always write to the control file right after writing the checkpoint
+ log record, and do nothing in between (i.e. the checkpoint must be
+ usable as soon as it has been written to the log).
+ */
+
+ /* init the file with these "undefined" values */
+ DBUG_RETURN(ma_control_file_write_and_force(LSN_IMPOSSIBLE,
+ FILENO_IMPOSSIBLE,
+ CONTROL_FILE_UPDATE_ALL));
+ }
+
+ /* Otherwise, file exists */
+
+ if ((control_file_fd= my_open(name, open_flags, MYF(MY_WME))) < 0)
+ goto err;
+
+ if (my_stat(name, &stat_buff, MYF(MY_WME)) == NULL)
+ goto err;
+
+ if ((uint)stat_buff.st_size < CONTROL_FILE_SIZE)
+ {
+ /*
+ Given that normally we write only a sector and it's atomic, the only
+ possibility for a file to be of too short size is if we crashed at the
+ very first startup, between file creation and file write. Quite unlikely
+ (and can be made even more unlikely by doing this: create a temp file,
+ write it, and then rename it to be the control file).
+ What's more likely is if someone forgot to restore the control file,
+ just did a "touch control" to try to get Maria to start, or if the
+ disk/filesystem has a problem.
+ So let's be rigid.
+ */
+ /*
+ TODO: store a message "too small file" somewhere, so that it goes to
+ MySQL's error log at startup.
+ */
+ error= CONTROL_FILE_TOO_SMALL;
+ goto err;
+ }
+
+ if ((uint)stat_buff.st_size > CONTROL_FILE_SIZE)
+ {
+ /* TODO: store "too big file" message */
+ error= CONTROL_FILE_TOO_BIG;
+ goto err;
+ }
+
+ if (my_read(control_file_fd, buffer, CONTROL_FILE_SIZE,
+ MYF(MY_FNABP | MY_WME)))
+ goto err;
+ if (memcmp(buffer + CONTROL_FILE_MAGIC_STRING_OFFSET,
+ CONTROL_FILE_MAGIC_STRING, CONTROL_FILE_MAGIC_STRING_SIZE))
+ {
+ /* TODO: store message "bad magic string" somewhere */
+ error= CONTROL_FILE_BAD_MAGIC_STRING;
+ goto err;
+ }
+ if (my_checksum(0, buffer + CONTROL_FILE_LSN_OFFSET,
+ CONTROL_FILE_SIZE - CONTROL_FILE_LSN_OFFSET) !=
+ uint4korr(buffer + CONTROL_FILE_CHECKSUM_OFFSET))
+ {
+ /* TODO: store message "checksum mismatch" somewhere */
+ error= CONTROL_FILE_BAD_CHECKSUM;
+ goto err;
+ }
+ last_checkpoint_lsn= lsn_korr(buffer + CONTROL_FILE_LSN_OFFSET);
+ last_logno= uint4korr(buffer + CONTROL_FILE_FILENO_OFFSET);
+
+ DBUG_RETURN(0);
+err:
+ ma_control_file_end();
+ DBUG_RETURN(error);
+}
+
+
+/*
+ Write information durably to the control file; stores this information into
+ the last_checkpoint_lsn and last_logno global variables.
+ Called when we have created a new log (after syncing this log's creation)
+ and when we have written a checkpoint (after syncing this log record).
+ Variables last_checkpoint_lsn and last_logno must be protected by caller
+ using log's lock, unless this function is called at startup.
+
+ SYNOPSIS
+ ma_control_file_write_and_force()
+ checkpoint_lsn LSN of last checkpoint
+ logno last log file number
+ objs_to_write which of the arguments should be used as new values
+ (for example, CONTROL_FILE_UPDATE_ONLY_LSN will not
+ write the logno argument to the control file and will
+ not update the last_logno global variable); can be:
+ CONTROL_FILE_UPDATE_ALL
+ CONTROL_FILE_UPDATE_ONLY_LSN
+ CONTROL_FILE_UPDATE_ONLY_LOGNO.
+
+ NOTE
+ We always want to do one single my_pwrite() here to be as atomic as
+ possible.
+
+ RETURN
+ 0 - OK
+ 1 - Error
+*/
+
+int ma_control_file_write_and_force(const LSN checkpoint_lsn, uint32 logno,
+ uint objs_to_write)
+{
+ char buffer[CONTROL_FILE_SIZE];
+ my_bool update_checkpoint_lsn= FALSE, update_logno= FALSE;
+ DBUG_ENTER("ma_control_file_write_and_force");
+
+ DBUG_ASSERT(control_file_fd >= 0); /* must be open */
+#ifndef DBUG_OFF
+ if (maria_multi_threaded)
+ translog_lock_assert_owner();
+#endif
+
+ memcpy(buffer + CONTROL_FILE_MAGIC_STRING_OFFSET,
+ CONTROL_FILE_MAGIC_STRING, CONTROL_FILE_MAGIC_STRING_SIZE);
+
+ if (objs_to_write == CONTROL_FILE_UPDATE_ONLY_LSN)
+ update_checkpoint_lsn= TRUE;
+ else if (objs_to_write == CONTROL_FILE_UPDATE_ONLY_LOGNO)
+ update_logno= TRUE;
+ else if (objs_to_write == CONTROL_FILE_UPDATE_ALL)
+ update_checkpoint_lsn= update_logno= TRUE;
+ else /* incorrect value of objs_to_write */
+ DBUG_ASSERT(0);
+
+ if (update_checkpoint_lsn)
+ lsn_store(buffer + CONTROL_FILE_LSN_OFFSET, checkpoint_lsn);
+ else /* store old value == change nothing */
+ lsn_store(buffer + CONTROL_FILE_LSN_OFFSET, last_checkpoint_lsn);
+
+ if (update_logno)
+ int4store(buffer + CONTROL_FILE_FILENO_OFFSET, logno);
+ else
+ int4store(buffer + CONTROL_FILE_FILENO_OFFSET, last_logno);
+
+ {
+ uint32 sum= (uint32)
+ my_checksum(0, buffer + CONTROL_FILE_LSN_OFFSET,
+ CONTROL_FILE_SIZE - CONTROL_FILE_LSN_OFFSET);
+ int4store(buffer + CONTROL_FILE_CHECKSUM_OFFSET, sum);
+ }
+
+ if (my_pwrite(control_file_fd, buffer, sizeof(buffer),
+ 0, MYF(MY_FNABP | MY_WME)) ||
+ my_sync(control_file_fd, MYF(MY_WME)))
+ DBUG_RETURN(1);
+
+ if (update_checkpoint_lsn)
+ last_checkpoint_lsn= checkpoint_lsn;
+ if (update_logno)
+ last_logno= logno;
+
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Free resources taken by control file subsystem
+
+ SYNOPSIS
+ ma_control_file_end()
+*/
+
+int ma_control_file_end()
+{
+ int close_error;
+ DBUG_ENTER("ma_control_file_end");
+
+ if (control_file_fd < 0) /* already closed */
+ DBUG_RETURN(0);
+
+ close_error= my_close(control_file_fd, MYF(MY_WME));
+ /*
+ As my_close() frees structures even if close() fails, we do the same,
+ i.e. we mark the file as closed in all cases.
+ */
+ control_file_fd= -1;
+ /*
+ As this module owns these variables, closing the module forbids access to
+ them (just a safety):
+ */
+ last_checkpoint_lsn= LSN_IMPOSSIBLE;
+ last_logno= FILENO_IMPOSSIBLE;
+
+ DBUG_RETURN(close_error);
+}
diff --git a/storage/maria/ma_control_file.h b/storage/maria/ma_control_file.h
new file mode 100644
index 00000000000..88a1780543a
--- /dev/null
+++ b/storage/maria/ma_control_file.h
@@ -0,0 +1,80 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ WL#3234 Maria control file
+ First version written by Guilhem Bichot on 2006-04-27.
+*/
+
+#ifndef _ma_control_file_h
+#define _ma_control_file_h
+
+#define CONTROL_FILE_BASE_NAME "maria_log_control"
+
+/* Here is the interface of this module */
+
+/*
+ LSN of the last checkoint
+ (if last_checkpoint_lsn == LSN_IMPOSSIBLE then there was never a checkpoint)
+*/
+extern LSN last_checkpoint_lsn;
+/*
+ Last log number (if last_logno == FILENO_IMPOSSIBLE then there is no log
+ file yet)
+*/
+extern uint32 last_logno;
+
+extern my_bool maria_multi_threaded, maria_in_recovery;
+
+typedef enum enum_control_file_error {
+ CONTROL_FILE_OK= 0,
+ CONTROL_FILE_TOO_SMALL,
+ CONTROL_FILE_TOO_BIG,
+ CONTROL_FILE_BAD_MAGIC_STRING,
+ CONTROL_FILE_BAD_CHECKSUM,
+ CONTROL_FILE_MISSING,
+ CONTROL_FILE_UNKNOWN_ERROR /* any other error */
+} CONTROL_FILE_ERROR;
+
+#define CONTROL_FILE_UPDATE_ALL 0
+#define CONTROL_FILE_UPDATE_ONLY_LSN 1
+#define CONTROL_FILE_UPDATE_ONLY_LOGNO 2
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ Looks for the control file. If none and creation was requested, creates file.
+ If present, reads it to find out last checkpoint's LSN and last log.
+ Called at engine's start.
+*/
+CONTROL_FILE_ERROR ma_control_file_create_or_open();
+/*
+ Write information durably to the control file.
+ Called when we have created a new log (after syncing this log's creation)
+ and when we have written a checkpoint (after syncing this log record).
+*/
+int ma_control_file_write_and_force(const LSN checkpoint_lsn, uint32 logno,
+ uint objs_to_write);
+
+
+/* Free resources taken by control file subsystem */
+int ma_control_file_end();
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/storage/maria/ma_create.c b/storage/maria/ma_create.c
new file mode 100644
index 00000000000..ba1d9a13b42
--- /dev/null
+++ b/storage/maria/ma_create.c
@@ -0,0 +1,1279 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Create a MARIA table */
+
+#include "ma_ftdefs.h"
+#include "ma_sp_defs.h"
+#include <my_bit.h>
+#include "ma_blockrec.h"
+#include "trnman_public.h"
+
+#if defined(MSDOS) || defined(__WIN__)
+#ifdef __WIN__
+#include <fcntl.h>
+#else
+#include <process.h> /* Prototype for getpid */
+#endif
+#endif
+#include <m_ctype.h>
+
+static int compare_columns(MARIA_COLUMNDEF **a, MARIA_COLUMNDEF **b);
+
+/*
+ Old options is used when recreating database, from maria_chk
+*/
+
+int maria_create(const char *name, enum data_file_type datafile_type,
+ uint keys,MARIA_KEYDEF *keydefs,
+ uint columns, MARIA_COLUMNDEF *columndef,
+ uint uniques, MARIA_UNIQUEDEF *uniquedefs,
+ MARIA_CREATE_INFO *ci,uint flags)
+{
+ register uint i,j;
+ File dfile,file;
+ int errpos,save_errno, create_mode= O_RDWR | O_TRUNC, res;
+ myf create_flag;
+ uint length,max_key_length,packed,pack_bytes,pointer,real_length_diff,
+ key_length,info_length,key_segs,options,min_key_length_skip,
+ base_pos,long_varchar_count,varchar_length,
+ unique_key_parts,fulltext_keys,offset, not_block_record_extra_length;
+ uint max_field_lengths, extra_header_size;
+ ulong reclength, real_reclength,min_pack_length;
+ char filename[FN_REFLEN], linkname[FN_REFLEN], *linkname_ptr;
+ ulong pack_reclength;
+ ulonglong tot_length,max_rows, tmp;
+ enum en_fieldtype type;
+ enum data_file_type org_datafile_type= datafile_type;
+ MARIA_SHARE share;
+ MARIA_KEYDEF *keydef,tmp_keydef;
+ MARIA_UNIQUEDEF *uniquedef;
+ HA_KEYSEG *keyseg,tmp_keyseg;
+ MARIA_COLUMNDEF *column, *end_column;
+ ulong *rec_per_key_part;
+ my_off_t key_root[HA_MAX_POSSIBLE_KEY], kfile_size_before_extension;
+ MARIA_CREATE_INFO tmp_create_info;
+ my_bool tmp_table= FALSE; /* cache for presence of HA_OPTION_TMP_TABLE */
+ my_bool forced_packed;
+ myf sync_dir= 0;
+ uchar *log_data= NULL;
+ DBUG_ENTER("maria_create");
+ DBUG_PRINT("enter", ("keys: %u columns: %u uniques: %u flags: %u",
+ keys, columns, uniques, flags));
+
+ DBUG_ASSERT(maria_block_size && maria_block_size % IO_SIZE == 0);
+ LINT_INIT(dfile);
+ LINT_INIT(file);
+
+ if (!ci)
+ {
+ bzero((char*) &tmp_create_info,sizeof(tmp_create_info));
+ ci=&tmp_create_info;
+ }
+
+ if (keys + uniques > MARIA_MAX_KEY || columns == 0)
+ {
+ DBUG_RETURN(my_errno=HA_WRONG_CREATE_OPTION);
+ }
+ errpos=0;
+ options=0;
+ bzero((uchar*) &share,sizeof(share));
+
+ if (flags & HA_DONT_TOUCH_DATA)
+ {
+ org_datafile_type= ci->org_data_file_type;
+ if (!(ci->old_options & HA_OPTION_TEMP_COMPRESS_RECORD))
+ options=ci->old_options &
+ (HA_OPTION_COMPRESS_RECORD | HA_OPTION_PACK_RECORD |
+ HA_OPTION_READ_ONLY_DATA | HA_OPTION_CHECKSUM |
+ HA_OPTION_TMP_TABLE | HA_OPTION_DELAY_KEY_WRITE);
+ else
+ {
+ /* Uncompressing rows */
+ options=ci->old_options &
+ (HA_OPTION_CHECKSUM | HA_OPTION_TMP_TABLE | HA_OPTION_DELAY_KEY_WRITE);
+ }
+ }
+
+ if (ci->reloc_rows > ci->max_rows)
+ ci->reloc_rows=ci->max_rows; /* Check if wrong parameter */
+
+ if (!(rec_per_key_part=
+ (ulong*) my_malloc((keys + uniques)*HA_MAX_KEY_SEG*sizeof(long),
+ MYF(MY_WME | MY_ZEROFILL))))
+ DBUG_RETURN(my_errno);
+
+ /* Start by checking fields and field-types used */
+
+ varchar_length=long_varchar_count=packed= not_block_record_extra_length=
+ pack_reclength= max_field_lengths= 0;
+ reclength= min_pack_length= ci->null_bytes;
+ forced_packed= 0;
+
+ for (column= columndef, end_column= column + columns ;
+ column != end_column ;
+ column++)
+ {
+ /* Fill in not used struct parts */
+ column->offset= reclength;
+ column->empty_pos= 0;
+ column->empty_bit= 0;
+ column->fill_length= column->length;
+ if (column->null_bit)
+ options|= HA_OPTION_NULL_FIELDS;
+
+ reclength+= column->length;
+ type= column->type;
+ if (type == FIELD_SKIP_PRESPACE && datafile_type == BLOCK_RECORD)
+ type= FIELD_NORMAL; /* SKIP_PRESPACE not supported */
+
+ if (type != FIELD_NORMAL && type != FIELD_CHECK)
+ {
+ column->empty_pos= packed/8;
+ column->empty_bit= (1 << (packed & 7));
+ if (type == FIELD_BLOB)
+ {
+ forced_packed= 1;
+ packed++;
+ share.base.blobs++;
+ if (pack_reclength != INT_MAX32)
+ {
+ if (column->length == 4+portable_sizeof_char_ptr)
+ pack_reclength= INT_MAX32;
+ else
+ {
+ /* Add max possible blob length */
+ pack_reclength+= (1 << ((column->length-
+ portable_sizeof_char_ptr)*8));
+ }
+ }
+ max_field_lengths+= (column->length - portable_sizeof_char_ptr);
+ }
+ else if (type == FIELD_SKIP_PRESPACE ||
+ type == FIELD_SKIP_ENDSPACE)
+ {
+ forced_packed= 1;
+ max_field_lengths+= column->length > 255 ? 2 : 1;
+ not_block_record_extra_length++;
+ packed++;
+ }
+ else if (type == FIELD_VARCHAR)
+ {
+ varchar_length+= column->length-1; /* Used for min_pack_length */
+ pack_reclength++;
+ not_block_record_extra_length++;
+ max_field_lengths++;
+ packed++;
+ column->fill_length= 1;
+ /* We must test for 257 as length includes pack-length */
+ if (test(column->length >= 257))
+ {
+ long_varchar_count++;
+ max_field_lengths++;
+ column->fill_length= 2;
+ }
+ }
+ else if (type == FIELD_SKIP_ZERO)
+ packed++;
+ else
+ {
+ if (!column->null_bit)
+ min_pack_length+= column->length;
+ else
+ not_block_record_extra_length+= column->length;
+ column->empty_pos= 0;
+ column->empty_bit= 0;
+ }
+ }
+ else /* FIELD_NORMAL */
+ {
+ if (!column->null_bit)
+ {
+ min_pack_length+= column->length;
+ share.base.fixed_not_null_fields++;
+ share.base.fixed_not_null_fields_length+= column->length;
+ }
+ else
+ not_block_record_extra_length+= column->length;
+ }
+ }
+
+ if (datafile_type == STATIC_RECORD && forced_packed)
+ {
+ /* Can't use fixed length records, revert to block records */
+ datafile_type= BLOCK_RECORD;
+ }
+
+ if (datafile_type == DYNAMIC_RECORD)
+ options|= HA_OPTION_PACK_RECORD; /* Must use packed records */
+
+ if (datafile_type == STATIC_RECORD)
+ {
+ /* We can't use checksum with static length rows */
+ flags&= ~HA_CREATE_CHECKSUM;
+ options&= ~HA_OPTION_CHECKSUM;
+ min_pack_length+= varchar_length;
+ packed= 0;
+ }
+ if (datafile_type != BLOCK_RECORD)
+ min_pack_length+= not_block_record_extra_length;
+
+ if ((packed & 7) == 1)
+ {
+ /*
+ Not optimal packing, try to remove a 1 uchar length zero-field as
+ this will get same record length, but smaller pack overhead
+ */
+ while (column != columndef)
+ {
+ column--;
+ if (column->type == (int) FIELD_SKIP_ZERO && column->length == 1)
+ {
+ /*
+ NOTE1: here we change a field type FIELD_SKIP_ZERO ->
+ FIELD_NORMAL
+ */
+ column->type=(int) FIELD_NORMAL;
+ column->empty_pos= 0;
+ column->empty_bit= 0;
+ packed--;
+ min_pack_length++;
+ break;
+ }
+ }
+ }
+
+ if (flags & HA_CREATE_TMP_TABLE)
+ {
+ options|= HA_OPTION_TMP_TABLE;
+ tmp_table= TRUE;
+ create_mode|= O_EXCL | O_NOFOLLOW;
+ /* "CREATE TEMPORARY" tables are not crash-safe (dropped at restart) */
+ ci->transactional= FALSE;
+ }
+ share.base.null_bytes= ci->null_bytes;
+ share.base.original_null_bytes= ci->null_bytes;
+ share.base.born_transactional= ci->transactional;
+ share.base.max_field_lengths= max_field_lengths;
+ share.base.field_offsets= 0; /* for future */
+
+ if (pack_reclength != INT_MAX32)
+ pack_reclength+= max_field_lengths + long_varchar_count;
+
+ if (flags & HA_CREATE_CHECKSUM || (options & HA_OPTION_CHECKSUM))
+ {
+ options|= HA_OPTION_CHECKSUM;
+ min_pack_length++;
+ pack_reclength++;
+ }
+ if (flags & HA_CREATE_DELAY_KEY_WRITE)
+ options|= HA_OPTION_DELAY_KEY_WRITE;
+ if (flags & HA_CREATE_RELIES_ON_SQL_LAYER)
+ options|= HA_OPTION_RELIES_ON_SQL_LAYER;
+
+ pack_bytes= (packed + 7) / 8;
+ if (pack_reclength != INT_MAX32)
+ pack_reclength+= reclength+pack_bytes +
+ test(test_all_bits(options, HA_OPTION_CHECKSUM | HA_PACK_RECORD));
+ min_pack_length+= pack_bytes;
+ /* Calculate min possible row length for rows-in-block */
+ extra_header_size= MAX_FIXED_HEADER_SIZE;
+ if (ci->transactional)
+ {
+ extra_header_size= TRANS_MAX_FIXED_HEADER_SIZE;
+ DBUG_PRINT("info",("creating a transactional table"));
+ }
+ share.base.min_row_length= (extra_header_size + share.base.null_bytes +
+ pack_bytes);
+ if (!ci->data_file_length && ci->max_rows)
+ {
+ if (pack_reclength == INT_MAX32 ||
+ (~(ulonglong) 0)/ci->max_rows < (ulonglong) pack_reclength)
+ ci->data_file_length= ~(ulonglong) 0;
+ else
+ ci->data_file_length=(ulonglong) ci->max_rows*pack_reclength;
+ }
+ else if (!ci->max_rows)
+ {
+ if (datafile_type == BLOCK_RECORD)
+ {
+ uint rows_per_page= ((maria_block_size - PAGE_OVERHEAD_SIZE) /
+ (min_pack_length + extra_header_size +
+ DIR_ENTRY_SIZE));
+ ulonglong data_file_length= ci->data_file_length;
+ if (!data_file_length)
+ data_file_length= ((((ulonglong) 1 << ((BLOCK_RECORD_POINTER_SIZE-1) *
+ 8)) -1));
+ if (rows_per_page > 0)
+ {
+ set_if_smaller(rows_per_page, MAX_ROWS_PER_PAGE);
+ ci->max_rows= data_file_length / maria_block_size * rows_per_page;
+ }
+ else
+ ci->max_rows= data_file_length / (min_pack_length +
+ extra_header_size +
+ DIR_ENTRY_SIZE);
+ }
+ else
+ ci->max_rows=(ha_rows) (ci->data_file_length/(min_pack_length +
+ ((options &
+ HA_OPTION_PACK_RECORD) ?
+ 3 : 0)));
+ }
+ max_rows= (ulonglong) ci->max_rows;
+ if (datafile_type == BLOCK_RECORD)
+ {
+ /* The + 1 is for record position withing page */
+ pointer= maria_get_pointer_length((ci->data_file_length /
+ maria_block_size), 3) + 1;
+ set_if_smaller(pointer, BLOCK_RECORD_POINTER_SIZE);
+
+ if (!max_rows)
+ max_rows= (((((ulonglong) 1 << ((pointer-1)*8)) -1) * maria_block_size) /
+ min_pack_length);
+ }
+ else
+ {
+ if (datafile_type != STATIC_RECORD)
+ pointer= maria_get_pointer_length(ci->data_file_length,
+ maria_data_pointer_size);
+ else
+ pointer= maria_get_pointer_length(ci->max_rows, maria_data_pointer_size);
+ if (!max_rows)
+ max_rows= ((((ulonglong) 1 << (pointer*8)) -1) / min_pack_length);
+ }
+
+ real_reclength=reclength;
+ if (datafile_type == STATIC_RECORD)
+ {
+ if (reclength <= pointer)
+ reclength=pointer+1; /* reserve place for delete link */
+ }
+ else
+ reclength+= long_varchar_count; /* We need space for varchar! */
+
+ max_key_length=0; tot_length=0 ; key_segs=0;
+ fulltext_keys=0;
+ share.state.rec_per_key_part=rec_per_key_part;
+ share.state.key_root=key_root;
+ share.state.key_del= HA_OFFSET_ERROR;
+ if (uniques)
+ max_key_length= MARIA_UNIQUE_HASH_LENGTH + pointer;
+
+ for (i=0, keydef=keydefs ; i < keys ; i++ , keydef++)
+ {
+ share.state.key_root[i]= HA_OFFSET_ERROR;
+ min_key_length_skip=length=real_length_diff=0;
+ key_length=pointer;
+ if (keydef->flag & HA_SPATIAL)
+ {
+#ifdef HAVE_SPATIAL
+ /* BAR TODO to support 3D and more dimensions in the future */
+ uint sp_segs=SPDIMS*2;
+ keydef->flag=HA_SPATIAL;
+
+ if (flags & HA_DONT_TOUCH_DATA)
+ {
+ /*
+ Called by maria_chk - i.e. table structure was taken from
+ MYI file and SPATIAL key *does have* additional sp_segs keysegs.
+ keydef->seg here points right at the GEOMETRY segment,
+ so we only need to decrease keydef->keysegs.
+ (see maria_recreate_table() in _ma_check.c)
+ */
+ keydef->keysegs-=sp_segs-1;
+ }
+
+ for (j=0, keyseg=keydef->seg ; (int) j < keydef->keysegs ;
+ j++, keyseg++)
+ {
+ if (keyseg->type != HA_KEYTYPE_BINARY &&
+ keyseg->type != HA_KEYTYPE_VARBINARY1 &&
+ keyseg->type != HA_KEYTYPE_VARBINARY2)
+ {
+ my_errno=HA_WRONG_CREATE_OPTION;
+ goto err_no_lock;
+ }
+ }
+ keydef->keysegs+=sp_segs;
+ key_length+=SPLEN*sp_segs;
+ length++; /* At least one length uchar */
+ min_key_length_skip+=SPLEN*2*SPDIMS;
+#else
+ my_errno= HA_ERR_UNSUPPORTED;
+ goto err_no_lock;
+#endif /*HAVE_SPATIAL*/
+ }
+ else if (keydef->flag & HA_FULLTEXT)
+ {
+ keydef->flag=HA_FULLTEXT | HA_PACK_KEY | HA_VAR_LENGTH_KEY;
+ options|=HA_OPTION_PACK_KEYS; /* Using packed keys */
+
+ for (j=0, keyseg=keydef->seg ; (int) j < keydef->keysegs ;
+ j++, keyseg++)
+ {
+ if (keyseg->type != HA_KEYTYPE_TEXT &&
+ keyseg->type != HA_KEYTYPE_VARTEXT1 &&
+ keyseg->type != HA_KEYTYPE_VARTEXT2)
+ {
+ my_errno=HA_WRONG_CREATE_OPTION;
+ goto err_no_lock;
+ }
+ if (!(keyseg->flag & HA_BLOB_PART) &&
+ (keyseg->type == HA_KEYTYPE_VARTEXT1 ||
+ keyseg->type == HA_KEYTYPE_VARTEXT2))
+ {
+ /* Make a flag that this is a VARCHAR */
+ keyseg->flag|= HA_VAR_LENGTH_PART;
+ /* Store in bit_start number of bytes used to pack the length */
+ keyseg->bit_start= ((keyseg->type == HA_KEYTYPE_VARTEXT1)?
+ 1 : 2);
+ }
+ }
+
+ fulltext_keys++;
+ key_length+= HA_FT_MAXBYTELEN+HA_FT_WLEN;
+ length++; /* At least one length uchar */
+ min_key_length_skip+=HA_FT_MAXBYTELEN;
+ real_length_diff=HA_FT_MAXBYTELEN-FT_MAX_WORD_LEN_FOR_SORT;
+ }
+ else
+ {
+ /* Test if prefix compression */
+ if (keydef->flag & HA_PACK_KEY)
+ {
+ /* Can't use space_compression on number keys */
+ if ((keydef->seg[0].flag & HA_SPACE_PACK) &&
+ keydef->seg[0].type == (int) HA_KEYTYPE_NUM)
+ keydef->seg[0].flag&= ~HA_SPACE_PACK;
+
+ /* Only use HA_PACK_KEY when first segment is a variable length key */
+ if (!(keydef->seg[0].flag & (HA_SPACE_PACK | HA_BLOB_PART |
+ HA_VAR_LENGTH_PART)))
+ {
+ /* pack relative to previous key */
+ keydef->flag&= ~HA_PACK_KEY;
+ keydef->flag|= HA_BINARY_PACK_KEY | HA_VAR_LENGTH_KEY;
+ }
+ else
+ {
+ keydef->seg[0].flag|=HA_PACK_KEY; /* for easyer intern test */
+ keydef->flag|=HA_VAR_LENGTH_KEY;
+ options|=HA_OPTION_PACK_KEYS; /* Using packed keys */
+ }
+ }
+ if (keydef->flag & HA_BINARY_PACK_KEY)
+ options|=HA_OPTION_PACK_KEYS; /* Using packed keys */
+
+ if (keydef->flag & HA_AUTO_KEY && ci->with_auto_increment)
+ share.base.auto_key=i+1;
+ for (j=0, keyseg=keydef->seg ; j < keydef->keysegs ; j++, keyseg++)
+ {
+ /* numbers are stored with high by first to make compression easier */
+ switch (keyseg->type) {
+ case HA_KEYTYPE_SHORT_INT:
+ case HA_KEYTYPE_LONG_INT:
+ case HA_KEYTYPE_FLOAT:
+ case HA_KEYTYPE_DOUBLE:
+ case HA_KEYTYPE_USHORT_INT:
+ case HA_KEYTYPE_ULONG_INT:
+ case HA_KEYTYPE_LONGLONG:
+ case HA_KEYTYPE_ULONGLONG:
+ case HA_KEYTYPE_INT24:
+ case HA_KEYTYPE_UINT24:
+ case HA_KEYTYPE_INT8:
+ keyseg->flag|= HA_SWAP_KEY;
+ break;
+ case HA_KEYTYPE_VARTEXT1:
+ case HA_KEYTYPE_VARTEXT2:
+ case HA_KEYTYPE_VARBINARY1:
+ case HA_KEYTYPE_VARBINARY2:
+ if (!(keyseg->flag & HA_BLOB_PART))
+ {
+ /* Make a flag that this is a VARCHAR */
+ keyseg->flag|= HA_VAR_LENGTH_PART;
+ /* Store in bit_start number of bytes used to pack the length */
+ keyseg->bit_start= ((keyseg->type == HA_KEYTYPE_VARTEXT1 ||
+ keyseg->type == HA_KEYTYPE_VARBINARY1) ?
+ 1 : 2);
+ }
+ break;
+ default:
+ break;
+ }
+ if (keyseg->flag & HA_SPACE_PACK)
+ {
+ DBUG_ASSERT(!(keyseg->flag & HA_VAR_LENGTH_PART));
+ keydef->flag |= HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY;
+ options|=HA_OPTION_PACK_KEYS; /* Using packed keys */
+ length++; /* At least one length uchar */
+ min_key_length_skip+=keyseg->length;
+ if (keyseg->length >= 255)
+ { /* prefix may be 3 bytes */
+ min_key_length_skip+=2;
+ length+=2;
+ }
+ }
+ if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART))
+ {
+ DBUG_ASSERT(!test_all_bits(keyseg->flag,
+ (HA_VAR_LENGTH_PART | HA_BLOB_PART)));
+ keydef->flag|=HA_VAR_LENGTH_KEY;
+ length++; /* At least one length uchar */
+ options|=HA_OPTION_PACK_KEYS; /* Using packed keys */
+ min_key_length_skip+=keyseg->length;
+ if (keyseg->length >= 255)
+ { /* prefix may be 3 bytes */
+ min_key_length_skip+=2;
+ length+=2;
+ }
+ }
+ key_length+= keyseg->length;
+ if (keyseg->null_bit)
+ {
+ key_length++;
+ options|=HA_OPTION_PACK_KEYS;
+ keyseg->flag|=HA_NULL_PART;
+ keydef->flag|=HA_VAR_LENGTH_KEY | HA_NULL_PART_KEY;
+ }
+ }
+ } /* if HA_FULLTEXT */
+ key_segs+=keydef->keysegs;
+ if (keydef->keysegs > HA_MAX_KEY_SEG)
+ {
+ my_errno=HA_WRONG_CREATE_OPTION;
+ goto err_no_lock;
+ }
+ /*
+ key_segs may be 0 in the case when we only want to be able to
+ add on row into the table. This can happen with some DISTINCT queries
+ in MySQL
+ */
+ if ((keydef->flag & (HA_NOSAME | HA_NULL_PART_KEY)) == HA_NOSAME &&
+ key_segs)
+ share.state.rec_per_key_part[key_segs-1]=1L;
+ length+=key_length;
+ /*
+ A key can't be longer than than half a index block (as we have
+ to be able to put at least 2 keys on an index block for the key
+ algorithms to work).
+ */
+ if (length > maria_max_key_length())
+ {
+ my_errno=HA_WRONG_CREATE_OPTION;
+ goto err_no_lock;
+ }
+ keydef->block_length= maria_block_size;
+ keydef->keylength= (uint16) key_length;
+ keydef->minlength= (uint16) (length-min_key_length_skip);
+ keydef->maxlength= (uint16) length;
+
+ if (length > max_key_length)
+ max_key_length= length;
+ tot_length+= ((max_rows/(ulong) (((uint) maria_block_size-5)/
+ (length*2))) *
+ maria_block_size);
+ }
+
+ unique_key_parts=0;
+ offset=reclength-uniques*MARIA_UNIQUE_HASH_LENGTH;
+ for (i=0, uniquedef=uniquedefs ; i < uniques ; i++ , uniquedef++)
+ {
+ uniquedef->key=keys+i;
+ unique_key_parts+=uniquedef->keysegs;
+ share.state.key_root[keys+i]= HA_OFFSET_ERROR;
+ tot_length+= (max_rows/(ulong) (((uint) maria_block_size-5)/
+ ((MARIA_UNIQUE_HASH_LENGTH + pointer)*2)))*
+ (ulong) maria_block_size;
+ }
+ keys+=uniques; /* Each unique has 1 key */
+ key_segs+=uniques; /* Each unique has 1 key seg */
+
+ base_pos=(MARIA_STATE_INFO_SIZE + keys * MARIA_STATE_KEY_SIZE +
+ key_segs * MARIA_STATE_KEYSEG_SIZE);
+ info_length= base_pos+(uint) (MARIA_BASE_INFO_SIZE+
+ keys * MARIA_KEYDEF_SIZE+
+ uniques * MARIA_UNIQUEDEF_SIZE +
+ (key_segs + unique_key_parts)*HA_KEYSEG_SIZE+
+ columns*MARIA_COLUMNDEF_SIZE);
+
+ DBUG_PRINT("info", ("info_length: %u", info_length));
+ /* There are only 16 bits for the total header length. */
+ if (info_length > 65535)
+ {
+ my_printf_error(0, "Maria table '%s' has too many columns and/or "
+ "indexes and/or unique constraints.",
+ MYF(0), name + dirname_length(name));
+ my_errno= HA_WRONG_CREATE_OPTION;
+ goto err_no_lock;
+ }
+
+ bmove(share.state.header.file_version,(uchar*) maria_file_magic,4);
+ ci->old_options=options| (ci->old_options & HA_OPTION_TEMP_COMPRESS_RECORD ?
+ HA_OPTION_COMPRESS_RECORD |
+ HA_OPTION_TEMP_COMPRESS_RECORD: 0);
+ mi_int2store(share.state.header.options,ci->old_options);
+ mi_int2store(share.state.header.header_length,info_length);
+ mi_int2store(share.state.header.state_info_length,MARIA_STATE_INFO_SIZE);
+ mi_int2store(share.state.header.base_info_length,MARIA_BASE_INFO_SIZE);
+ mi_int2store(share.state.header.base_pos,base_pos);
+ share.state.header.data_file_type= share.data_file_type= datafile_type;
+ share.state.header.org_data_file_type= org_datafile_type;
+ share.state.header.language= (ci->language ?
+ ci->language : default_charset_info->number);
+
+ share.state.dellink = HA_OFFSET_ERROR;
+ share.state.first_bitmap_with_space= 0;
+ share.state.process= (ulong) getpid();
+ share.state.unique= (ulong) 0;
+ share.state.update_count=(ulong) 0;
+ share.state.version= (ulong) time((time_t*) 0);
+ share.state.sortkey= (ushort) ~0;
+ share.state.auto_increment=ci->auto_increment;
+ share.options=options;
+ share.base.rec_reflength=pointer;
+ share.base.block_size= maria_block_size;
+
+ /* Get estimate for index file length (this may be wrong for FT keys) */
+ tmp= (tot_length + maria_block_size * keys *
+ MARIA_INDEX_BLOCK_MARGIN) / maria_block_size;
+ /*
+ use maximum of key_file_length we calculated and key_file_length value we
+ got from MYI file header (see also mariapack.c:save_state)
+ */
+ share.base.key_reflength=
+ maria_get_pointer_length(max(ci->key_file_length,tmp),3);
+ share.base.keys= share.state.header.keys= keys;
+ share.state.header.uniques= uniques;
+ share.state.header.fulltext_keys= fulltext_keys;
+ mi_int2store(share.state.header.key_parts,key_segs);
+ mi_int2store(share.state.header.unique_key_parts,unique_key_parts);
+
+ maria_set_all_keys_active(share.state.key_map, keys);
+
+ share.base.keystart = share.state.state.key_file_length=
+ MY_ALIGN(info_length, maria_block_size);
+ share.base.max_key_block_length= maria_block_size;
+ share.base.max_key_length=ALIGN_SIZE(max_key_length+4);
+ share.base.records=ci->max_rows;
+ share.base.reloc= ci->reloc_rows;
+ share.base.reclength=real_reclength;
+ share.base.pack_reclength=reclength+ test(options & HA_OPTION_CHECKSUM);
+ share.base.max_pack_length=pack_reclength;
+ share.base.min_pack_length=min_pack_length;
+ share.base.pack_bytes= pack_bytes;
+ share.base.fields= columns;
+ share.base.pack_fields= packed;
+
+ if (share.data_file_type == BLOCK_RECORD)
+ {
+ /*
+ we are going to create a first bitmap page, set data_file_length
+ to reflect this, before the state goes to disk
+ */
+ share.state.state.data_file_length= maria_block_size;
+ /* Add length of packed fields + length */
+ share.base.pack_reclength+= share.base.max_field_lengths+3;
+
+ }
+
+ /* max_data_file_length and max_key_file_length are recalculated on open */
+ if (tmp_table)
+ share.base.max_data_file_length= (my_off_t) ci->data_file_length;
+ else if (ci->transactional && translog_inited && !maria_in_recovery)
+ {
+ /*
+ we have checked translog_inited above, because maria_chk may call us
+ (via maria_recreate_table()) and it does not have a log.
+ */
+ sync_dir= MY_SYNC_DIR;
+ }
+
+ if (datafile_type == BLOCK_RECORD)
+ share.base.min_block_length= share.base.min_row_length;
+ else
+ {
+ share.base.min_block_length=
+ (share.base.pack_reclength+3 < MARIA_EXTEND_BLOCK_LENGTH &&
+ ! share.base.blobs) ?
+ max(share.base.pack_reclength,MARIA_MIN_BLOCK_LENGTH) :
+ MARIA_EXTEND_BLOCK_LENGTH;
+ }
+ if (! (flags & HA_DONT_TOUCH_DATA))
+ share.state.create_time= (long) time((time_t*) 0);
+
+ pthread_mutex_lock(&THR_LOCK_maria);
+
+ /*
+ NOTE: For test_if_reopen() we need a real path name. Hence we need
+ MY_RETURN_REAL_PATH for every fn_format(filename, ...).
+ */
+ if (ci->index_file_name)
+ {
+ char *iext= strrchr(ci->index_file_name, '.');
+ int have_iext= iext && !strcmp(iext, MARIA_NAME_IEXT);
+ if (tmp_table)
+ {
+ char *path;
+ /* chop off the table name, tempory tables use generated name */
+ if ((path= strrchr(ci->index_file_name, FN_LIBCHAR)))
+ *path= '\0';
+ fn_format(filename, name, ci->index_file_name, MARIA_NAME_IEXT,
+ MY_REPLACE_DIR | MY_UNPACK_FILENAME |
+ MY_RETURN_REAL_PATH | MY_APPEND_EXT);
+ }
+ else
+ {
+ fn_format(filename, ci->index_file_name, "", MARIA_NAME_IEXT,
+ MY_UNPACK_FILENAME | MY_RETURN_REAL_PATH |
+ (have_iext ? MY_REPLACE_EXT : MY_APPEND_EXT));
+ }
+ fn_format(linkname, name, "", MARIA_NAME_IEXT,
+ MY_UNPACK_FILENAME|MY_APPEND_EXT);
+ linkname_ptr= linkname;
+ /*
+ Don't create the table if the link or file exists to ensure that one
+ doesn't accidently destroy another table.
+ Don't sync dir now if the data file has the same path.
+ */
+ create_flag=
+ (ci->data_file_name &&
+ !strcmp(ci->index_file_name, ci->data_file_name)) ? 0 : sync_dir;
+ }
+ else
+ {
+ char *iext= strrchr(name, '.');
+ int have_iext= iext && !strcmp(iext, MARIA_NAME_IEXT);
+ fn_format(filename, name, "", MARIA_NAME_IEXT,
+ MY_UNPACK_FILENAME | MY_RETURN_REAL_PATH |
+ (have_iext ? MY_REPLACE_EXT : MY_APPEND_EXT));
+ linkname_ptr= NullS;
+ /*
+ Replace the current file.
+ Don't sync dir now if the data file has the same path.
+ */
+ create_flag= MY_DELETE_OLD | (!ci->data_file_name ? 0 : sync_dir);
+ }
+
+ /*
+ If a MRG_MARIA table is in use, the mapped MARIA tables are open,
+ but no entry is made in the table cache for them.
+ A TRUNCATE command checks for the table in the cache only and could
+ be fooled to believe, the table is not open.
+ Pull the emergency brake in this situation. (Bug #8306)
+
+
+ NOTE: The filename is compared against unique_file_name of every
+ open table. Hence we need a real path here.
+ */
+ if (_ma_test_if_reopen(filename))
+ {
+ my_printf_error(0, "MARIA table '%s' is in use "
+ "(most likely by a MERGE table). Try FLUSH TABLES.",
+ MYF(0), name + dirname_length(name));
+ goto err;
+ }
+
+ if ((file= my_create_with_symlink(linkname_ptr, filename, 0, create_mode,
+ MYF(MY_WME|create_flag))) < 0)
+ goto err;
+ errpos=1;
+
+ DBUG_PRINT("info", ("write state info and base info"));
+ if (_ma_state_info_write_sub(file, &share.state, 2) ||
+ _ma_base_info_write(file, &share.base))
+ goto err;
+ DBUG_PRINT("info", ("base_pos: %d base_info_size: %d",
+ base_pos, MARIA_BASE_INFO_SIZE));
+ DBUG_ASSERT(my_tell(file,MYF(0)) == base_pos+ MARIA_BASE_INFO_SIZE);
+
+ /* Write key and keyseg definitions */
+ DBUG_PRINT("info", ("write key and keyseg definitions"));
+ for (i=0 ; i < share.base.keys - uniques; i++)
+ {
+ uint sp_segs=(keydefs[i].flag & HA_SPATIAL) ? 2*SPDIMS : 0;
+
+ if (_ma_keydef_write(file, &keydefs[i]))
+ goto err;
+ for (j=0 ; j < keydefs[i].keysegs-sp_segs ; j++)
+ if (_ma_keyseg_write(file, &keydefs[i].seg[j]))
+ goto err;
+#ifdef HAVE_SPATIAL
+ for (j=0 ; j < sp_segs ; j++)
+ {
+ HA_KEYSEG sseg;
+ sseg.type=SPTYPE;
+ sseg.language= 7; /* Binary */
+ sseg.null_bit=0;
+ sseg.bit_start=0;
+ sseg.bit_end=0;
+ sseg.bit_length= 0;
+ sseg.bit_pos= 0;
+ sseg.length=SPLEN;
+ sseg.null_pos=0;
+ sseg.start=j*SPLEN;
+ sseg.flag= HA_SWAP_KEY;
+ if (_ma_keyseg_write(file, &sseg))
+ goto err;
+ }
+#endif
+ }
+ /* Create extra keys for unique definitions */
+ offset=reclength-uniques*MARIA_UNIQUE_HASH_LENGTH;
+ bzero((char*) &tmp_keydef,sizeof(tmp_keydef));
+ bzero((char*) &tmp_keyseg,sizeof(tmp_keyseg));
+ for (i=0; i < uniques ; i++)
+ {
+ tmp_keydef.keysegs=1;
+ tmp_keydef.flag= HA_UNIQUE_CHECK;
+ tmp_keydef.block_length= (uint16) maria_block_size;
+ tmp_keydef.keylength= MARIA_UNIQUE_HASH_LENGTH + pointer;
+ tmp_keydef.minlength=tmp_keydef.maxlength=tmp_keydef.keylength;
+ tmp_keyseg.type= MARIA_UNIQUE_HASH_TYPE;
+ tmp_keyseg.length= MARIA_UNIQUE_HASH_LENGTH;
+ tmp_keyseg.start= offset;
+ offset+= MARIA_UNIQUE_HASH_LENGTH;
+ if (_ma_keydef_write(file,&tmp_keydef) ||
+ _ma_keyseg_write(file,(&tmp_keyseg)))
+ goto err;
+ }
+
+ /* Save unique definition */
+ DBUG_PRINT("info", ("write unique definitions"));
+ for (i=0 ; i < share.state.header.uniques ; i++)
+ {
+ HA_KEYSEG *keyseg_end;
+ keyseg= uniquedefs[i].seg;
+ if (_ma_uniquedef_write(file, &uniquedefs[i]))
+ goto err;
+ for (keyseg= uniquedefs[i].seg, keyseg_end= keyseg+ uniquedefs[i].keysegs;
+ keyseg < keyseg_end;
+ keyseg++)
+ {
+ switch (keyseg->type) {
+ case HA_KEYTYPE_VARTEXT1:
+ case HA_KEYTYPE_VARTEXT2:
+ case HA_KEYTYPE_VARBINARY1:
+ case HA_KEYTYPE_VARBINARY2:
+ if (!(keyseg->flag & HA_BLOB_PART))
+ {
+ keyseg->flag|= HA_VAR_LENGTH_PART;
+ keyseg->bit_start= ((keyseg->type == HA_KEYTYPE_VARTEXT1 ||
+ keyseg->type == HA_KEYTYPE_VARBINARY1) ?
+ 1 : 2);
+ }
+ break;
+ default:
+ DBUG_ASSERT((keyseg->flag & HA_VAR_LENGTH_PART) == 0);
+ break;
+ }
+ if (_ma_keyseg_write(file, keyseg))
+ goto err;
+ }
+ }
+ DBUG_PRINT("info", ("write field definitions"));
+ if (datafile_type == BLOCK_RECORD)
+ {
+ /* Store columns in a more efficent order */
+ MARIA_COLUMNDEF **col_order, **pos;
+ if (!(col_order= (MARIA_COLUMNDEF**) my_malloc(share.base.fields *
+ sizeof(MARIA_COLUMNDEF*),
+ MYF(MY_WME))))
+ goto err;
+ for (column= columndef, pos= col_order ;
+ column != end_column ;
+ column++, pos++)
+ *pos= column;
+ qsort(col_order, share.base.fields, sizeof(*col_order),
+ (qsort_cmp) compare_columns);
+ for (i=0 ; i < share.base.fields ; i++)
+ {
+ if (_ma_columndef_write(file, col_order[i]))
+ {
+ my_free((uchar*) col_order, MYF(0));
+ goto err;
+ }
+ }
+ my_free((uchar*) col_order, MYF(0));
+ }
+ else
+ {
+ for (i=0 ; i < share.base.fields ; i++)
+ if (_ma_columndef_write(file, &columndef[i]))
+ goto err;
+ }
+
+ if ((kfile_size_before_extension= my_tell(file,MYF(0))) == MY_FILEPOS_ERROR)
+ goto err;
+#ifndef DBUG_OFF
+ if (kfile_size_before_extension != info_length)
+ DBUG_PRINT("warning",("info_length: %u != used_length: %u",
+ info_length, (uint)kfile_size_before_extension));
+#endif
+
+ if (sync_dir)
+ {
+ /*
+ we log the first bytes and then the size to which we extend; this is
+ not log 1 KB of mostly zeroes if this is a small table.
+ */
+ char empty_string[]= "";
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 4];
+ uint total_rec_length= 0;
+ uint i;
+ LSN lsn;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= 1 + 2 + 2 +
+ kfile_size_before_extension;
+ /* we are needing maybe 64 kB, so don't use the stack */
+ log_data= my_malloc(log_array[TRANSLOG_INTERNAL_PARTS + 1].length, MYF(0));
+ if ((log_data == NULL) ||
+ my_pread(file, 1 + 2 + 2 + log_data, kfile_size_before_extension,
+ 0, MYF(MY_NABP)))
+ goto err;
+ /*
+ remember if the data file was created or not, to know if Recovery can
+ do it or not, in the future
+ */
+ log_data[0]= test(flags & HA_DONT_TOUCH_DATA);
+ int2store(log_data + 1, kfile_size_before_extension);
+ int2store(log_data + 1 + 2, share.base.keystart);
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char *)name;
+ /* we store the end-zero, for Recovery to just pass it to my_create() */
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length=
+ strlen(log_array[TRANSLOG_INTERNAL_PARTS + 0].str) + 1;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= log_data;
+ /* symlink description is also needed for re-creation by Recovery: */
+ log_array[TRANSLOG_INTERNAL_PARTS + 2].str= (char *)
+ (ci->data_file_name ? ci->data_file_name : empty_string);
+ log_array[TRANSLOG_INTERNAL_PARTS + 2].length=
+ strlen(log_array[TRANSLOG_INTERNAL_PARTS + 2].str) + 1;
+ log_array[TRANSLOG_INTERNAL_PARTS + 3].str= (char *)
+ (ci->index_file_name ? ci->index_file_name : empty_string);
+ log_array[TRANSLOG_INTERNAL_PARTS + 3].length=
+ strlen(log_array[TRANSLOG_INTERNAL_PARTS + 3].str) + 1;
+ for (i= TRANSLOG_INTERNAL_PARTS;
+ i < (sizeof(log_array)/sizeof(log_array[0])); i++)
+ total_rec_length+= log_array[i].length;
+ /**
+ For this record to be of any use for Recovery, we need the upper
+ MySQL layer to be crash-safe, which it is not now (that would require
+ work using the ddl_log of sql/sql_table.cc); when it is, we should
+ reconsider the moment of writing this log record (before or after op,
+ under THR_LOCK_maria or not...), how to use it in Recovery.
+ For now this record can serve when we apply logs to a backup,
+ so we sync it. This happens before the data file is created. If the
+ data file was created before, and we crashed before writing the log
+ record, at restart the table may be used, so we would not have a
+ trustable history in the log (impossible to apply this log to a
+ backup). The way we do it, if we crash before writing the log record
+ then there is no data file and the table cannot be used.
+ @todo Note that in case of TRUNCATE TABLE we also come here; for
+ Recovery to be able to finish TRUNCATE TABLE, instead of leaving a
+ half-truncated table, we should log the record at start of
+ maria_create(); for that we shouldn't write to the index file but to a
+ buffer (DYNAMIC_STRING), put the buffer into the record, then put the
+ buffer into the index file (so, change _ma_keydef_write() etc). That
+ would also enable Recovery to finish a CREATE TABLE. The final result
+ would be that we would be able to finish what the SQL layer has asked
+ for: it would be atomic.
+ When in CREATE/TRUNCATE (or DROP or RENAME or REPAIR) we have not
+ called external_lock(), so have no TRN. It does not matter, as all
+ these operations are non-transactional and sync their files.
+ */
+ if (unlikely(translog_write_record(&lsn,
+ LOGREC_REDO_CREATE_TABLE,
+ &dummy_transaction_object, NULL,
+ total_rec_length,
+ sizeof(log_array)/sizeof(log_array[0]),
+ log_array, NULL) ||
+ translog_flush(lsn)))
+ goto err;
+ /*
+ store LSN into file, needed for Recovery to not be confused if a
+ DROP+CREATE happened (applying REDOs to the wrong table).
+ */
+ share.kfile.file= file;
+ if (_ma_update_create_rename_lsn_sub(&share, lsn, FALSE))
+ goto err;
+ my_free(log_data, MYF(0));
+ }
+
+ if (!(flags & HA_DONT_TOUCH_DATA))
+ {
+ if (ci->data_file_name)
+ {
+ char *dext= strrchr(ci->data_file_name, '.');
+ int have_dext= dext && !strcmp(dext, MARIA_NAME_DEXT);
+
+ if (tmp_table)
+ {
+ char *path;
+ /* chop off the table name, tempory tables use generated name */
+ if ((path= strrchr(ci->data_file_name, FN_LIBCHAR)))
+ *path= '\0';
+ fn_format(filename, name, ci->data_file_name, MARIA_NAME_DEXT,
+ MY_REPLACE_DIR | MY_UNPACK_FILENAME | MY_APPEND_EXT);
+ }
+ else
+ {
+ fn_format(filename, ci->data_file_name, "", MARIA_NAME_DEXT,
+ MY_UNPACK_FILENAME |
+ (have_dext ? MY_REPLACE_EXT : MY_APPEND_EXT));
+ }
+ fn_format(linkname, name, "",MARIA_NAME_DEXT,
+ MY_UNPACK_FILENAME | MY_APPEND_EXT);
+ linkname_ptr= linkname;
+ create_flag=0;
+ }
+ else
+ {
+ fn_format(filename,name,"", MARIA_NAME_DEXT,
+ MY_UNPACK_FILENAME | MY_APPEND_EXT);
+ linkname_ptr= NullS;
+ create_flag=MY_DELETE_OLD;
+ }
+ if ((dfile=
+ my_create_with_symlink(linkname_ptr, filename, 0, create_mode,
+ MYF(MY_WME | create_flag | sync_dir))) < 0)
+ goto err;
+ errpos=3;
+
+ if (_ma_initialize_data_file(&share, dfile))
+ goto err;
+ }
+
+ /* Enlarge files */
+ DBUG_PRINT("info", ("enlarge to keystart: %lu",
+ (ulong) share.base.keystart));
+ if (my_chsize(file,(ulong) share.base.keystart,0,MYF(0)))
+ goto err;
+
+ if (sync_dir && my_sync(file, MYF(0)))
+ goto err;
+
+ if (! (flags & HA_DONT_TOUCH_DATA))
+ {
+#ifdef USE_RELOC
+ if (my_chsize(dfile,share.base.min_pack_length*ci->reloc_rows,0,MYF(0)))
+ goto err;
+#endif
+ if (sync_dir && my_sync(dfile, MYF(0)))
+ goto err;
+ if (my_close(dfile,MYF(0)))
+ goto err;
+ }
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ res= 0;
+ my_free((char*) rec_per_key_part,MYF(0));
+ errpos=0;
+ if (my_close(file,MYF(0)))
+ res= my_errno;
+ DBUG_RETURN(res);
+
+err:
+ pthread_mutex_unlock(&THR_LOCK_maria);
+
+err_no_lock:
+ save_errno=my_errno;
+ switch (errpos) {
+ case 3:
+ VOID(my_close(dfile,MYF(0)));
+ /* fall through */
+ case 2:
+ if (! (flags & HA_DONT_TOUCH_DATA))
+ my_delete_with_symlink(fn_format(filename,name,"",MARIA_NAME_DEXT,
+ MY_UNPACK_FILENAME | MY_APPEND_EXT),
+ sync_dir);
+ /* fall through */
+ case 1:
+ VOID(my_close(file,MYF(0)));
+ if (! (flags & HA_DONT_TOUCH_DATA))
+ my_delete_with_symlink(fn_format(filename,name,"",MARIA_NAME_IEXT,
+ MY_UNPACK_FILENAME | MY_APPEND_EXT),
+ sync_dir);
+ }
+ my_free(log_data, MYF(MY_ALLOW_ZERO_PTR));
+ my_free((char*) rec_per_key_part, MYF(0));
+ DBUG_RETURN(my_errno=save_errno); /* return the fatal errno */
+}
+
+
+uint maria_get_pointer_length(ulonglong file_length, uint def)
+{
+ DBUG_ASSERT(def >= 2 && def <= 7);
+ if (file_length) /* If not default */
+ {
+#ifdef NOT_YET_READY_FOR_8_BYTE_POINTERS
+ if (file_length >= (ULL(1) << 56))
+ def=8;
+ else
+#endif
+ if (file_length >= (ULL(1) << 48))
+ def=7;
+ else if (file_length >= (ULL(1) << 40))
+ def=6;
+ else if (file_length >= (ULL(1) << 32))
+ def=5;
+ else if (file_length >= (ULL(1) << 24))
+ def=4;
+ else if (file_length >= (ULL(1) << 16))
+ def=3;
+ else
+ def=2;
+ }
+ return def;
+}
+
+
+/*
+ Sort columns for records-in-block
+
+ IMPLEMENTATION
+ Sort columns in following order:
+
+ Fixed size, not null columns
+ Fixed length, null fields
+ Variable length fields (CHAR, VARCHAR)
+ Blobs
+
+ For same kind of fields, keep fields in original order
+*/
+
+static inline int sign(longlong a)
+{
+ return a < 0 ? -1 : (a > 0 ? 1 : 0);
+}
+
+
+static int compare_columns(MARIA_COLUMNDEF **a_ptr, MARIA_COLUMNDEF **b_ptr)
+{
+ MARIA_COLUMNDEF *a= *a_ptr, *b= *b_ptr;
+ enum en_fieldtype a_type, b_type;
+
+ a_type= ((a->type == FIELD_NORMAL || a->type == FIELD_CHECK) ?
+ FIELD_NORMAL : a->type);
+ b_type= ((b->type == FIELD_NORMAL || b->type == FIELD_CHECK) ?
+ FIELD_NORMAL : b->type);
+
+ if (a_type == FIELD_NORMAL && !a->null_bit)
+ {
+ if (b_type != FIELD_NORMAL || b->null_bit)
+ return -1;
+ return sign((long) (a->offset - b->offset));
+ }
+ if (b_type == FIELD_NORMAL && !b->null_bit)
+ return 1;
+ if (a_type == b_type)
+ return sign((long) (a->offset - b->offset));
+ if (a_type == FIELD_NORMAL)
+ return -1;
+ if (b_type == FIELD_NORMAL)
+ return 1;
+ if (a_type == FIELD_BLOB)
+ return 1;
+ if (b_type == FIELD_BLOB)
+ return -1;
+ return sign((long) (a->offset - b->offset));
+}
+
+
+/* Initialize data file */
+
+int _ma_initialize_data_file(MARIA_SHARE *share, File dfile)
+{
+ if (share->data_file_type == BLOCK_RECORD)
+ {
+ share->bitmap.block_size= share->base.block_size;
+ share->bitmap.file.file = dfile;
+ return _ma_bitmap_create_first(share);
+ }
+ /*
+ So, in BLOCK_RECORD, a freshly created datafile is one page long; while in
+ other formats it is 0-byte long.
+ */
+ return 0;
+}
+
+
+/**
+ @brief Writes create_rename_lsn and is_of_horizon to disk, can force.
+
+ This is for special cases where:
+ - we don't want to write the full state to disk (so, not call
+ _ma_state_info_write()) because some parts of the state may be
+ currently inconsistent, or because it would be overkill
+ - we must sync these LSNs immediately for correctness.
+ It acquires intern_lock to protect the two LSNs and state write.
+
+ @param share table's share
+ @param do_sync if the write should be forced to disk
+
+ @return Operation status
+ @retval 0 ok
+ @retval 1 error (disk problem)
+*/
+
+int _ma_update_create_rename_lsn(MARIA_SHARE *share,
+ LSN lsn, my_bool do_sync)
+{
+ int res;
+ pthread_mutex_lock(&share->intern_lock);
+ res= _ma_update_create_rename_lsn_sub(share, lsn, do_sync);
+ pthread_mutex_unlock(&share->intern_lock);
+ return res;
+}
+
+
+/**
+ @brief Writes create_rename_lsn and is_of_horizon to disk, can force.
+
+ Shortcut of _ma_update_create_rename_lsn() when we know that
+ intern_lock is not needed (when creating a table or opening it for the
+ first time).
+
+ @param share table's share
+ @param do_sync if the write should be forced to disk
+
+ @return Operation status
+ @retval 0 ok
+ @retval 1 error (disk problem)
+*/
+
+int _ma_update_create_rename_lsn_sub(MARIA_SHARE *share,
+ LSN lsn, my_bool do_sync)
+{
+ char buf[LSN_STORE_SIZE*2], *ptr;
+ File file= share->kfile.file;
+ DBUG_ASSERT(file >= 0);
+ for (ptr= buf; ptr < (buf + sizeof(buf)); ptr+= LSN_STORE_SIZE)
+ lsn_store(ptr, lsn);
+ share->state.is_of_horizon= share->state.create_rename_lsn= lsn;
+ if (share->id != 0)
+ {
+ /*
+ If OP is the operation which is calling us, if table is later written,
+ we could see in the log:
+ FILE_ID ... REDO_OP ... REDO_INSERT.
+ (that can happen in real life at least with OP=REPAIR).
+ As FILE_ID will be ignored by Recovery because it is <
+ create_rename_lsn, REDO_INSERT would be ignored too, wrongly.
+ To avoid that, we force a LOGREC_FILE_ID to be logged at next write:
+ */
+ translog_deassign_id_from_share(share);
+ }
+ return my_pwrite(file, buf, sizeof(buf),
+ sizeof(share->state.header) + 2, MYF(MY_NABP)) ||
+ (do_sync && my_sync(file, MYF(0)));
+}
diff --git a/storage/maria/ma_dbug.c b/storage/maria/ma_dbug.c
new file mode 100644
index 00000000000..a23e7248029
--- /dev/null
+++ b/storage/maria/ma_dbug.c
@@ -0,0 +1,193 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Support rutiner with are using with dbug */
+
+#include "maria_def.h"
+
+ /* Print a key in user understandable format */
+
+void _ma_print_key(FILE *stream, register HA_KEYSEG *keyseg,
+ const uchar *key, uint length)
+{
+ int flag;
+ short int s_1;
+ long int l_1;
+ float f_1;
+ double d_1;
+ const uchar *end;
+ const uchar *key_end= key + length;
+
+ VOID(fputs("Key: \"",stream));
+ flag=0;
+ for (; keyseg->type && key < key_end ;keyseg++)
+ {
+ if (flag++)
+ VOID(putc('-',stream));
+ end= key+ keyseg->length;
+ if (keyseg->flag & HA_NULL_PART)
+ {
+ /* A NULL value is encoded by a 1-byte flag. Zero means NULL. */
+ if (! *(key++))
+ {
+ fprintf(stream,"NULL");
+ continue;
+ }
+ end++;
+ }
+
+ switch (keyseg->type) {
+ case HA_KEYTYPE_BINARY:
+ if (!(keyseg->flag & HA_SPACE_PACK) && keyseg->length == 1)
+ { /* packed binary digit */
+ VOID(fprintf(stream,"%d",(uint) *key++));
+ break;
+ }
+ /* fall through */
+ case HA_KEYTYPE_TEXT:
+ case HA_KEYTYPE_NUM:
+ if (keyseg->flag & HA_SPACE_PACK)
+ {
+ VOID(fprintf(stream,"%.*s",(int) *key,key+1));
+ key+= (int) *key+1;
+ }
+ else
+ {
+ VOID(fprintf(stream,"%.*s",(int) keyseg->length,key));
+ key=end;
+ }
+ break;
+ case HA_KEYTYPE_INT8:
+ VOID(fprintf(stream,"%d",(int) *((signed char*) key)));
+ key=end;
+ break;
+ case HA_KEYTYPE_SHORT_INT:
+ s_1= mi_sint2korr(key);
+ VOID(fprintf(stream,"%d",(int) s_1));
+ key=end;
+ break;
+ case HA_KEYTYPE_USHORT_INT:
+ {
+ ushort u_1;
+ u_1= mi_uint2korr(key);
+ VOID(fprintf(stream,"%u",(uint) u_1));
+ key=end;
+ break;
+ }
+ case HA_KEYTYPE_LONG_INT:
+ l_1=mi_sint4korr(key);
+ VOID(fprintf(stream,"%ld",l_1));
+ key=end;
+ break;
+ case HA_KEYTYPE_ULONG_INT:
+ l_1=mi_sint4korr(key);
+ VOID(fprintf(stream,"%lu",(ulong) l_1));
+ key=end;
+ break;
+ case HA_KEYTYPE_INT24:
+ VOID(fprintf(stream,"%ld",(long) mi_sint3korr(key)));
+ key=end;
+ break;
+ case HA_KEYTYPE_UINT24:
+ VOID(fprintf(stream,"%lu",(ulong) mi_uint3korr(key)));
+ key=end;
+ break;
+ case HA_KEYTYPE_FLOAT:
+ mi_float4get(f_1,key);
+ VOID(fprintf(stream,"%g",(double) f_1));
+ key=end;
+ break;
+ case HA_KEYTYPE_DOUBLE:
+ mi_float8get(d_1,key);
+ VOID(fprintf(stream,"%g",d_1));
+ key=end;
+ break;
+#ifdef HAVE_LONG_LONG
+ case HA_KEYTYPE_LONGLONG:
+ {
+ char buff[21];
+ longlong2str(mi_sint8korr(key),buff,-10);
+ VOID(fprintf(stream,"%s",buff));
+ key=end;
+ break;
+ }
+ case HA_KEYTYPE_ULONGLONG:
+ {
+ char buff[21];
+ longlong2str(mi_sint8korr(key),buff,10);
+ VOID(fprintf(stream,"%s",buff));
+ key=end;
+ break;
+ }
+ case HA_KEYTYPE_BIT:
+ {
+ uint i;
+ fputs("0x",stream);
+ for (i=0 ; i < keyseg->length ; i++)
+ fprintf(stream, "%02x", (uint) *key++);
+ key= end;
+ break;
+ }
+
+#endif
+ case HA_KEYTYPE_VARTEXT1: /* VARCHAR and TEXT */
+ case HA_KEYTYPE_VARTEXT2: /* VARCHAR and TEXT */
+ case HA_KEYTYPE_VARBINARY1: /* VARBINARY and BLOB */
+ case HA_KEYTYPE_VARBINARY2: /* VARBINARY and BLOB */
+ {
+ uint tmp_length;
+ get_key_length(tmp_length,key);
+ /*
+ The following command sometimes gives a warning from valgrind.
+ Not yet sure if the bug is in valgrind, glibc or mysqld
+ */
+ VOID(fprintf(stream,"%.*s",(int) tmp_length,key));
+ key+=tmp_length;
+ break;
+ }
+ default: break; /* This never happens */
+ }
+ }
+ VOID(fputs("\"\n",stream));
+ return;
+} /* print_key */
+
+
+#ifdef EXTRA_DEBUG
+
+my_bool _ma_check_table_is_closed(const char *name, const char *where)
+{
+ char filename[FN_REFLEN];
+ LIST *pos;
+ DBUG_ENTER("_ma_check_table_is_closed");
+
+ (void) fn_format(filename,name,"",MARIA_NAME_IEXT,4+16+32);
+ for (pos=maria_open_list ; pos ; pos=pos->next)
+ {
+ MARIA_HA *info=(MARIA_HA*) pos->data;
+ MARIA_SHARE *share=info->s;
+ if (!strcmp(share->unique_file_name,filename))
+ {
+ if (share->last_version)
+ {
+ fprintf(stderr,"Warning: Table: %s is open on %s\n", name,where);
+ DBUG_PRINT("warning",("Table: %s is open on %s", name,where));
+ DBUG_RETURN(1);
+ }
+ }
+ }
+ DBUG_RETURN(0);
+}
+#endif /* EXTRA_DEBUG */
diff --git a/storage/maria/ma_delete.c b/storage/maria/ma_delete.c
new file mode 100644
index 00000000000..56da6fd3ed3
--- /dev/null
+++ b/storage/maria/ma_delete.c
@@ -0,0 +1,891 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Remove a row from a MARIA table */
+
+#include "ma_fulltext.h"
+#include "ma_rt_index.h"
+
+static int d_search(MARIA_HA *info,MARIA_KEYDEF *keyinfo,uint comp_flag,
+ uchar *key,uint key_length,my_off_t page,uchar *anc_buff);
+static int del(MARIA_HA *info,MARIA_KEYDEF *keyinfo,uchar *key,uchar *anc_buff,
+ my_off_t leaf_page,uchar *leaf_buff,uchar *keypos,
+ my_off_t next_block,uchar *ret_key);
+static int underflow(MARIA_HA *info,MARIA_KEYDEF *keyinfo,uchar *anc_buff,
+ my_off_t leaf_page,uchar *leaf_buff,uchar *keypos);
+static uint remove_key(MARIA_KEYDEF *keyinfo,uint nod_flag,uchar *keypos,
+ uchar *lastkey,uchar *page_end,
+ my_off_t *next_block);
+static int _ma_ck_real_delete(register MARIA_HA *info,MARIA_KEYDEF *keyinfo,
+ uchar *key, uint key_length, my_off_t *root);
+
+
+int maria_delete(MARIA_HA *info,const uchar *record)
+{
+ uint i;
+ uchar *old_key;
+ int save_errno;
+ char lastpos[8];
+ MARIA_SHARE *share=info->s;
+ DBUG_ENTER("maria_delete");
+
+ /* Test if record is in datafile */
+
+ DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage",
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ DBUG_RETURN(my_errno= HA_ERR_CRASHED););
+ DBUG_EXECUTE_IF("my_error_test_undefined_error",
+ maria_print_error(info->s, INT_MAX);
+ DBUG_RETURN(my_errno= INT_MAX););
+ if (!(info->update & HA_STATE_AKTIV))
+ {
+ DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND); /* No database read */
+ }
+ if (share->options & HA_OPTION_READ_ONLY_DATA)
+ {
+ DBUG_RETURN(my_errno=EACCES);
+ }
+ if (_ma_readinfo(info,F_WRLCK,1))
+ DBUG_RETURN(my_errno);
+ if ((*share->compare_record)(info,record))
+ goto err; /* Error on read-check */
+
+ if (_ma_mark_file_changed(info))
+ goto err;
+
+ /* Remove all keys from the index file */
+
+ old_key= info->lastkey2;
+ for (i=0 ; i < share->base.keys ; i++ )
+ {
+ if (maria_is_key_active(info->s->state.key_map, i))
+ {
+ info->s->keyinfo[i].version++;
+ if (info->s->keyinfo[i].flag & HA_FULLTEXT )
+ {
+ if (_ma_ft_del(info,i,(char*) old_key,record,info->cur_row.lastpos))
+ goto err;
+ }
+ else
+ {
+ if (info->s->keyinfo[i].ck_delete(info,i,old_key,
+ _ma_make_key(info,i,old_key,record,info->cur_row.lastpos)))
+ goto err;
+ }
+ /* The above changed info->lastkey2. Inform maria_rnext_same(). */
+ info->update&= ~HA_STATE_RNEXT_SAME;
+ }
+ }
+
+ if ((*share->delete_record)(info, record))
+ goto err; /* Remove record from database */
+
+ /*
+ We can't use the row based checksum as this doesn't have enough
+ precision.
+ */
+ if (info->s->calc_checksum)
+ {
+ info->cur_row.checksum= (*info->s->calc_checksum)(info,record);
+ info->state->checksum-= info->cur_row.checksum;
+ }
+
+ info->update= HA_STATE_CHANGED+HA_STATE_DELETED+HA_STATE_ROW_CHANGED;
+ info->state->records-= !share->now_transactional;
+ share->state.changed|= STATE_NOT_OPTIMIZED_ROWS;
+
+ mi_sizestore(lastpos, info->cur_row.lastpos);
+ VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+ allow_break(); /* Allow SIGHUP & SIGINT */
+ if (info->invalidator != 0)
+ {
+ DBUG_PRINT("info", ("invalidator... '%s' (delete)", info->s->open_file_name));
+ (*info->invalidator)(info->s->open_file_name);
+ info->invalidator=0;
+ }
+ DBUG_RETURN(0);
+
+err:
+ save_errno=my_errno;
+ mi_sizestore(lastpos, info->cur_row.lastpos);
+ if (save_errno != HA_ERR_RECORD_CHANGED)
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ maria_mark_crashed(info); /* mark table crashed */
+ }
+ VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+ info->update|=HA_STATE_WRITTEN; /* Buffer changed */
+ allow_break(); /* Allow SIGHUP & SIGINT */
+ my_errno=save_errno;
+ if (save_errno == HA_ERR_KEY_NOT_FOUND)
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ }
+
+ DBUG_RETURN(my_errno);
+} /* maria_delete */
+
+
+ /* Remove a key from the btree index */
+
+int _ma_ck_delete(register MARIA_HA *info, uint keynr, uchar *key,
+ uint key_length)
+{
+ return _ma_ck_real_delete(info, info->s->keyinfo+keynr, key, key_length,
+ &info->s->state.key_root[keynr]);
+} /* _ma_ck_delete */
+
+
+static int _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *key, uint key_length, my_off_t *root)
+{
+ int error;
+ uint nod_flag;
+ my_off_t old_root;
+ uchar *root_buff;
+ DBUG_ENTER("_ma_ck_real_delete");
+
+ if ((old_root=*root) == HA_OFFSET_ERROR)
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ DBUG_RETURN(my_errno=HA_ERR_CRASHED);
+ }
+ if (!(root_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
+ HA_MAX_KEY_BUFF*2)))
+ {
+ DBUG_PRINT("error",("Couldn't allocate memory"));
+ DBUG_RETURN(my_errno=ENOMEM);
+ }
+ DBUG_PRINT("info",("root_page: %ld", (long) old_root));
+ if (!_ma_fetch_keypage(info,keyinfo,old_root,DFLT_INIT_HITS,root_buff,0))
+ {
+ error= -1;
+ goto err;
+ }
+ if ((error=d_search(info,keyinfo,
+ (keyinfo->flag & HA_FULLTEXT ? SEARCH_FIND | SEARCH_UPDATE
+ : SEARCH_SAME),
+ key,key_length,old_root,root_buff)) >0)
+ {
+ if (error == 2)
+ {
+ DBUG_PRINT("test",("Enlarging of root when deleting"));
+ error= _ma_enlarge_root(info,keyinfo,key,root);
+ }
+ else /* error == 1 */
+ {
+ if (maria_data_on_page(root_buff) <= (nod_flag=_ma_test_if_nod(root_buff))+3)
+ {
+ error=0;
+ if (nod_flag)
+ *root= _ma_kpos(nod_flag,root_buff+2+nod_flag);
+ else
+ *root=HA_OFFSET_ERROR;
+ if (_ma_dispose(info,keyinfo,old_root,DFLT_INIT_HITS))
+ error= -1;
+ }
+ else
+ error= _ma_write_keypage(info,keyinfo,old_root,
+ DFLT_INIT_HITS,root_buff);
+ }
+ }
+err:
+ my_afree((uchar*) root_buff);
+ DBUG_PRINT("exit",("Return: %d",error));
+ DBUG_RETURN(error);
+} /* _ma_ck_real_delete */
+
+
+ /*
+ ** Remove key below key root
+ ** Return values:
+ ** 1 if there are less buffers; In this case anc_buff is not saved
+ ** 2 if there are more buffers
+ ** -1 on errors
+ */
+
+static int d_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo,
+ uint comp_flag, uchar *key, uint key_length,
+ my_off_t page, uchar *anc_buff)
+{
+ int flag,ret_value,save_flag;
+ uint length,nod_flag,search_key_length;
+ my_bool last_key;
+ uchar *leaf_buff,*keypos;
+ my_off_t leaf_page,next_block;
+ uchar lastkey[HA_MAX_KEY_BUFF];
+ DBUG_ENTER("d_search");
+ DBUG_DUMP("page",anc_buff,maria_data_on_page(anc_buff));
+
+ search_key_length= (comp_flag & SEARCH_FIND) ? key_length : USE_WHOLE_KEY;
+ flag=(*keyinfo->bin_search)(info,keyinfo,anc_buff,key, search_key_length,
+ comp_flag, &keypos, lastkey, &last_key);
+ if (flag == MARIA_FOUND_WRONG_KEY)
+ {
+ DBUG_PRINT("error",("Found wrong key"));
+ DBUG_RETURN(-1);
+ }
+ nod_flag=_ma_test_if_nod(anc_buff);
+
+ if (!flag && keyinfo->flag & HA_FULLTEXT)
+ {
+ uint off;
+ int subkeys;
+
+ get_key_full_length_rdonly(off, lastkey);
+ subkeys=ft_sintXkorr(lastkey+off);
+ DBUG_ASSERT(info->ft1_to_ft2==0 || subkeys >=0);
+ comp_flag=SEARCH_SAME;
+ if (subkeys >= 0)
+ {
+ /* normal word, one-level tree structure */
+ if (info->ft1_to_ft2)
+ {
+ /* we're in ft1->ft2 conversion mode. Saving key data */
+ insert_dynamic(info->ft1_to_ft2, (char*) (lastkey+off));
+ }
+ else
+ {
+ /* we need exact match only if not in ft1->ft2 conversion mode */
+ flag=(*keyinfo->bin_search)(info,keyinfo,anc_buff,key,USE_WHOLE_KEY,
+ comp_flag, &keypos, lastkey, &last_key);
+ }
+ /* fall through to normal delete */
+ }
+ else
+ {
+ /* popular word. two-level tree. going down */
+ uint tmp_key_length;
+ my_off_t root;
+ uchar *kpos=keypos;
+
+ if (!(tmp_key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&kpos,lastkey)))
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ my_errno= HA_ERR_CRASHED;
+ DBUG_RETURN(-1);
+ }
+ root= _ma_dpos(info,nod_flag,kpos);
+ if (subkeys == -1)
+ {
+ /* the last entry in sub-tree */
+ if (_ma_dispose(info, keyinfo, root,DFLT_INIT_HITS))
+ DBUG_RETURN(-1);
+ /* fall through to normal delete */
+ }
+ else
+ {
+ keyinfo=&info->s->ft2_keyinfo;
+ kpos-=keyinfo->keylength+nod_flag; /* we'll modify key entry 'in vivo' */
+ get_key_full_length_rdonly(off, key);
+ key+=off;
+ ret_value= _ma_ck_real_delete(info, &info->s->ft2_keyinfo,
+ key, HA_FT_WLEN, &root);
+ _ma_dpointer(info, kpos+HA_FT_WLEN, root);
+ subkeys++;
+ ft_intXstore(kpos, subkeys);
+ if (!ret_value)
+ ret_value= _ma_write_keypage(info,keyinfo,page,
+ DFLT_INIT_HITS,anc_buff);
+ DBUG_PRINT("exit",("Return: %d",ret_value));
+ DBUG_RETURN(ret_value);
+ }
+ }
+ }
+ leaf_buff=0;
+ LINT_INIT(leaf_page);
+ if (nod_flag)
+ {
+ leaf_page= _ma_kpos(nod_flag,keypos);
+ if (!(leaf_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
+ HA_MAX_KEY_BUFF*2)))
+ {
+ DBUG_PRINT("error",("Couldn't allocate memory"));
+ my_errno=ENOMEM;
+ DBUG_PRINT("exit",("Return: %d",-1));
+ DBUG_RETURN(-1);
+ }
+ if (!_ma_fetch_keypage(info,keyinfo,leaf_page,DFLT_INIT_HITS,leaf_buff,0))
+ goto err;
+ }
+
+ if (flag != 0)
+ {
+ if (!nod_flag)
+ {
+ DBUG_PRINT("error",("Didn't find key"));
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED; /* This should newer happend */
+ goto err;
+ }
+ save_flag=0;
+ ret_value=d_search(info,keyinfo,comp_flag,key,key_length,
+ leaf_page,leaf_buff);
+ }
+ else
+ { /* Found key */
+ uint tmp;
+ length= maria_data_on_page(anc_buff);
+ if (!(tmp= remove_key(keyinfo,nod_flag,keypos,lastkey,anc_buff+length,
+ &next_block)))
+ goto err;
+
+ length-= tmp;
+
+ maria_putint(anc_buff,length,nod_flag);
+ if (!nod_flag)
+ { /* On leaf page */
+ if (_ma_write_keypage(info,keyinfo,page,DFLT_INIT_HITS,anc_buff))
+ {
+ DBUG_PRINT("exit",("Return: %d",-1));
+ DBUG_RETURN(-1);
+ }
+ /* Page will be update later if we return 1 */
+ DBUG_RETURN(test(length <= (info->quick_mode ? MARIA_MIN_KEYBLOCK_LENGTH :
+ (uint) keyinfo->underflow_block_length)));
+ }
+ save_flag=1;
+ ret_value=del(info,keyinfo,key,anc_buff,leaf_page,leaf_buff,keypos,
+ next_block,lastkey);
+ }
+ if (ret_value >0)
+ {
+ save_flag=1;
+ if (ret_value == 1)
+ ret_value= underflow(info,keyinfo,anc_buff,leaf_page,leaf_buff,keypos);
+ else
+ { /* This happens only with packed keys */
+ DBUG_PRINT("test",("Enlarging of key when deleting"));
+ if (!_ma_get_last_key(info,keyinfo,anc_buff,lastkey,keypos,&length))
+ goto err;
+ ret_value= _ma_insert(info,keyinfo,key,anc_buff,keypos,lastkey,
+ (uchar*) 0,(uchar*) 0,(my_off_t) 0,(my_bool) 0);
+ }
+ }
+ if (ret_value == 0 && maria_data_on_page(anc_buff) > keyinfo->block_length)
+ {
+ save_flag=1;
+ ret_value= _ma_split_page(info,keyinfo,key,anc_buff,lastkey,0) | 2;
+ }
+ if (save_flag && ret_value != 1)
+ ret_value|= _ma_write_keypage(info,keyinfo,page,DFLT_INIT_HITS,anc_buff);
+ else
+ {
+ DBUG_DUMP("page",anc_buff,maria_data_on_page(anc_buff));
+ }
+ my_afree(leaf_buff);
+ DBUG_PRINT("exit",("Return: %d",ret_value));
+ DBUG_RETURN(ret_value);
+
+err:
+ my_afree(leaf_buff);
+ DBUG_PRINT("exit",("Error: %d",my_errno));
+ DBUG_RETURN (-1);
+} /* d_search */
+
+
+ /* Remove a key that has a page-reference */
+
+static int del(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo,
+ uchar *key, uchar *anc_buff, my_off_t leaf_page,
+ uchar *leaf_buff,
+ uchar *keypos, /* Pos to where deleted key was */
+ my_off_t next_block,
+ uchar *ret_key) /* key before keypos in anc_buff */
+{
+ int ret_value,length;
+ uint a_length,nod_flag,tmp;
+ my_off_t next_page;
+ uchar keybuff[HA_MAX_KEY_BUFF],*endpos,*next_buff,*key_start, *prev_key;
+ MARIA_SHARE *share=info->s;
+ MARIA_KEY_PARAM s_temp;
+ DBUG_ENTER("del");
+ DBUG_PRINT("enter",("leaf_page: %ld keypos: 0x%lx", (long) leaf_page,
+ (ulong) keypos));
+ DBUG_DUMP("leaf_buff",leaf_buff,maria_data_on_page(leaf_buff));
+
+ endpos= leaf_buff+ maria_data_on_page(leaf_buff);
+ if (!(key_start= _ma_get_last_key(info,keyinfo,leaf_buff,keybuff,endpos,
+ &tmp)))
+ DBUG_RETURN(-1);
+
+ if ((nod_flag=_ma_test_if_nod(leaf_buff)))
+ {
+ next_page= _ma_kpos(nod_flag,endpos);
+ if (!(next_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
+ HA_MAX_KEY_BUFF*2)))
+ DBUG_RETURN(-1);
+ if (!_ma_fetch_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,next_buff,0))
+ ret_value= -1;
+ else
+ {
+ DBUG_DUMP("next_page",next_buff,maria_data_on_page(next_buff));
+ if ((ret_value=del(info,keyinfo,key,anc_buff,next_page,next_buff,
+ keypos,next_block,ret_key)) >0)
+ {
+ endpos=leaf_buff+maria_data_on_page(leaf_buff);
+ if (ret_value == 1)
+ {
+ ret_value=underflow(info,keyinfo,leaf_buff,next_page,
+ next_buff,endpos);
+ if (ret_value == 0 && maria_data_on_page(leaf_buff) > keyinfo->block_length)
+ {
+ ret_value= _ma_split_page(info,keyinfo,key,leaf_buff,ret_key,0) | 2;
+ }
+ }
+ else
+ {
+ DBUG_PRINT("test",("Inserting of key when deleting"));
+ if (!_ma_get_last_key(info,keyinfo,leaf_buff,keybuff,endpos,
+ &tmp))
+ goto err;
+ ret_value= _ma_insert(info,keyinfo,key,leaf_buff,endpos,keybuff,
+ (uchar*) 0,(uchar*) 0,(my_off_t) 0,0);
+ }
+ }
+ if (_ma_write_keypage(info,keyinfo,leaf_page,DFLT_INIT_HITS,leaf_buff))
+ goto err;
+ }
+ my_afree(next_buff);
+ DBUG_RETURN(ret_value);
+ }
+
+ /* Remove last key from leaf page */
+
+ maria_putint(leaf_buff,key_start-leaf_buff,nod_flag);
+ if (_ma_write_keypage(info,keyinfo,leaf_page,DFLT_INIT_HITS,leaf_buff))
+ goto err;
+
+ /* Place last key in ancestor page on deleted key position */
+
+ a_length= maria_data_on_page(anc_buff);
+ endpos=anc_buff+a_length;
+ if (keypos != anc_buff+2+share->base.key_reflength &&
+ !_ma_get_last_key(info,keyinfo,anc_buff,ret_key,keypos,&tmp))
+ goto err;
+ prev_key=(keypos == anc_buff+2+share->base.key_reflength ?
+ 0 : ret_key);
+ length=(*keyinfo->pack_key)(keyinfo,share->base.key_reflength,
+ keypos == endpos ? (uchar*) 0 : keypos,
+ prev_key, prev_key,
+ keybuff,&s_temp);
+ if (length > 0)
+ bmove_upp(endpos+length,endpos,(uint) (endpos-keypos));
+ else
+ bmove(keypos,keypos-length, (int) (endpos-keypos)+length);
+ (*keyinfo->store_key)(keyinfo,keypos,&s_temp);
+ /* Save pointer to next leaf */
+ if (!(*keyinfo->get_key)(keyinfo,share->base.key_reflength,&keypos,ret_key))
+ goto err;
+ _ma_kpointer(info,keypos - share->base.key_reflength,next_block);
+ maria_putint(anc_buff,a_length+length,share->base.key_reflength);
+
+ DBUG_RETURN( maria_data_on_page(leaf_buff) <=
+ (info->quick_mode ? MARIA_MIN_KEYBLOCK_LENGTH :
+ (uint) keyinfo->underflow_block_length));
+err:
+ DBUG_RETURN(-1);
+} /* del */
+
+
+ /* Balances adjacent pages if underflow occours */
+
+static int underflow(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo,
+ uchar *anc_buff,
+ my_off_t leaf_page,/* Ancestor page and underflow page */
+ uchar *leaf_buff,
+ uchar *keypos) /* Position to pos after key */
+{
+ int t_length;
+ uint length,anc_length,buff_length,leaf_length,p_length,s_length,nod_flag,
+ key_reflength,key_length;
+ my_off_t next_page;
+ uchar anc_key[HA_MAX_KEY_BUFF],leaf_key[HA_MAX_KEY_BUFF];
+ uchar *buff,*endpos,*next_keypos,*anc_pos,*half_pos,*temp_pos,*prev_key;
+ uchar *after_key;
+ MARIA_KEY_PARAM s_temp;
+ MARIA_SHARE *share=info->s;
+ DBUG_ENTER("underflow");
+ DBUG_PRINT("enter",("leaf_page: %ld keypos: 0x%lx",(long) leaf_page,
+ (ulong) keypos));
+ DBUG_DUMP("anc_buff",anc_buff,maria_data_on_page(anc_buff));
+ DBUG_DUMP("leaf_buff",leaf_buff,maria_data_on_page(leaf_buff));
+
+ buff=info->buff;
+ info->keyread_buff_used=1;
+ next_keypos=keypos;
+ nod_flag=_ma_test_if_nod(leaf_buff);
+ p_length=nod_flag+2;
+ anc_length= maria_data_on_page(anc_buff);
+ leaf_length= maria_data_on_page(leaf_buff);
+ key_reflength=share->base.key_reflength;
+ if (info->s->keyinfo+info->lastinx == keyinfo)
+ info->page_changed=1;
+
+ if ((keypos < anc_buff+anc_length && (info->state->records & 1)) ||
+ keypos == anc_buff+2+key_reflength)
+ { /* Use page right of anc-page */
+ DBUG_PRINT("test",("use right page"));
+
+ if (keyinfo->flag & HA_BINARY_PACK_KEY)
+ {
+ if (!(next_keypos= _ma_get_key(info, keyinfo,
+ anc_buff, buff, keypos, &length)))
+ goto err;
+ }
+ else
+ {
+ /* Got to end of found key */
+ buff[0]=buff[1]=0; /* Avoid length error check if packed key */
+ if (!(*keyinfo->get_key)(keyinfo,key_reflength,&next_keypos,
+ buff))
+ goto err;
+ }
+ next_page= _ma_kpos(key_reflength,next_keypos);
+ if (!_ma_fetch_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,buff,0))
+ goto err;
+ buff_length= maria_data_on_page(buff);
+ DBUG_DUMP("next",buff,buff_length);
+
+ /* find keys to make a big key-page */
+ bmove(next_keypos-key_reflength, buff+2, key_reflength);
+ if (!_ma_get_last_key(info,keyinfo,anc_buff,anc_key,next_keypos,&length)
+ || !_ma_get_last_key(info,keyinfo,leaf_buff,leaf_key,
+ leaf_buff+leaf_length,&length))
+ goto err;
+
+ /* merge pages and put parting key from anc_buff between */
+ prev_key=(leaf_length == p_length ? (uchar*) 0 : leaf_key);
+ t_length=(*keyinfo->pack_key)(keyinfo,nod_flag,buff+p_length,
+ prev_key, prev_key,
+ anc_key, &s_temp);
+ length=buff_length-p_length;
+ endpos=buff+length+leaf_length+t_length;
+ /* buff will always be larger than before !*/
+ bmove_upp(endpos, buff+buff_length,length);
+ memcpy(buff, leaf_buff,(size_t) leaf_length);
+ (*keyinfo->store_key)(keyinfo,buff+leaf_length,&s_temp);
+ buff_length=(uint) (endpos-buff);
+ maria_putint(buff,buff_length,nod_flag);
+
+ /* remove key from anc_buff */
+
+ if (!(s_length=remove_key(keyinfo,key_reflength,keypos,anc_key,
+ anc_buff+anc_length,(my_off_t *) 0)))
+ goto err;
+
+ anc_length-=s_length;
+ maria_putint(anc_buff,anc_length,key_reflength);
+
+ if (buff_length <= keyinfo->block_length)
+ { /* Keys in one page */
+ memcpy(leaf_buff,buff,(size_t) buff_length);
+ if (_ma_dispose(info,keyinfo,next_page,DFLT_INIT_HITS))
+ goto err;
+ }
+ else
+ { /* Page is full */
+ endpos=anc_buff+anc_length;
+ DBUG_PRINT("test",("anc_buff: 0x%lx endpos: 0x%lx",
+ (long) anc_buff, (long) endpos));
+ if (keypos != anc_buff+2+key_reflength &&
+ !_ma_get_last_key(info,keyinfo,anc_buff,anc_key,keypos,&length))
+ goto err;
+ if (!(half_pos= _ma_find_half_pos(nod_flag, keyinfo, buff, leaf_key,
+ &key_length, &after_key)))
+ goto err;
+ length=(uint) (half_pos-buff);
+ memcpy(leaf_buff,buff,(size_t) length);
+ maria_putint(leaf_buff,length,nod_flag);
+
+ /* Correct new keypointer to leaf_page */
+ half_pos=after_key;
+ _ma_kpointer(info,leaf_key+key_length,next_page);
+ /* Save key in anc_buff */
+ prev_key=(keypos == anc_buff+2+key_reflength ? (uchar*) 0 : anc_key),
+ t_length=(*keyinfo->pack_key)(keyinfo,key_reflength,
+ (keypos == endpos ? (uchar*) 0 :
+ keypos),
+ prev_key, prev_key,
+ leaf_key, &s_temp);
+ if (t_length >= 0)
+ bmove_upp(endpos+t_length, endpos, (uint) (endpos-keypos));
+ else
+ bmove(keypos,keypos-t_length,(uint) (endpos-keypos)+t_length);
+ (*keyinfo->store_key)(keyinfo,keypos,&s_temp);
+ maria_putint(anc_buff,(anc_length+=t_length),key_reflength);
+
+ /* Store key first in new page */
+ if (nod_flag)
+ bmove(buff+2,half_pos-nod_flag,(size_t) nod_flag);
+ if (!(*keyinfo->get_key)(keyinfo,nod_flag,&half_pos,leaf_key))
+ goto err;
+ t_length=(int) (*keyinfo->pack_key)(keyinfo, nod_flag, (uchar*) 0,
+ (uchar*) 0, (uchar*) 0,
+ leaf_key, &s_temp);
+ /* t_length will always be > 0 for a new page !*/
+ length=(uint) ((buff+maria_data_on_page(buff))-half_pos);
+ bmove(buff+p_length+t_length, half_pos, (size_t) length);
+ (*keyinfo->store_key)(keyinfo,buff+p_length,&s_temp);
+ maria_putint(buff,length+t_length+p_length,nod_flag);
+
+ if (_ma_write_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,buff))
+ goto err;
+ }
+ if (_ma_write_keypage(info,keyinfo,leaf_page,DFLT_INIT_HITS,leaf_buff))
+ goto err;
+ DBUG_RETURN(anc_length <= ((info->quick_mode ? MARIA_MIN_BLOCK_LENGTH :
+ (uint) keyinfo->underflow_block_length)));
+ }
+
+ DBUG_PRINT("test",("use left page"));
+
+ keypos= _ma_get_last_key(info,keyinfo,anc_buff,anc_key,keypos,&length);
+ if (!keypos)
+ goto err;
+ next_page= _ma_kpos(key_reflength,keypos);
+ if (!_ma_fetch_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,buff,0))
+ goto err;
+ buff_length= maria_data_on_page(buff);
+ endpos=buff+buff_length;
+ DBUG_DUMP("prev",buff,buff_length);
+
+ /* find keys to make a big key-page */
+ bmove(next_keypos - key_reflength, leaf_buff+2, key_reflength);
+ next_keypos=keypos;
+ if (!(*keyinfo->get_key)(keyinfo,key_reflength,&next_keypos,
+ anc_key))
+ goto err;
+ if (!_ma_get_last_key(info,keyinfo,buff,leaf_key,endpos,&length))
+ goto err;
+
+ /* merge pages and put parting key from anc_buff between */
+ prev_key=(leaf_length == p_length ? (uchar*) 0 : leaf_key);
+ t_length=(*keyinfo->pack_key)(keyinfo,nod_flag,
+ (leaf_length == p_length ?
+ (uchar*) 0 : leaf_buff+p_length),
+ prev_key, prev_key,
+ anc_key, &s_temp);
+ if (t_length >= 0)
+ bmove(endpos+t_length, leaf_buff+p_length,
+ (size_t) (leaf_length-p_length));
+ else /* We gained space */
+ bmove(endpos,leaf_buff+((int) p_length-t_length),
+ (size_t) (leaf_length-p_length+t_length));
+
+ (*keyinfo->store_key)(keyinfo,endpos,&s_temp);
+ buff_length=buff_length+leaf_length-p_length+t_length;
+ maria_putint(buff,buff_length,nod_flag);
+
+ /* remove key from anc_buff */
+ if (!(s_length= remove_key(keyinfo,key_reflength,keypos,anc_key,
+ anc_buff+anc_length,(my_off_t *) 0)))
+ goto err;
+
+ anc_length-=s_length;
+ maria_putint(anc_buff,anc_length,key_reflength);
+
+ if (buff_length <= keyinfo->block_length)
+ { /* Keys in one page */
+ if (_ma_dispose(info,keyinfo,leaf_page,DFLT_INIT_HITS))
+ goto err;
+ }
+ else
+ { /* Page is full */
+ if (keypos == anc_buff+2+key_reflength)
+ anc_pos=0; /* First key */
+ else if (!_ma_get_last_key(info,keyinfo,anc_buff,anc_pos=anc_key,keypos,
+ &length))
+ goto err;
+ endpos= _ma_find_half_pos(nod_flag,keyinfo,buff,leaf_key,
+ &key_length, &half_pos);
+ if (!endpos)
+ goto err;
+ _ma_kpointer(info,leaf_key+key_length,leaf_page);
+ /* Save key in anc_buff */
+ DBUG_DUMP("anc_buff",anc_buff,anc_length);
+ DBUG_DUMP("key_to_anc",leaf_key,key_length);
+
+ temp_pos=anc_buff+anc_length;
+ t_length=(*keyinfo->pack_key)(keyinfo,key_reflength,
+ keypos == temp_pos ? (uchar*) 0
+ : keypos,
+ anc_pos, anc_pos,
+ leaf_key,&s_temp);
+ if (t_length > 0)
+ bmove_upp(temp_pos+t_length, temp_pos, (uint) (temp_pos-keypos));
+ else
+ bmove(keypos,keypos-t_length,(uint) (temp_pos-keypos)+t_length);
+ (*keyinfo->store_key)(keyinfo,keypos,&s_temp);
+ maria_putint(anc_buff,(anc_length+=t_length),key_reflength);
+
+ /* Store first key on new page */
+ if (nod_flag)
+ bmove(leaf_buff+2,half_pos-nod_flag,(size_t) nod_flag);
+ if (!(length=(*keyinfo->get_key)(keyinfo,nod_flag,&half_pos,leaf_key)))
+ goto err;
+ DBUG_DUMP("key_to_leaf",leaf_key,length);
+ t_length=(*keyinfo->pack_key)(keyinfo,nod_flag, (uchar*) 0,
+ (uchar*) 0, (uchar*) 0, leaf_key, &s_temp);
+ length=(uint) ((buff+buff_length)-half_pos);
+ DBUG_PRINT("info",("t_length: %d length: %d",t_length,(int) length));
+ bmove(leaf_buff+p_length+t_length,half_pos,
+ (size_t) length);
+ (*keyinfo->store_key)(keyinfo,leaf_buff+p_length,&s_temp);
+ maria_putint(leaf_buff,length+t_length+p_length,nod_flag);
+ if (_ma_write_keypage(info,keyinfo,leaf_page,DFLT_INIT_HITS,leaf_buff))
+ goto err;
+ maria_putint(buff,endpos-buff,nod_flag);
+ }
+ if (_ma_write_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,buff))
+ goto err;
+ DBUG_RETURN(anc_length <= (uint) keyinfo->block_length/2);
+
+err:
+ DBUG_RETURN(-1);
+} /* underflow */
+
+
+ /*
+ remove a key from packed buffert
+ The current code doesn't handle the case that the next key may be
+ packed better against the previous key if there is a case difference
+ returns how many chars was removed or 0 on error
+ */
+
+static uint remove_key(MARIA_KEYDEF *keyinfo, uint nod_flag,
+ uchar *keypos, /* Where key starts */
+ uchar *lastkey, /* key to be removed */
+ uchar *page_end, /* End of page */
+ my_off_t *next_block) /* ptr to next block */
+{
+ int s_length;
+ uchar *start;
+ DBUG_ENTER("remove_key");
+ DBUG_PRINT("enter",("keypos: 0x%lx page_end: 0x%lx",(long) keypos, (long) page_end));
+
+ start=keypos;
+ if (!(keyinfo->flag &
+ (HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY |
+ HA_BINARY_PACK_KEY)))
+ {
+ s_length=(int) (keyinfo->keylength+nod_flag);
+ if (next_block && nod_flag)
+ *next_block= _ma_kpos(nod_flag,keypos+s_length);
+ }
+ else
+ { /* Let keypos point at next key */
+ /* Calculate length of key */
+ if (!(*keyinfo->get_key)(keyinfo,nod_flag,&keypos,lastkey))
+ DBUG_RETURN(0); /* Error */
+
+ if (next_block && nod_flag)
+ *next_block= _ma_kpos(nod_flag,keypos);
+ s_length=(int) (keypos-start);
+ if (keypos != page_end)
+ {
+ if (keyinfo->flag & HA_BINARY_PACK_KEY)
+ {
+ uchar *old_key=start;
+ uint next_length,prev_length,prev_pack_length;
+ get_key_length(next_length,keypos);
+ get_key_pack_length(prev_length,prev_pack_length,old_key);
+ if (next_length > prev_length)
+ {
+ /* We have to copy data from the current key to the next key */
+ bmove_upp((char*) keypos,(char*) (lastkey+next_length),
+ (next_length-prev_length));
+ keypos-=(next_length-prev_length)+prev_pack_length;
+ store_key_length(keypos,prev_length);
+ s_length=(int) (keypos-start);
+ }
+ }
+ else
+ {
+ /* Check if a variable length first key part */
+ if ((keyinfo->seg->flag & HA_PACK_KEY) && *keypos & 128)
+ {
+ /* Next key is packed against the current one */
+ uint next_length,prev_length,prev_pack_length,lastkey_length,
+ rest_length;
+ if (keyinfo->seg[0].length >= 127)
+ {
+ if (!(prev_length=mi_uint2korr(start) & 32767))
+ goto end;
+ next_length=mi_uint2korr(keypos) & 32767;
+ keypos+=2;
+ prev_pack_length=2;
+ }
+ else
+ {
+ if (!(prev_length= *start & 127))
+ goto end; /* Same key as previous*/
+ next_length= *keypos & 127;
+ keypos++;
+ prev_pack_length=1;
+ }
+ if (!(*start & 128))
+ prev_length=0; /* prev key not packed */
+ if (keyinfo->seg[0].flag & HA_NULL_PART)
+ lastkey++; /* Skip null marker */
+ get_key_length(lastkey_length,lastkey);
+ if (!next_length) /* Same key after */
+ {
+ next_length=lastkey_length;
+ rest_length=0;
+ }
+ else
+ get_key_length(rest_length,keypos);
+
+ if (next_length >= prev_length)
+ { /* Key after is based on deleted key */
+ uint pack_length,tmp;
+ bmove_upp((char*) keypos,(char*) (lastkey+next_length),
+ tmp=(next_length-prev_length));
+ rest_length+=tmp;
+ pack_length= prev_length ? get_pack_length(rest_length): 0;
+ keypos-=tmp+pack_length+prev_pack_length;
+ s_length=(int) (keypos-start);
+ if (prev_length) /* Pack against prev key */
+ {
+ *keypos++= start[0];
+ if (prev_pack_length == 2)
+ *keypos++= start[1];
+ store_key_length(keypos,rest_length);
+ }
+ else
+ {
+ /* Next key is not packed anymore */
+ if (keyinfo->seg[0].flag & HA_NULL_PART)
+ {
+ rest_length++; /* Mark not null */
+ }
+ if (prev_pack_length == 2)
+ {
+ mi_int2store(keypos,rest_length);
+ }
+ else
+ *keypos= rest_length;
+ }
+ }
+ }
+ }
+ }
+ }
+ end:
+ bmove(start, start+s_length, (uint) (page_end-start-s_length));
+ DBUG_RETURN((uint) s_length);
+} /* remove_key */
diff --git a/storage/maria/ma_delete_all.c b/storage/maria/ma_delete_all.c
new file mode 100644
index 00000000000..8cb4fdb8a3e
--- /dev/null
+++ b/storage/maria/ma_delete_all.c
@@ -0,0 +1,161 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Remove all rows from a MARIA table */
+/* This clears the status information and truncates files */
+
+#include "maria_def.h"
+#include "trnman.h"
+
+/**
+ @brief deletes all rows from a table
+
+ @param info Maria handler
+
+ @return Operation status
+ @retval 0 ok
+ @retval 1 error
+*/
+
+int maria_delete_all_rows(MARIA_HA *info)
+{
+ MARIA_SHARE *share=info->s;
+ my_bool log_record;
+ DBUG_ENTER("maria_delete_all_rows");
+
+ if (share->options & HA_OPTION_READ_ONLY_DATA)
+ {
+ DBUG_RETURN(my_errno=EACCES);
+ }
+ /**
+ @todo LOCK take X-lock on table here.
+ When we have versioning, if some other thread is looking at this table,
+ we cannot shrink the file like this.
+ */
+ if (_ma_readinfo(info,F_WRLCK,1))
+ DBUG_RETURN(my_errno);
+ log_record= share->now_transactional && !share->temporary;
+ if (_ma_mark_file_changed(info))
+ goto err;
+
+ if (log_record)
+ {
+ /*
+ This record will be used by Recovery to finish the deletion if it
+ crashed. We force it because it's a non-undoable operation.
+ */
+ LSN lsn;
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+ uchar log_data[FILEID_STORE_SIZE];
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+ if (unlikely(translog_write_record(&lsn, LOGREC_REDO_DELETE_ALL,
+ info->trn, info, 0,
+ sizeof(log_array)/sizeof(log_array[0]),
+ log_array, log_data) ||
+ translog_flush(lsn)))
+ goto err;
+ }
+
+ /*
+ For recovery it matters that this is called after writing the log record,
+ so that resetting state.records actually happens under log's mutex.
+ */
+ _ma_reset_status(info);
+
+ /*
+ If we are using delayed keys or if the user has done changes to the tables
+ since it was locked then there may be key blocks in the page cache. Or
+ there may be data blocks there. We need to throw them away or they may
+ re-enter the emptied table later.
+ */
+ if (_ma_flush_table_files(info, MARIA_FLUSH_DATA|MARIA_FLUSH_INDEX,
+ FLUSH_IGNORE_CHANGED, FLUSH_IGNORE_CHANGED) ||
+ my_chsize(info->dfile.file, 0, 0, MYF(MY_WME)) ||
+ my_chsize(share->kfile.file, share->base.keystart, 0, MYF(MY_WME)) )
+ goto err;
+
+ if (_ma_initialize_data_file(share, info->dfile.file))
+ goto err;
+
+ /*
+ The operations above on the index/data file will be forced to disk at
+ Checkpoint or maria_close() time. So we can reset:
+ */
+ info->trn->rec_lsn= LSN_IMPOSSIBLE;
+
+ VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+#ifdef HAVE_MMAP
+ /* Resize mmaped area */
+ rw_wrlock(&info->s->mmap_lock);
+ _ma_remap_file(info, (my_off_t)0);
+ rw_unlock(&info->s->mmap_lock);
+#endif
+ allow_break(); /* Allow SIGHUP & SIGINT */
+ DBUG_RETURN(0);
+
+err:
+ {
+ int save_errno=my_errno;
+ VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+ info->update|=HA_STATE_WRITTEN; /* Buffer changed */
+ /**
+ @todo RECOVERY if we come here, Recovery may later apply the REDO above,
+ which may be wrong. Not fixing it now, as anyway this way of deleting
+ rows will have to be re-examined when we have versioning.
+ */
+ allow_break(); /* Allow SIGHUP & SIGINT */
+ DBUG_RETURN(my_errno=save_errno);
+ }
+} /* maria_delete_all_rows */
+
+
+/*
+ Reset status information
+
+ SYNOPSIS
+ _ma_reset_status()
+ maria Maria handler
+
+ DESCRIPTION
+ Resets data and index file information as if the file would be empty
+ Files are not touched.
+*/
+
+void _ma_reset_status(MARIA_HA *info)
+{
+ MARIA_SHARE *share= info->s;
+ MARIA_STATE_INFO *state= &share->state;
+ uint i;
+
+ info->state->records= info->state->del= state->split= 0;
+ state->changed= 0; /* File is optimized */
+ state->dellink= HA_OFFSET_ERROR;
+ state->sortkey= (ushort) ~0;
+ info->state->key_file_length= share->base.keystart;
+ info->state->data_file_length= 0;
+ info->state->empty= info->state->key_empty= 0;
+ /**
+ @todo RECOVERY BUG
+ the line below must happen under log's mutex when writing the REDO
+ */
+ info->state->checksum= 0;
+
+ /* Drop the delete key chain. */
+ state->key_del= HA_OFFSET_ERROR;
+ /* Clear all keys */
+ for (i=0 ; i < share->base.keys ; i++)
+ state->key_root[i]= HA_OFFSET_ERROR;
+}
diff --git a/storage/maria/ma_delete_table.c b/storage/maria/ma_delete_table.c
new file mode 100644
index 00000000000..693c68c7e5f
--- /dev/null
+++ b/storage/maria/ma_delete_table.c
@@ -0,0 +1,111 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "ma_fulltext.h"
+#include "trnman_public.h"
+
+/**
+ @brief drops (deletes) a table
+
+ @param name table's name
+
+ @return Operation status
+ @retval 0 ok
+ @retval 1 error
+*/
+
+int maria_delete_table(const char *name)
+{
+ char from[FN_REFLEN];
+#ifdef USE_RAID
+ uint raid_type=0,raid_chunks=0;
+#endif
+ MARIA_HA *info;
+ myf sync_dir;
+ DBUG_ENTER("maria_delete_table");
+
+#ifdef EXTRA_DEBUG
+ _ma_check_table_is_closed(name,"delete");
+#endif
+ /** @todo LOCK take X-lock on table */
+ /*
+ We need to know if this table is transactional.
+ When built with RAID support, we also need to determine if this table
+ makes use of the raid feature. If yes, we need to remove all raid
+ chunks. This is done with my_raid_delete(). Unfortunately it is
+ necessary to open the table just to check this. We use
+ 'open_for_repair' to be able to open even a crashed table. If even
+ this open fails, we assume no raid configuration for this table
+ and try to remove the normal data file only. This may however
+ leave the raid chunks behind.
+ */
+ if (!(info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR)))
+ {
+#ifdef USE_RAID
+ raid_type= 0;
+#endif
+ sync_dir= 0;
+ }
+ else
+ {
+#ifdef USE_RAID
+ raid_type= info->s->base.raid_type;
+ raid_chunks= info->s->base.raid_chunks;
+#endif
+ sync_dir= (info->s->now_transactional && !info->s->temporary &&
+ !maria_in_recovery) ?
+ MY_SYNC_DIR : 0;
+ maria_close(info);
+ }
+#ifdef USE_RAID
+#ifdef EXTRA_DEBUG
+ _ma_check_table_is_closed(name,"delete");
+#endif
+#endif /* USE_RAID */
+
+ if (sync_dir)
+ {
+ /*
+ For this log record to be of any use for Recovery, we need the upper
+ MySQL layer to be crash-safe in DDLs.
+ For now this record can serve when we apply logs to a backup, so we sync
+ it.
+ */
+ LSN lsn;
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char *)name;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= strlen(name) + 1;
+ if (unlikely(translog_write_record(&lsn, LOGREC_REDO_DROP_TABLE,
+ &dummy_transaction_object, NULL,
+ log_array[TRANSLOG_INTERNAL_PARTS +
+ 0].length,
+ sizeof(log_array)/sizeof(log_array[0]),
+ log_array, NULL) ||
+ translog_flush(lsn)))
+ DBUG_RETURN(1);
+ }
+
+ fn_format(from,name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
+ if (my_delete_with_symlink(from, MYF(MY_WME | sync_dir)))
+ DBUG_RETURN(my_errno);
+ fn_format(from,name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
+#ifdef USE_RAID
+ if (raid_type)
+ DBUG_RETURN(my_raid_delete(from, raid_chunks, MYF(MY_WME | sync_dir)) ?
+ my_errno : 0);
+#endif
+ DBUG_RETURN(my_delete_with_symlink(from, MYF(MY_WME | sync_dir)) ?
+ my_errno : 0);
+}
diff --git a/storage/maria/ma_dynrec.c b/storage/maria/ma_dynrec.c
new file mode 100644
index 00000000000..6e13fbcecb6
--- /dev/null
+++ b/storage/maria/ma_dynrec.c
@@ -0,0 +1,1972 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Functions to handle space-packed-records and blobs
+
+ A row may be stored in one or more linked blocks.
+ The block size is between MARIA_MIN_BLOCK_LENGTH and MARIA_MAX_BLOCK_LENGTH.
+ Each block is aligned on MARIA_DYN_ALIGN_SIZE.
+ The reson for the max block size is to not have too many different types
+ of blocks. For the differnet block types, look at _ma_get_block_info()
+*/
+
+#include "maria_def.h"
+
+static my_bool write_dynamic_record(MARIA_HA *info,const uchar *record,
+ ulong reclength);
+static int _ma_find_writepos(MARIA_HA *info,ulong reclength,my_off_t *filepos,
+ ulong *length);
+static my_bool update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos,
+ uchar *record, ulong reclength);
+static my_bool delete_dynamic_record(MARIA_HA *info,MARIA_RECORD_POS filepos,
+ uint second_read);
+static my_bool _ma_cmp_buffer(File file, const uchar *buff, my_off_t filepos,
+ uint length);
+
+#ifdef THREAD
+/* Play it safe; We have a small stack when using threads */
+#undef my_alloca
+#undef my_afree
+#define my_alloca(A) my_malloc((A),MYF(0))
+#define my_afree(A) my_free((A),MYF(0))
+#endif
+
+ /* Interface function from MARIA_HA */
+
+#ifdef HAVE_MMAP
+
+/*
+ Create mmaped area for MARIA handler
+
+ SYNOPSIS
+ _ma_dynmap_file()
+ info MARIA handler
+
+ RETURN
+ 0 ok
+ 1 error.
+*/
+
+my_bool _ma_dynmap_file(MARIA_HA *info, my_off_t size)
+{
+ DBUG_ENTER("_ma_dynmap_file");
+ if (size > (my_off_t) (~((size_t) 0)) - MEMMAP_EXTRA_MARGIN)
+ {
+ DBUG_PRINT("warning", ("File is too large for mmap"));
+ DBUG_RETURN(1);
+ }
+ /*
+ Ingo wonders if it is good to use MAP_NORESERVE. From the Linux man page:
+ MAP_NORESERVE
+ Do not reserve swap space for this mapping. When swap space is
+ reserved, one has the guarantee that it is possible to modify the
+ mapping. When swap space is not reserved one might get SIGSEGV
+ upon a write if no physical memory is available.
+ */
+ info->s->file_map= (uchar*)
+ my_mmap(0, (size_t)(size + MEMMAP_EXTRA_MARGIN),
+ info->s->mode==O_RDONLY ? PROT_READ :
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_NORESERVE,
+ info->dfile.file, 0L);
+ if (info->s->file_map == (uchar*) MAP_FAILED)
+ {
+ info->s->file_map= NULL;
+ DBUG_RETURN(1);
+ }
+#if defined(HAVE_MADVISE)
+ madvise(info->s->file_map, size, MADV_RANDOM);
+#endif
+ info->s->mmaped_length= size;
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Resize mmaped area for MARIA handler
+
+ SYNOPSIS
+ _ma_remap_file()
+ info MARIA handler
+
+ RETURN
+*/
+
+void _ma_remap_file(MARIA_HA *info, my_off_t size)
+{
+ if (info->s->file_map)
+ {
+ VOID(my_munmap(info->s->file_map,
+ (size_t) info->s->mmaped_length + MEMMAP_EXTRA_MARGIN));
+ _ma_dynmap_file(info, size);
+ }
+}
+#endif
+
+
+/*
+ Read bytes from MySAM handler, using mmap or pread
+
+ SYNOPSIS
+ _ma_mmap_pread()
+ info MARIA handler
+ Buffer Input buffer
+ Count Count of bytes for read
+ offset Start position
+ MyFlags
+
+ RETURN
+ 0 ok
+*/
+
+uint _ma_mmap_pread(MARIA_HA *info, uchar *Buffer,
+ uint Count, my_off_t offset, myf MyFlags)
+{
+ DBUG_PRINT("info", ("maria_read with mmap %d\n", info->dfile.file));
+ if (info->s->concurrent_insert)
+ rw_rdlock(&info->s->mmap_lock);
+
+ /*
+ The following test may fail in the following cases:
+ - We failed to remap a memory area (fragmented memory?)
+ - This thread has done some writes, but not yet extended the
+ memory mapped area.
+ */
+
+ if (info->s->mmaped_length >= offset + Count)
+ {
+ memcpy(Buffer, info->s->file_map + offset, Count);
+ if (info->s->concurrent_insert)
+ rw_unlock(&info->s->mmap_lock);
+ return 0;
+ }
+ else
+ {
+ if (info->s->concurrent_insert)
+ rw_unlock(&info->s->mmap_lock);
+ return my_pread(info->dfile.file, Buffer, Count, offset, MyFlags);
+ }
+}
+
+
+ /* wrapper for my_pread in case if mmap isn't used */
+
+uint _ma_nommap_pread(MARIA_HA *info, uchar *Buffer,
+ uint Count, my_off_t offset, myf MyFlags)
+{
+ return my_pread(info->dfile.file, Buffer, Count, offset, MyFlags);
+}
+
+
+/*
+ Write bytes to MySAM handler, using mmap or pwrite
+
+ SYNOPSIS
+ _ma_mmap_pwrite()
+ info MARIA handler
+ Buffer Output buffer
+ Count Count of bytes for write
+ offset Start position
+ MyFlags
+
+ RETURN
+ 0 ok
+ !=0 error. In this case return error from pwrite
+*/
+
+uint _ma_mmap_pwrite(MARIA_HA *info, uchar *Buffer,
+ uint Count, my_off_t offset, myf MyFlags)
+{
+ DBUG_PRINT("info", ("maria_write with mmap %d\n", info->dfile.file));
+ if (info->s->concurrent_insert)
+ rw_rdlock(&info->s->mmap_lock);
+
+ /*
+ The following test may fail in the following cases:
+ - We failed to remap a memory area (fragmented memory?)
+ - This thread has done some writes, but not yet extended the
+ memory mapped area.
+ */
+
+ if (info->s->mmaped_length >= offset + Count)
+ {
+ memcpy(info->s->file_map + offset, Buffer, Count);
+ if (info->s->concurrent_insert)
+ rw_unlock(&info->s->mmap_lock);
+ return 0;
+ }
+ else
+ {
+ info->s->nonmmaped_inserts++;
+ if (info->s->concurrent_insert)
+ rw_unlock(&info->s->mmap_lock);
+ return my_pwrite(info->dfile.file, Buffer, Count, offset, MyFlags);
+ }
+
+}
+
+
+ /* wrapper for my_pwrite in case if mmap isn't used */
+
+uint _ma_nommap_pwrite(MARIA_HA *info, uchar *Buffer,
+ uint Count, my_off_t offset, myf MyFlags)
+{
+ return my_pwrite(info->dfile.file, Buffer, Count, offset, MyFlags);
+}
+
+
+my_bool _ma_write_dynamic_record(MARIA_HA *info, const uchar *record)
+{
+ ulong reclength= _ma_rec_pack(info,info->rec_buff + MARIA_REC_BUFF_OFFSET,
+ record);
+ return (write_dynamic_record(info,info->rec_buff + MARIA_REC_BUFF_OFFSET,
+ reclength));
+}
+
+my_bool _ma_update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS pos,
+ const uchar *oldrec __attribute__ ((unused)),
+ const uchar *record)
+{
+ uint length= _ma_rec_pack(info, info->rec_buff + MARIA_REC_BUFF_OFFSET,
+ record);
+ return (update_dynamic_record(info, pos,
+ info->rec_buff + MARIA_REC_BUFF_OFFSET,
+ length));
+}
+
+
+my_bool _ma_write_blob_record(MARIA_HA *info, const uchar *record)
+{
+ uchar *rec_buff;
+ int error;
+ ulong reclength,reclength2,extra;
+
+ extra= (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+
+ MARIA_DYN_DELETE_BLOCK_HEADER+1);
+ reclength= (info->s->base.pack_reclength +
+ _ma_calc_total_blob_length(info,record)+ extra);
+ if (!(rec_buff=(uchar*) my_alloca(reclength)))
+ {
+ my_errno= HA_ERR_OUT_OF_MEM; /* purecov: inspected */
+ return(1);
+ }
+ reclength2= _ma_rec_pack(info,
+ rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER),
+ record);
+ DBUG_PRINT("info",("reclength: %lu reclength2: %lu",
+ reclength, reclength2));
+ DBUG_ASSERT(reclength2 <= reclength);
+ error= write_dynamic_record(info,
+ rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER),
+ reclength2);
+ my_afree(rec_buff);
+ return(error != 0);
+}
+
+
+my_bool _ma_update_blob_record(MARIA_HA *info, MARIA_RECORD_POS pos,
+ const uchar *oldrec __attribute__ ((unused)),
+ const uchar *record)
+{
+ uchar *rec_buff;
+ int error;
+ ulong reclength,extra;
+
+ extra= (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+
+ MARIA_DYN_DELETE_BLOCK_HEADER);
+ reclength= (info->s->base.pack_reclength+
+ _ma_calc_total_blob_length(info,record)+ extra);
+#ifdef NOT_USED /* We now support big rows */
+ if (reclength > MARIA_DYN_MAX_ROW_LENGTH)
+ {
+ my_errno=HA_ERR_TO_BIG_ROW;
+ return 1;
+ }
+#endif
+ if (!(rec_buff=(uchar*) my_alloca(reclength)))
+ {
+ my_errno= HA_ERR_OUT_OF_MEM; /* purecov: inspected */
+ return(1);
+ }
+ reclength= _ma_rec_pack(info,rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER),
+ record);
+ error=update_dynamic_record(info,pos,
+ rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER),
+ reclength);
+ my_afree(rec_buff);
+ return(error != 0);
+}
+
+
+my_bool _ma_delete_dynamic_record(MARIA_HA *info,
+ const uchar *record __attribute__ ((unused)))
+{
+ return delete_dynamic_record(info, info->cur_row.lastpos, 0);
+}
+
+
+ /* Write record to data-file */
+
+static my_bool write_dynamic_record(MARIA_HA *info, const uchar *record,
+ ulong reclength)
+{
+ int flag;
+ ulong length;
+ my_off_t filepos;
+ DBUG_ENTER("write_dynamic_record");
+
+ flag=0;
+ do
+ {
+ if (_ma_find_writepos(info,reclength,&filepos,&length))
+ goto err;
+ if (_ma_write_part_record(info,filepos,length,
+ (info->append_insert_at_end ?
+ HA_OFFSET_ERROR : info->s->state.dellink),
+ (uchar**) &record,&reclength,&flag))
+ goto err;
+ } while (reclength);
+
+ DBUG_RETURN(0);
+err:
+ DBUG_RETURN(1);
+}
+
+
+ /* Get a block for data ; The given data-area must be used !! */
+
+static int _ma_find_writepos(MARIA_HA *info,
+ ulong reclength, /* record length */
+ my_off_t *filepos, /* Return file pos */
+ ulong *length) /* length of block at filepos */
+{
+ MARIA_BLOCK_INFO block_info;
+ ulong tmp;
+ DBUG_ENTER("_ma_find_writepos");
+
+ if (info->s->state.dellink != HA_OFFSET_ERROR &&
+ !info->append_insert_at_end)
+ {
+ /* Deleted blocks exists; Get last used block */
+ *filepos=info->s->state.dellink;
+ block_info.second_read=0;
+ info->rec_cache.seek_not_done=1;
+ if (!(_ma_get_block_info(&block_info, info->dfile.file,
+ info->s->state.dellink) &
+ BLOCK_DELETED))
+ {
+ DBUG_PRINT("error",("Delete link crashed"));
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+ DBUG_RETURN(-1);
+ }
+ info->s->state.dellink=block_info.next_filepos;
+ info->state->del--;
+ info->state->empty-= block_info.block_len;
+ *length= block_info.block_len;
+ }
+ else
+ {
+ /* No deleted blocks; Allocate a new block */
+ *filepos=info->state->data_file_length;
+ if ((tmp=reclength+3 + test(reclength >= (65520-3))) <
+ info->s->base.min_block_length)
+ tmp= info->s->base.min_block_length;
+ else
+ tmp= ((tmp+MARIA_DYN_ALIGN_SIZE-1) &
+ (~ (ulong) (MARIA_DYN_ALIGN_SIZE-1)));
+ if (info->state->data_file_length >
+ (info->s->base.max_data_file_length - tmp))
+ {
+ my_errno=HA_ERR_RECORD_FILE_FULL;
+ DBUG_RETURN(-1);
+ }
+ if (tmp > MARIA_MAX_BLOCK_LENGTH)
+ tmp=MARIA_MAX_BLOCK_LENGTH;
+ *length= tmp;
+ info->state->data_file_length+= tmp;
+ info->s->state.split++;
+ info->update|=HA_STATE_WRITE_AT_END;
+ }
+ DBUG_RETURN(0);
+} /* _ma_find_writepos */
+
+
+
+/*
+ Unlink a deleted block from the deleted list.
+ This block will be combined with the preceding or next block to form
+ a big block.
+*/
+
+static bool unlink_deleted_block(MARIA_HA *info, MARIA_BLOCK_INFO *block_info)
+{
+ DBUG_ENTER("unlink_deleted_block");
+ if (block_info->filepos == info->s->state.dellink)
+ {
+ /* First deleted block; We can just use this ! */
+ info->s->state.dellink=block_info->next_filepos;
+ }
+ else
+ {
+ MARIA_BLOCK_INFO tmp;
+ tmp.second_read=0;
+ /* Unlink block from the previous block */
+ if (!(_ma_get_block_info(&tmp, info->dfile.file, block_info->prev_filepos)
+ & BLOCK_DELETED))
+ DBUG_RETURN(1); /* Something is wrong */
+ mi_sizestore(tmp.header+4,block_info->next_filepos);
+ if (info->s->file_write(info,(char*) tmp.header+4,8,
+ block_info->prev_filepos+4, MYF(MY_NABP)))
+ DBUG_RETURN(1);
+ /* Unlink block from next block */
+ if (block_info->next_filepos != HA_OFFSET_ERROR)
+ {
+ if (!(_ma_get_block_info(&tmp, info->dfile.file,
+ block_info->next_filepos)
+ & BLOCK_DELETED))
+ DBUG_RETURN(1); /* Something is wrong */
+ mi_sizestore(tmp.header+12,block_info->prev_filepos);
+ if (info->s->file_write(info,(char*) tmp.header+12,8,
+ block_info->next_filepos+12,
+ MYF(MY_NABP)))
+ DBUG_RETURN(1);
+ }
+ }
+ /* We now have one less deleted block */
+ info->state->del--;
+ info->state->empty-= block_info->block_len;
+ info->s->state.split--;
+
+ /*
+ If this was a block that we where accessing through table scan
+ (maria_rrnd() or maria_scan(), then ensure that we skip over this block
+ when doing next maria_rrnd() or maria_scan().
+ */
+ if (info->cur_row.nextpos == block_info->filepos)
+ info->cur_row.nextpos+= block_info->block_len;
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Add a backward link to delete block
+
+ SYNOPSIS
+ update_backward_delete_link()
+ info MARIA handler
+ delete_block Position to delete block to update.
+ If this is 'HA_OFFSET_ERROR', nothing will be done
+ filepos Position to block that 'delete_block' should point to
+
+ RETURN
+ 0 ok
+ 1 error. In this case my_error is set.
+*/
+
+static my_bool update_backward_delete_link(MARIA_HA *info,
+ my_off_t delete_block,
+ MARIA_RECORD_POS filepos)
+{
+ MARIA_BLOCK_INFO block_info;
+ DBUG_ENTER("update_backward_delete_link");
+
+ if (delete_block != HA_OFFSET_ERROR)
+ {
+ block_info.second_read=0;
+ if (_ma_get_block_info(&block_info, info->dfile.file, delete_block)
+ & BLOCK_DELETED)
+ {
+ char buff[8];
+ mi_sizestore(buff,filepos);
+ if (info->s->file_write(info,buff, 8, delete_block+12, MYF(MY_NABP)))
+ DBUG_RETURN(1); /* Error on write */
+ }
+ else
+ {
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+ DBUG_RETURN(1); /* Wrong delete link */
+ }
+ }
+ DBUG_RETURN(0);
+}
+
+/* Delete datarecord from database */
+/* info->rec_cache.seek_not_done is updated in cmp_record */
+
+static my_bool delete_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos,
+ uint second_read)
+{
+ uint length,b_type;
+ MARIA_BLOCK_INFO block_info,del_block;
+ int error;
+ my_bool remove_next_block;
+ DBUG_ENTER("delete_dynamic_record");
+
+ /* First add a link from the last block to the new one */
+ error= update_backward_delete_link(info, info->s->state.dellink, filepos);
+
+ block_info.second_read=second_read;
+ do
+ {
+ /* Remove block at 'filepos' */
+ if ((b_type= _ma_get_block_info(&block_info, info->dfile.file, filepos))
+ & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+ BLOCK_FATAL_ERROR) ||
+ (length=(uint) (block_info.filepos-filepos) +block_info.block_len) <
+ MARIA_MIN_BLOCK_LENGTH)
+ {
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+ DBUG_RETURN(1);
+ }
+ /* Check if next block is a delete block */
+ del_block.second_read=0;
+ remove_next_block=0;
+ if (_ma_get_block_info(&del_block, info->dfile.file, filepos + length) &
+ BLOCK_DELETED && del_block.block_len+length <
+ MARIA_DYN_MAX_BLOCK_LENGTH)
+ {
+ /* We can't remove this yet as this block may be the head block */
+ remove_next_block=1;
+ length+=del_block.block_len;
+ }
+
+ block_info.header[0]=0;
+ mi_int3store(block_info.header+1,length);
+ mi_sizestore(block_info.header+4,info->s->state.dellink);
+ if (b_type & BLOCK_LAST)
+ bfill(block_info.header+12,8,255);
+ else
+ mi_sizestore(block_info.header+12,block_info.next_filepos);
+ if (info->s->file_write(info,(uchar*) block_info.header,20,filepos,
+ MYF(MY_NABP)))
+ DBUG_RETURN(1);
+ info->s->state.dellink = filepos;
+ info->state->del++;
+ info->state->empty+=length;
+ filepos=block_info.next_filepos;
+
+ /* Now it's safe to unlink the deleted block directly after this one */
+ if (remove_next_block && unlink_deleted_block(info,&del_block))
+ error=1;
+ } while (!(b_type & BLOCK_LAST));
+
+ DBUG_RETURN(error);
+}
+
+
+ /* Write a block to datafile */
+
+int _ma_write_part_record(MARIA_HA *info,
+ my_off_t filepos, /* points at empty block */
+ ulong length, /* length of block */
+ my_off_t next_filepos,/* Next empty block */
+ uchar **record, /* pointer to record ptr */
+ ulong *reclength, /* length of *record */
+ int *flag) /* *flag == 0 if header */
+{
+ ulong head_length,res_length,extra_length,long_block,del_length;
+ uchar *pos,*record_end;
+ my_off_t next_delete_block;
+ uchar temp[MARIA_SPLIT_LENGTH+MARIA_DYN_DELETE_BLOCK_HEADER];
+ DBUG_ENTER("_ma_write_part_record");
+
+ next_delete_block=HA_OFFSET_ERROR;
+
+ res_length=extra_length=0;
+ if (length > *reclength + MARIA_SPLIT_LENGTH)
+ { /* Splitt big block */
+ res_length=MY_ALIGN(length- *reclength - MARIA_EXTEND_BLOCK_LENGTH,
+ MARIA_DYN_ALIGN_SIZE);
+ length-= res_length; /* Use this for first part */
+ }
+ long_block= (length < 65520L && *reclength < 65520L) ? 0 : 1;
+ if (length == *reclength+ 3 + long_block)
+ {
+ /* Block is exactly of the right length */
+ temp[0]=(uchar) (1+ *flag)+(uchar) long_block; /* Flag is 0 or 6 */
+ if (long_block)
+ {
+ mi_int3store(temp+1,*reclength);
+ head_length=4;
+ }
+ else
+ {
+ mi_int2store(temp+1,*reclength);
+ head_length=3;
+ }
+ }
+ else if (length-long_block < *reclength+4)
+ { /* To short block */
+ if (next_filepos == HA_OFFSET_ERROR)
+ next_filepos= (info->s->state.dellink != HA_OFFSET_ERROR &&
+ !info->append_insert_at_end ?
+ info->s->state.dellink : info->state->data_file_length);
+ if (*flag == 0) /* First block */
+ {
+ if (*reclength > MARIA_MAX_BLOCK_LENGTH)
+ {
+ head_length= 16;
+ temp[0]=13;
+ mi_int4store(temp+1,*reclength);
+ mi_int3store(temp+5,length-head_length);
+ mi_sizestore((uchar*) temp+8,next_filepos);
+ }
+ else
+ {
+ head_length=5+8+long_block*2;
+ temp[0]=5+(uchar) long_block;
+ if (long_block)
+ {
+ mi_int3store(temp+1,*reclength);
+ mi_int3store(temp+4,length-head_length);
+ mi_sizestore((uchar*) temp+7,next_filepos);
+ }
+ else
+ {
+ mi_int2store(temp+1,*reclength);
+ mi_int2store(temp+3,length-head_length);
+ mi_sizestore((uchar*) temp+5,next_filepos);
+ }
+ }
+ }
+ else
+ {
+ head_length=3+8+long_block;
+ temp[0]=11+(uchar) long_block;
+ if (long_block)
+ {
+ mi_int3store(temp+1,length-head_length);
+ mi_sizestore((uchar*) temp+4,next_filepos);
+ }
+ else
+ {
+ mi_int2store(temp+1,length-head_length);
+ mi_sizestore((uchar*) temp+3,next_filepos);
+ }
+ }
+ }
+ else
+ { /* Block with empty info last */
+ head_length=4+long_block;
+ extra_length= length- *reclength-head_length;
+ temp[0]= (uchar) (3+ *flag)+(uchar) long_block; /* 3,4 or 9,10 */
+ if (long_block)
+ {
+ mi_int3store(temp+1,*reclength);
+ temp[4]= (uchar) (extra_length);
+ }
+ else
+ {
+ mi_int2store(temp+1,*reclength);
+ temp[3]= (uchar) (extra_length);
+ }
+ length= *reclength+head_length; /* Write only what is needed */
+ }
+ DBUG_DUMP("header",(uchar*) temp,head_length);
+
+ /* Make a long block for one write */
+ record_end= *record+length-head_length;
+ del_length=(res_length ? MARIA_DYN_DELETE_BLOCK_HEADER : 0);
+ bmove((uchar*) (*record-head_length),(uchar*) temp,head_length);
+ memcpy(temp,record_end,(size_t) (extra_length+del_length));
+ bzero((uchar*) record_end,extra_length);
+
+ if (res_length)
+ {
+ /* Check first if we can join this block with the next one */
+ MARIA_BLOCK_INFO del_block;
+ my_off_t next_block=filepos+length+extra_length+res_length;
+
+ del_block.second_read=0;
+ if (next_block < info->state->data_file_length &&
+ info->s->state.dellink != HA_OFFSET_ERROR)
+ {
+ if ((_ma_get_block_info(&del_block, info->dfile.file, next_block)
+ & BLOCK_DELETED) &&
+ res_length + del_block.block_len < MARIA_DYN_MAX_BLOCK_LENGTH)
+ {
+ if (unlink_deleted_block(info,&del_block))
+ goto err;
+ res_length+=del_block.block_len;
+ }
+ }
+
+ /* Create a delete link of the last part of the block */
+ pos=record_end+extra_length;
+ pos[0]= '\0';
+ mi_int3store(pos+1,res_length);
+ mi_sizestore(pos+4,info->s->state.dellink);
+ bfill(pos+12,8,255); /* End link */
+ next_delete_block=info->s->state.dellink;
+ info->s->state.dellink= filepos+length+extra_length;
+ info->state->del++;
+ info->state->empty+=res_length;
+ info->s->state.split++;
+ }
+ if (info->opt_flag & WRITE_CACHE_USED &&
+ info->update & HA_STATE_WRITE_AT_END)
+ {
+ if (info->update & HA_STATE_EXTEND_BLOCK)
+ {
+ info->update&= ~HA_STATE_EXTEND_BLOCK;
+ if (my_block_write(&info->rec_cache,(uchar*) *record-head_length,
+ length+extra_length+del_length,filepos))
+ goto err;
+ }
+ else if (my_b_write(&info->rec_cache,(uchar*) *record-head_length,
+ length+extra_length+del_length))
+ goto err;
+ }
+ else
+ {
+ info->rec_cache.seek_not_done=1;
+ if (info->s->file_write(info,(uchar*) *record-head_length,
+ length+extra_length+
+ del_length,filepos,info->s->write_flag))
+ goto err;
+ }
+ memcpy(record_end,temp,(size_t) (extra_length+del_length));
+ *record=record_end;
+ *reclength-=(length-head_length);
+ *flag=6;
+
+ if (del_length)
+ {
+ /* link the next delete block to this */
+ if (update_backward_delete_link(info, next_delete_block,
+ info->s->state.dellink))
+ goto err;
+ }
+
+ DBUG_RETURN(0);
+err:
+ DBUG_PRINT("exit",("errno: %d",my_errno));
+ DBUG_RETURN(1);
+} /* _ma_write_part_record */
+
+
+ /* update record from datafile */
+
+static my_bool update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos,
+ uchar *record, ulong reclength)
+{
+ int flag;
+ uint error;
+ ulong length;
+ MARIA_BLOCK_INFO block_info;
+ DBUG_ENTER("update_dynamic_record");
+
+ flag=block_info.second_read=0;
+ while (reclength > 0)
+ {
+ if (filepos != info->s->state.dellink)
+ {
+ block_info.next_filepos= HA_OFFSET_ERROR;
+ if ((error= _ma_get_block_info(&block_info, info->dfile.file, filepos))
+ & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+ BLOCK_FATAL_ERROR))
+ {
+ DBUG_PRINT("error",("Got wrong block info"));
+ if (!(error & BLOCK_FATAL_ERROR))
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+ goto err;
+ }
+ length=(ulong) (block_info.filepos-filepos) + block_info.block_len;
+ if (length < reclength)
+ {
+ uint tmp=MY_ALIGN(reclength - length + 3 +
+ test(reclength >= 65520L),MARIA_DYN_ALIGN_SIZE);
+ /* Don't create a block bigger than MARIA_MAX_BLOCK_LENGTH */
+ tmp= min(length+tmp, MARIA_MAX_BLOCK_LENGTH)-length;
+ /* Check if we can extend this block */
+ if (block_info.filepos + block_info.block_len ==
+ info->state->data_file_length &&
+ info->state->data_file_length <
+ info->s->base.max_data_file_length-tmp)
+ {
+ /* extend file */
+ DBUG_PRINT("info",("Extending file with %d bytes",tmp));
+ if (info->cur_row.nextpos == info->state->data_file_length)
+ info->cur_row.nextpos+= tmp;
+ info->state->data_file_length+= tmp;
+ info->update|= HA_STATE_WRITE_AT_END | HA_STATE_EXTEND_BLOCK;
+ length+=tmp;
+ }
+ else if (length < MARIA_MAX_BLOCK_LENGTH - MARIA_MIN_BLOCK_LENGTH)
+ {
+ /*
+ Check if next block is a deleted block
+ Above we have MARIA_MIN_BLOCK_LENGTH to avoid the problem where
+ the next block is so small it can't be splited which could
+ casue problems
+ */
+
+ MARIA_BLOCK_INFO del_block;
+ del_block.second_read=0;
+ if (_ma_get_block_info(&del_block, info->dfile.file,
+ block_info.filepos + block_info.block_len) &
+ BLOCK_DELETED)
+ {
+ /* Use; Unlink it and extend the current block */
+ DBUG_PRINT("info",("Extending current block"));
+ if (unlink_deleted_block(info,&del_block))
+ goto err;
+ if ((length+=del_block.block_len) > MARIA_MAX_BLOCK_LENGTH)
+ {
+ /*
+ New block was too big, link overflow part back to
+ delete list
+ */
+ my_off_t next_pos;
+ ulong rest_length= length-MARIA_MAX_BLOCK_LENGTH;
+ set_if_bigger(rest_length, MARIA_MIN_BLOCK_LENGTH);
+ next_pos= del_block.filepos+ del_block.block_len - rest_length;
+
+ if (update_backward_delete_link(info, info->s->state.dellink,
+ next_pos))
+ DBUG_RETURN(1);
+
+ /* create delete link for data that didn't fit into the page */
+ del_block.header[0]=0;
+ mi_int3store(del_block.header+1, rest_length);
+ mi_sizestore(del_block.header+4,info->s->state.dellink);
+ bfill(del_block.header+12,8,255);
+ if (info->s->file_write(info,(uchar*) del_block.header, 20,
+ next_pos, MYF(MY_NABP)))
+ DBUG_RETURN(1);
+ info->s->state.dellink= next_pos;
+ info->s->state.split++;
+ info->state->del++;
+ info->state->empty+= rest_length;
+ length-= rest_length;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ if (_ma_find_writepos(info,reclength,&filepos,&length))
+ goto err;
+ }
+ if (_ma_write_part_record(info,filepos,length,block_info.next_filepos,
+ &record,&reclength,&flag))
+ goto err;
+ if ((filepos=block_info.next_filepos) == HA_OFFSET_ERROR)
+ {
+ /* Start writing data on deleted blocks */
+ filepos=info->s->state.dellink;
+ }
+ }
+
+ if (block_info.next_filepos != HA_OFFSET_ERROR)
+ if (delete_dynamic_record(info,block_info.next_filepos,1))
+ goto err;
+ DBUG_RETURN(0);
+err:
+ DBUG_RETURN(1);
+}
+
+
+ /* Pack a record. Return new reclength */
+
+uint _ma_rec_pack(MARIA_HA *info, register uchar *to,
+ register const uchar *from)
+{
+ uint length,new_length,flag,bit,i;
+ uchar *pos,*end,*startpos,*packpos;
+ enum en_fieldtype type;
+ reg3 MARIA_COLUMNDEF *column;
+ MARIA_BLOB *blob;
+ DBUG_ENTER("_ma_rec_pack");
+
+ flag= 0;
+ bit= 1;
+ startpos= packpos=to;
+ to+= info->s->base.pack_bytes;
+ blob= info->blobs;
+ column= info->s->columndef;
+ if (info->s->base.null_bytes)
+ {
+ memcpy(to, from, info->s->base.null_bytes);
+ from+= info->s->base.null_bytes;
+ to+= info->s->base.null_bytes;
+ }
+
+ for (i=info->s->base.fields ; i-- > 0; from+= length, column++)
+ {
+ length=(uint) column->length;
+ if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL)
+ {
+ if (type == FIELD_BLOB)
+ {
+ if (!blob->length)
+ flag|=bit;
+ else
+ {
+ char *temp_pos;
+ size_t tmp_length=length-portable_sizeof_char_ptr;
+ memcpy((uchar*) to,from,tmp_length);
+ memcpy_fixed(&temp_pos,from+tmp_length,sizeof(char*));
+ memcpy(to+tmp_length,temp_pos,(size_t) blob->length);
+ to+=tmp_length+blob->length;
+ }
+ blob++;
+ }
+ else if (type == FIELD_SKIP_ZERO)
+ {
+ if (memcmp((uchar*) from, maria_zero_string, length) == 0)
+ flag|=bit;
+ else
+ {
+ memcpy((uchar*) to,from,(size_t) length); to+=length;
+ }
+ }
+ else if (type == FIELD_SKIP_ENDSPACE ||
+ type == FIELD_SKIP_PRESPACE)
+ {
+ pos= (uchar*) from; end= (uchar*) from + length;
+ if (type == FIELD_SKIP_ENDSPACE)
+ { /* Pack trailing spaces */
+ while (end > from && *(end-1) == ' ')
+ end--;
+ }
+ else
+ { /* Pack pref-spaces */
+ while (pos < end && *pos == ' ')
+ pos++;
+ }
+ new_length=(uint) (end-pos);
+ if (new_length +1 + test(column->length > 255 && new_length > 127)
+ < length)
+ {
+ if (column->length > 255 && new_length > 127)
+ {
+ to[0]=(char) ((new_length & 127)+128);
+ to[1]=(char) (new_length >> 7);
+ to+=2;
+ }
+ else
+ *to++= (char) new_length;
+ memcpy((uchar*) to,pos,(size_t) new_length); to+=new_length;
+ flag|=bit;
+ }
+ else
+ {
+ memcpy(to,from,(size_t) length); to+=length;
+ }
+ }
+ else if (type == FIELD_VARCHAR)
+ {
+ uint pack_length= HA_VARCHAR_PACKLENGTH(column->length -1);
+ uint tmp_length;
+ if (pack_length == 1)
+ {
+ tmp_length= (uint) *(uchar*) from;
+ *to++= *from;
+ }
+ else
+ {
+ tmp_length= uint2korr(from);
+ store_key_length_inc(to,tmp_length);
+ }
+ memcpy(to, from+pack_length,tmp_length);
+ to+= tmp_length;
+ continue;
+ }
+ else
+ {
+ memcpy(to,from,(size_t) length); to+=length;
+ continue; /* Normal field */
+ }
+ if ((bit= bit << 1) >= 256)
+ {
+ *packpos++ = (char) (uchar) flag;
+ bit=1; flag=0;
+ }
+ }
+ else
+ {
+ memcpy(to,from,(size_t) length); to+=length;
+ }
+ }
+ if (bit != 1)
+ *packpos= (char) (uchar) flag;
+ if (info->s->calc_checksum)
+ *to++= (uchar) info->cur_row.checksum;
+ DBUG_PRINT("exit",("packed length: %d",(int) (to-startpos)));
+ DBUG_RETURN((uint) (to-startpos));
+} /* _ma_rec_pack */
+
+
+
+/*
+ Check if a record was correctly packed. Used only by maria_chk
+ Returns 0 if record is ok.
+*/
+
+my_bool _ma_rec_check(MARIA_HA *info,const uchar *record, uchar *rec_buff,
+ ulong packed_length, my_bool with_checksum,
+ ha_checksum checksum)
+{
+ uint length,new_length,flag,bit,i;
+ uchar *pos,*end,*packpos,*to;
+ enum en_fieldtype type;
+ reg3 MARIA_COLUMNDEF *column;
+ DBUG_ENTER("_ma_rec_check");
+
+ packpos=rec_buff; to= rec_buff+info->s->base.pack_bytes;
+ column= info->s->columndef;
+ flag= *packpos; bit=1;
+ record+= info->s->base.null_bytes;
+ to+= info->s->base.null_bytes;
+
+ for (i=info->s->base.fields ; i-- > 0; record+= length, column++)
+ {
+ length=(uint) column->length;
+ if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL)
+ {
+ if (type == FIELD_BLOB)
+ {
+ uint blob_length=
+ _ma_calc_blob_length(length-portable_sizeof_char_ptr,record);
+ if (!blob_length && !(flag & bit))
+ goto err;
+ if (blob_length)
+ to+=length - portable_sizeof_char_ptr+ blob_length;
+ }
+ else if (type == FIELD_SKIP_ZERO)
+ {
+ if (memcmp((uchar*) record, maria_zero_string, length) == 0)
+ {
+ if (!(flag & bit))
+ goto err;
+ }
+ else
+ to+=length;
+ }
+ else if (type == FIELD_SKIP_ENDSPACE ||
+ type == FIELD_SKIP_PRESPACE)
+ {
+ pos= (uchar*) record; end= (uchar*) record + length;
+ if (type == FIELD_SKIP_ENDSPACE)
+ { /* Pack trailing spaces */
+ while (end > record && *(end-1) == ' ')
+ end--;
+ }
+ else
+ { /* Pack pre-spaces */
+ while (pos < end && *pos == ' ')
+ pos++;
+ }
+ new_length=(uint) (end-pos);
+ if (new_length +1 + test(column->length > 255 && new_length > 127)
+ < length)
+ {
+ if (!(flag & bit))
+ goto err;
+ if (column->length > 255 && new_length > 127)
+ {
+ if (to[0] != (char) ((new_length & 127)+128) ||
+ to[1] != (char) (new_length >> 7))
+ goto err;
+ to+=2;
+ }
+ else if (*to++ != (char) new_length)
+ goto err;
+ to+=new_length;
+ }
+ else
+ to+=length;
+ }
+ else if (type == FIELD_VARCHAR)
+ {
+ uint pack_length= HA_VARCHAR_PACKLENGTH(column->length -1);
+ uint tmp_length;
+ if (pack_length == 1)
+ {
+ tmp_length= (uint) *(uchar*) record;
+ to+= 1+ tmp_length;
+ continue;
+ }
+ else
+ {
+ tmp_length= uint2korr(record);
+ to+= get_pack_length(tmp_length)+tmp_length;
+ }
+ continue;
+ }
+ else
+ {
+ to+=length;
+ continue; /* Normal field */
+ }
+ if ((bit= bit << 1) >= 256)
+ {
+ flag= *++packpos;
+ bit=1;
+ }
+ }
+ else
+ to+= length;
+ }
+ if (packed_length != (uint) (to - rec_buff) +
+ test(info->s->calc_checksum) || (bit != 1 && (flag & ~(bit - 1))))
+ goto err;
+ if (with_checksum && ((uchar) checksum != (uchar) *to))
+ {
+ DBUG_PRINT("error",("wrong checksum for row"));
+ goto err;
+ }
+ DBUG_RETURN(0);
+
+err:
+ DBUG_RETURN(1);
+}
+
+
+/*
+ @brief Unpacks a record
+
+ @return Recordlength
+ @retval >0 ok
+ @retval MY_FILE_ERROR (== -1) Error.
+ my_errno is set to HA_ERR_WRONG_IN_RECORD
+*/
+
+ulong _ma_rec_unpack(register MARIA_HA *info, register uchar *to, uchar *from,
+ ulong found_length)
+{
+ uint flag,bit,length,min_pack_length, column_length;
+ enum en_fieldtype type;
+ uchar *from_end,*to_end,*packpos;
+ reg3 MARIA_COLUMNDEF *column, *end_column;
+ DBUG_ENTER("_ma_rec_unpack");
+
+ to_end=to + info->s->base.reclength;
+ from_end=from+found_length;
+ flag= (uchar) *from; bit=1; packpos=from;
+ if (found_length < info->s->base.min_pack_length)
+ goto err;
+ from+= info->s->base.pack_bytes;
+ min_pack_length= info->s->base.min_pack_length - info->s->base.pack_bytes;
+
+ if ((length= info->s->base.null_bytes))
+ {
+ memcpy(to, from, length);
+ from+= length;
+ to+= length;
+ min_pack_length-= length;
+ }
+
+ for (column= info->s->columndef, end_column= column + info->s->base.fields;
+ column < end_column ; to+= column_length, column++)
+ {
+ column_length= column->length;
+ if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL &&
+ (type != FIELD_CHECK))
+ {
+ if (type == FIELD_VARCHAR)
+ {
+ uint pack_length= HA_VARCHAR_PACKLENGTH(column_length-1);
+ if (pack_length == 1)
+ {
+ length= (uint) *(uchar*) from;
+ if (length > column_length-1)
+ goto err;
+ *to= *from++;
+ }
+ else
+ {
+ get_key_length(length, from);
+ if (length > column_length-2)
+ goto err;
+ int2store(to,length);
+ }
+ if (from+length > from_end)
+ goto err;
+ memcpy(to+pack_length, from, length);
+ from+= length;
+ min_pack_length--;
+ continue;
+ }
+ if (flag & bit)
+ {
+ if (type == FIELD_BLOB || type == FIELD_SKIP_ZERO)
+ bzero((uchar*) to,column_length);
+ else if (type == FIELD_SKIP_ENDSPACE ||
+ type == FIELD_SKIP_PRESPACE)
+ {
+ if (column->length > 255 && *from & 128)
+ {
+ if (from + 1 >= from_end)
+ goto err;
+ length= (*from & 127)+ ((uint) (uchar) *(from+1) << 7); from+=2;
+ }
+ else
+ {
+ if (from == from_end)
+ goto err;
+ length= (uchar) *from++;
+ }
+ min_pack_length--;
+ if (length >= column_length ||
+ min_pack_length + length > (uint) (from_end - from))
+ goto err;
+ if (type == FIELD_SKIP_ENDSPACE)
+ {
+ memcpy(to,(uchar*) from,(size_t) length);
+ bfill((uchar*) to+length,column_length-length,' ');
+ }
+ else
+ {
+ bfill((uchar*) to,column_length-length,' ');
+ memcpy(to+column_length-length,(uchar*) from,(size_t) length);
+ }
+ from+=length;
+ }
+ }
+ else if (type == FIELD_BLOB)
+ {
+ uint size_length=column_length- portable_sizeof_char_ptr;
+ ulong blob_length= _ma_calc_blob_length(size_length,from);
+ ulong from_left= (ulong) (from_end - from);
+ if (from_left < size_length ||
+ from_left - size_length < blob_length ||
+ from_left - size_length - blob_length < min_pack_length)
+ goto err;
+ memcpy((uchar*) to,(uchar*) from,(size_t) size_length);
+ from+=size_length;
+ memcpy_fixed((uchar*) to+size_length,(uchar*) &from,sizeof(char*));
+ from+=blob_length;
+ }
+ else
+ {
+ if (type == FIELD_SKIP_ENDSPACE || type == FIELD_SKIP_PRESPACE)
+ min_pack_length--;
+ if (min_pack_length + column_length > (uint) (from_end - from))
+ goto err;
+ memcpy(to,(uchar*) from,(size_t) column_length); from+=column_length;
+ }
+ if ((bit= bit << 1) >= 256)
+ {
+ flag= (uchar) *++packpos; bit=1;
+ }
+ }
+ else
+ {
+ if (min_pack_length > (uint) (from_end - from))
+ goto err;
+ min_pack_length-=column_length;
+ memcpy(to, (uchar*) from, (size_t) column_length);
+ from+=column_length;
+ }
+ }
+ if (info->s->calc_checksum)
+ info->cur_row.checksum= (uint) (uchar) *from++;
+ if (to == to_end && from == from_end && (bit == 1 || !(flag & ~(bit-1))))
+ DBUG_RETURN(found_length);
+
+err:
+ my_errno= HA_ERR_WRONG_IN_RECORD;
+ DBUG_PRINT("error",("to_end: 0x%lx -> 0x%lx from_end: 0x%lx -> 0x%lx",
+ (long) to, (long) to_end, (long) from, (long) from_end));
+ DBUG_DUMP("from",(uchar*) info->rec_buff,info->s->base.min_pack_length);
+ DBUG_RETURN(MY_FILE_ERROR);
+} /* _ma_rec_unpack */
+
+
+ /* Calc length of blob. Update info in blobs->length */
+
+ulong _ma_calc_total_blob_length(MARIA_HA *info, const uchar *record)
+{
+ ulong length;
+ MARIA_BLOB *blob,*end;
+
+ for (length=0, blob= info->blobs, end=blob+info->s->base.blobs ;
+ blob != end;
+ blob++)
+ {
+ blob->length= _ma_calc_blob_length(blob->pack_length,record + blob->offset);
+ length+=blob->length;
+ }
+ return length;
+}
+
+
+ulong _ma_calc_blob_length(uint length, const uchar *pos)
+{
+ switch (length) {
+ case 1:
+ return (uint) (uchar) *pos;
+ case 2:
+ return (uint) uint2korr(pos);
+ case 3:
+ return uint3korr(pos);
+ case 4:
+ return uint4korr(pos);
+ default:
+ break;
+ }
+ return 0; /* Impossible */
+}
+
+
+void _ma_store_blob_length(uchar *pos,uint pack_length,uint length)
+{
+ switch (pack_length) {
+ case 1:
+ *pos= (uchar) length;
+ break;
+ case 2:
+ int2store(pos,length);
+ break;
+ case 3:
+ int3store(pos,length);
+ break;
+ case 4:
+ int4store(pos,length);
+ default:
+ break;
+ }
+ return;
+}
+
+
+/*
+ Read record from datafile.
+
+ SYNOPSIS
+ _ma_read_dynamic_record()
+ info MARIA_HA pointer to table.
+ filepos From where to read the record.
+ buf Destination for record.
+
+ NOTE
+ If a write buffer is active, it needs to be flushed if its contents
+ intersects with the record to read. We always check if the position
+ of the first uchar of the write buffer is lower than the position
+ past the last uchar to read. In theory this is also true if the write
+ buffer is completely below the read segment. That is, if there is no
+ intersection. But this case is unusual. We flush anyway. Only if the
+ first uchar in the write buffer is above the last uchar to read, we do
+ not flush.
+
+ A dynamic record may need several reads. So this check must be done
+ before every read. Reading a dynamic record starts with reading the
+ block header. If the record does not fit into the free space of the
+ header, the block may be longer than the header. In this case a
+ second read is necessary. These one or two reads repeat for every
+ part of the record.
+
+ RETURN
+ 0 OK
+ # Error number
+*/
+
+int _ma_read_dynamic_record(MARIA_HA *info, uchar *buf,
+ MARIA_RECORD_POS filepos)
+{
+ int block_of_record;
+ uint b_type;
+ MARIA_BLOCK_INFO block_info;
+ File file;
+ uchar *to;
+ uint left_length;
+ DBUG_ENTER("_ma_read_dynamic_record");
+
+ if (filepos == HA_OFFSET_ERROR)
+ goto err;
+
+ LINT_INIT(to);
+ LINT_INIT(left_length);
+ file= info->dfile.file;
+ block_of_record= 0; /* First block of record is numbered as zero. */
+ block_info.second_read= 0;
+ do
+ {
+ /* A corrupted table can have wrong pointers. (Bug# 19835) */
+ if (filepos == HA_OFFSET_ERROR)
+ goto panic;
+ if (info->opt_flag & WRITE_CACHE_USED &&
+ (info->rec_cache.pos_in_file < filepos +
+ MARIA_BLOCK_INFO_HEADER_LENGTH) &&
+ flush_io_cache(&info->rec_cache))
+ goto err;
+ info->rec_cache.seek_not_done=1;
+ if ((b_type= _ma_get_block_info(&block_info, file, filepos)) &
+ (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+ BLOCK_FATAL_ERROR))
+ {
+ if (b_type & (BLOCK_SYNC_ERROR | BLOCK_DELETED))
+ my_errno=HA_ERR_RECORD_DELETED;
+ goto err;
+ }
+ if (block_of_record++ == 0) /* First block */
+ {
+ if (block_info.rec_len > (uint) info->s->base.max_pack_length)
+ goto panic;
+ if (info->s->base.blobs)
+ {
+ if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
+ block_info.rec_len +
+ info->s->base.extra_rec_buff_size))
+ goto err;
+ }
+ to= info->rec_buff;
+ left_length=block_info.rec_len;
+ }
+ if (left_length < block_info.data_len || ! block_info.data_len)
+ goto panic; /* Wrong linked record */
+ /* copy information that is already read */
+ {
+ uint offset= (uint) (block_info.filepos - filepos);
+ uint prefetch_len= (sizeof(block_info.header) - offset);
+ filepos+= sizeof(block_info.header);
+
+ if (prefetch_len > block_info.data_len)
+ prefetch_len= block_info.data_len;
+ if (prefetch_len)
+ {
+ memcpy((uchar*) to, block_info.header + offset, prefetch_len);
+ block_info.data_len-= prefetch_len;
+ left_length-= prefetch_len;
+ to+= prefetch_len;
+ }
+ }
+ /* read rest of record from file */
+ if (block_info.data_len)
+ {
+ if (info->opt_flag & WRITE_CACHE_USED &&
+ info->rec_cache.pos_in_file < filepos + block_info.data_len &&
+ flush_io_cache(&info->rec_cache))
+ goto err;
+ /*
+ What a pity that this method is not called 'file_pread' and that
+ there is no equivalent without seeking. We are at the right
+ position already. :(
+ */
+ if (info->s->file_read(info, (uchar*) to, block_info.data_len,
+ filepos, MYF(MY_NABP)))
+ goto panic;
+ left_length-=block_info.data_len;
+ to+=block_info.data_len;
+ }
+ filepos= block_info.next_filepos;
+ } while (left_length);
+
+ info->update|= HA_STATE_AKTIV; /* We have a aktive record */
+ fast_ma_writeinfo(info);
+ DBUG_RETURN(_ma_rec_unpack(info,buf,info->rec_buff,block_info.rec_len) !=
+ MY_FILE_ERROR ? 0 : my_errno);
+
+err:
+ fast_ma_writeinfo(info);
+ DBUG_RETURN(my_errno);
+
+panic:
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+ goto err;
+}
+
+ /* compare unique constraint between stored rows */
+
+my_bool _ma_cmp_dynamic_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+ const uchar *record, MARIA_RECORD_POS pos)
+{
+ uchar *old_rec_buff,*old_record;
+ my_off_t old_rec_buff_size;
+ my_bool error;
+ DBUG_ENTER("_ma_cmp_dynamic_unique");
+
+ if (!(old_record=my_alloca(info->s->base.reclength)))
+ DBUG_RETURN(1);
+
+ /* Don't let the compare destroy blobs that may be in use */
+ old_rec_buff= info->rec_buff;
+ old_rec_buff_size= info->rec_buff_size;
+
+ if (info->s->base.blobs)
+ info->rec_buff= 0;
+ error= _ma_read_dynamic_record(info, old_record, pos) != 0;
+ if (!error)
+ error=_ma_unique_comp(def, record, old_record, def->null_are_equal) != 0;
+ if (info->s->base.blobs)
+ {
+ my_free(info->rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+ info->rec_buff= old_rec_buff;
+ info->rec_buff_size= old_rec_buff_size;
+ }
+ my_afree(old_record);
+ DBUG_RETURN(error);
+}
+
+
+ /* Compare of record on disk with packed record in memory */
+
+my_bool _ma_cmp_dynamic_record(register MARIA_HA *info,
+ register const uchar *record)
+{
+ uint flag, reclength, b_type,cmp_length;
+ my_off_t filepos;
+ uchar *buffer;
+ MARIA_BLOCK_INFO block_info;
+ my_bool error= 1;
+ DBUG_ENTER("_ma_cmp_dynamic_record");
+
+ /* We are going to do changes; dont let anybody disturb */
+ dont_break(); /* Dont allow SIGHUP or SIGINT */
+
+ if (info->opt_flag & WRITE_CACHE_USED)
+ {
+ info->update&= ~(HA_STATE_WRITE_AT_END | HA_STATE_EXTEND_BLOCK);
+ if (flush_io_cache(&info->rec_cache))
+ DBUG_RETURN(1);
+ }
+ info->rec_cache.seek_not_done=1;
+
+ /* If nobody have touched the database we don't have to test rec */
+
+ buffer=info->rec_buff;
+ if ((info->opt_flag & READ_CHECK_USED))
+ { /* If check isn't disabled */
+ if (info->s->base.blobs)
+ {
+ if (!(buffer=(uchar*) my_alloca(info->s->base.pack_reclength+
+ _ma_calc_total_blob_length(info,record))))
+ DBUG_RETURN(1);
+ }
+ reclength= _ma_rec_pack(info,buffer,record);
+ record= buffer;
+
+ filepos= info->cur_row.lastpos;
+ flag=block_info.second_read=0;
+ block_info.next_filepos=filepos;
+ while (reclength > 0)
+ {
+ if ((b_type= _ma_get_block_info(&block_info, info->dfile.file,
+ block_info.next_filepos))
+ & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+ BLOCK_FATAL_ERROR))
+ {
+ if (b_type & (BLOCK_SYNC_ERROR | BLOCK_DELETED))
+ my_errno=HA_ERR_RECORD_CHANGED;
+ goto err;
+ }
+ if (flag == 0) /* First block */
+ {
+ flag=1;
+ if (reclength != block_info.rec_len)
+ {
+ my_errno=HA_ERR_RECORD_CHANGED;
+ goto err;
+ }
+ } else if (reclength < block_info.data_len)
+ {
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+ goto err;
+ }
+ reclength-= block_info.data_len;
+ cmp_length= block_info.data_len;
+ if (!reclength && info->s->calc_checksum)
+ cmp_length--; /* 'record' may not contain checksum */
+
+ if (_ma_cmp_buffer(info->dfile.file, record, block_info.filepos,
+ cmp_length))
+ {
+ my_errno=HA_ERR_RECORD_CHANGED;
+ goto err;
+ }
+ flag=1;
+ record+=block_info.data_len;
+ }
+ }
+ my_errno=0;
+ error= 0;
+err:
+ if (buffer != info->rec_buff)
+ my_afree((uchar*) buffer);
+ DBUG_PRINT("exit", ("result: %d", error));
+ DBUG_RETURN(error);
+}
+
+
+ /* Compare file to buffert */
+
+static my_bool _ma_cmp_buffer(File file, const uchar *buff, my_off_t filepos,
+ uint length)
+{
+ uint next_length;
+ char temp_buff[IO_SIZE*2];
+ DBUG_ENTER("_ma_cmp_buffer");
+
+ next_length= IO_SIZE*2 - (uint) (filepos & (IO_SIZE-1));
+
+ while (length > IO_SIZE*2)
+ {
+ if (my_pread(file,temp_buff,next_length,filepos, MYF(MY_NABP)) ||
+ memcmp((uchar*) buff,temp_buff,next_length))
+ goto err;
+ filepos+=next_length;
+ buff+=next_length;
+ length-= next_length;
+ next_length=IO_SIZE*2;
+ }
+ if (my_pread(file,temp_buff,length,filepos,MYF(MY_NABP)))
+ goto err;
+ DBUG_RETURN(memcmp((uchar*) buff,temp_buff,length) != 0);
+err:
+ DBUG_RETURN(1);
+}
+
+
+/*
+ Read next record from datafile during table scan.
+
+ SYNOPSIS
+ _ma_read_rnd_dynamic_record()
+ info MARIA_HA pointer to table.
+ buf Destination for record.
+ filepos From where to read the record.
+ skip_deleted_blocks If to repeat reading until a non-deleted
+ record is found.
+
+ NOTE
+ This is identical to _ma_read_dynamic_record(), except the following
+ cases:
+
+ - If there is no active row at 'filepos', continue scanning for
+ an active row. (This is becasue the previous
+ _ma_read_rnd_dynamic_record() call stored the next block position
+ in filepos, but this position may not be a start block for a row
+ - We may have READ_CACHING enabled, in which case we use the cache
+ to read rows.
+
+ For other comments, check _ma_read_dynamic_record()
+
+ RETURN
+ 0 OK
+ != 0 Error number
+*/
+
+int _ma_read_rnd_dynamic_record(MARIA_HA *info,
+ uchar *buf,
+ MARIA_RECORD_POS filepos,
+ my_bool skip_deleted_blocks)
+{
+ int block_of_record, info_read;
+ uint left_len,b_type;
+ uchar *to;
+ MARIA_BLOCK_INFO block_info;
+ MARIA_SHARE *share=info->s;
+ DBUG_ENTER("_ma_read_rnd_dynamic_record");
+
+ info_read=0;
+ LINT_INIT(to);
+
+ if (info->lock_type == F_UNLCK)
+ {
+#ifndef UNSAFE_LOCKING
+#else
+ info->tmp_lock_type=F_RDLCK;
+#endif
+ }
+ else
+ info_read=1; /* memory-keyinfoblock is ok */
+
+ block_of_record= 0; /* First block of record is numbered as zero. */
+ block_info.second_read= 0;
+ left_len=1;
+ do
+ {
+ if (filepos >= info->state->data_file_length)
+ {
+ if (!info_read)
+ { /* Check if changed */
+ info_read=1;
+ info->rec_cache.seek_not_done=1;
+ if (_ma_state_info_read_dsk(share->kfile.file, &share->state))
+ goto panic;
+ }
+ if (filepos >= info->state->data_file_length)
+ {
+ my_errno= HA_ERR_END_OF_FILE;
+ goto err;
+ }
+ }
+ if (info->opt_flag & READ_CACHE_USED)
+ {
+ if (_ma_read_cache(&info->rec_cache,(uchar*) block_info.header,filepos,
+ sizeof(block_info.header),
+ (!block_of_record && skip_deleted_blocks ?
+ READING_NEXT : 0) | READING_HEADER))
+ goto panic;
+ b_type= _ma_get_block_info(&block_info,-1,filepos);
+ }
+ else
+ {
+ if (info->opt_flag & WRITE_CACHE_USED &&
+ info->rec_cache.pos_in_file < filepos + MARIA_BLOCK_INFO_HEADER_LENGTH &&
+ flush_io_cache(&info->rec_cache))
+ DBUG_RETURN(my_errno);
+ info->rec_cache.seek_not_done=1;
+ b_type= _ma_get_block_info(&block_info, info->dfile.file, filepos);
+ }
+
+ if (b_type & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+ BLOCK_FATAL_ERROR))
+ {
+ if ((b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR))
+ && skip_deleted_blocks)
+ {
+ filepos=block_info.filepos+block_info.block_len;
+ block_info.second_read=0;
+ continue; /* Search after next_record */
+ }
+ if (b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR))
+ {
+ my_errno= HA_ERR_RECORD_DELETED;
+ info->cur_row.lastpos= block_info.filepos;
+ info->cur_row.nextpos= block_info.filepos+block_info.block_len;
+ }
+ goto err;
+ }
+ if (block_of_record == 0) /* First block */
+ {
+ if (block_info.rec_len > (uint) share->base.max_pack_length)
+ goto panic;
+ info->cur_row.lastpos= filepos;
+ if (share->base.blobs)
+ {
+ if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
+ block_info.rec_len +
+ info->s->base.extra_rec_buff_size))
+ goto err;
+ }
+ to= info->rec_buff;
+ left_len=block_info.rec_len;
+ }
+ if (left_len < block_info.data_len)
+ goto panic; /* Wrong linked record */
+
+ /* copy information that is already read */
+ {
+ uint offset=(uint) (block_info.filepos - filepos);
+ uint tmp_length= (sizeof(block_info.header) - offset);
+ filepos=block_info.filepos;
+
+ if (tmp_length > block_info.data_len)
+ tmp_length= block_info.data_len;
+ if (tmp_length)
+ {
+ memcpy((uchar*) to, block_info.header+offset,tmp_length);
+ block_info.data_len-=tmp_length;
+ left_len-=tmp_length;
+ to+=tmp_length;
+ filepos+=tmp_length;
+ }
+ }
+ /* read rest of record from file */
+ if (block_info.data_len)
+ {
+ if (info->opt_flag & READ_CACHE_USED)
+ {
+ if (_ma_read_cache(&info->rec_cache,(uchar*) to,filepos,
+ block_info.data_len,
+ (!block_of_record && skip_deleted_blocks) ?
+ READING_NEXT : 0))
+ goto panic;
+ }
+ else
+ {
+ if (info->opt_flag & WRITE_CACHE_USED &&
+ info->rec_cache.pos_in_file <
+ block_info.filepos + block_info.data_len &&
+ flush_io_cache(&info->rec_cache))
+ goto err;
+ /* VOID(my_seek(info->dfile.file, filepos, MY_SEEK_SET, MYF(0))); */
+ if (my_read(info->dfile.file, (uchar*)to, block_info.data_len,
+ MYF(MY_NABP)))
+ {
+ if (my_errno == -1)
+ my_errno= HA_ERR_WRONG_IN_RECORD; /* Unexpected end of file */
+ goto err;
+ }
+ }
+ }
+ /*
+ Increment block-of-record counter. If it was the first block,
+ remember the position behind the block for the next call.
+ */
+ if (block_of_record++ == 0)
+ {
+ info->cur_row.nextpos= block_info.filepos+block_info.block_len;
+ skip_deleted_blocks=0;
+ }
+ left_len-=block_info.data_len;
+ to+=block_info.data_len;
+ filepos=block_info.next_filepos;
+ } while (left_len);
+
+ info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED;
+ fast_ma_writeinfo(info);
+ if (_ma_rec_unpack(info,buf,info->rec_buff,block_info.rec_len) !=
+ MY_FILE_ERROR)
+ DBUG_RETURN(0);
+ DBUG_RETURN(my_errno); /* Wrong record */
+
+panic:
+ my_errno=HA_ERR_WRONG_IN_RECORD; /* Something is fatal wrong */
+err:
+ fast_ma_writeinfo(info);
+ DBUG_RETURN(my_errno);
+}
+
+
+ /* Read and process header from a dynamic-record-file */
+
+uint _ma_get_block_info(MARIA_BLOCK_INFO *info, File file, my_off_t filepos)
+{
+ uint return_val=0;
+ uchar *header=info->header;
+
+ if (file >= 0)
+ {
+ /*
+ We do not use my_pread() here because we want to have the file
+ pointer set to the end of the header after this function.
+ my_pread() may leave the file pointer untouched.
+ */
+ VOID(my_seek(file,filepos,MY_SEEK_SET,MYF(0)));
+ if (my_read(file,(char*) header,sizeof(info->header),MYF(0)) !=
+ sizeof(info->header))
+ goto err;
+ }
+ DBUG_DUMP("header",(uchar*) header,MARIA_BLOCK_INFO_HEADER_LENGTH);
+ if (info->second_read)
+ {
+ if (info->header[0] <= 6 || info->header[0] == 13)
+ return_val=BLOCK_SYNC_ERROR;
+ }
+ else
+ {
+ if (info->header[0] > 6 && info->header[0] != 13)
+ return_val=BLOCK_SYNC_ERROR;
+ }
+ info->next_filepos= HA_OFFSET_ERROR; /* Dummy if no next block */
+
+ switch (info->header[0]) {
+ case 0:
+ if ((info->block_len=(uint) mi_uint3korr(header+1)) <
+ MARIA_MIN_BLOCK_LENGTH ||
+ (info->block_len & (MARIA_DYN_ALIGN_SIZE -1)))
+ goto err;
+ info->filepos=filepos;
+ info->next_filepos=mi_sizekorr(header+4);
+ info->prev_filepos=mi_sizekorr(header+12);
+#if SIZEOF_OFF_T == 4
+ if ((mi_uint4korr(header+4) != 0 &&
+ (mi_uint4korr(header+4) != (ulong) ~0 ||
+ info->next_filepos != (ulong) ~0)) ||
+ (mi_uint4korr(header+12) != 0 &&
+ (mi_uint4korr(header+12) != (ulong) ~0 ||
+ info->prev_filepos != (ulong) ~0)))
+ goto err;
+#endif
+ return return_val | BLOCK_DELETED; /* Deleted block */
+
+ case 1:
+ info->rec_len=info->data_len=info->block_len=mi_uint2korr(header+1);
+ info->filepos=filepos+3;
+ return return_val | BLOCK_FIRST | BLOCK_LAST;
+ case 2:
+ info->rec_len=info->data_len=info->block_len=mi_uint3korr(header+1);
+ info->filepos=filepos+4;
+ return return_val | BLOCK_FIRST | BLOCK_LAST;
+
+ case 13:
+ info->rec_len=mi_uint4korr(header+1);
+ info->block_len=info->data_len=mi_uint3korr(header+5);
+ info->next_filepos=mi_sizekorr(header+8);
+ info->second_read=1;
+ info->filepos=filepos+16;
+ return return_val | BLOCK_FIRST;
+
+ case 3:
+ info->rec_len=info->data_len=mi_uint2korr(header+1);
+ info->block_len=info->rec_len+ (uint) header[3];
+ info->filepos=filepos+4;
+ return return_val | BLOCK_FIRST | BLOCK_LAST;
+ case 4:
+ info->rec_len=info->data_len=mi_uint3korr(header+1);
+ info->block_len=info->rec_len+ (uint) header[4];
+ info->filepos=filepos+5;
+ return return_val | BLOCK_FIRST | BLOCK_LAST;
+
+ case 5:
+ info->rec_len=mi_uint2korr(header+1);
+ info->block_len=info->data_len=mi_uint2korr(header+3);
+ info->next_filepos=mi_sizekorr(header+5);
+ info->second_read=1;
+ info->filepos=filepos+13;
+ return return_val | BLOCK_FIRST;
+ case 6:
+ info->rec_len=mi_uint3korr(header+1);
+ info->block_len=info->data_len=mi_uint3korr(header+4);
+ info->next_filepos=mi_sizekorr(header+7);
+ info->second_read=1;
+ info->filepos=filepos+15;
+ return return_val | BLOCK_FIRST;
+
+ /* The following blocks are identical to 1-6 without rec_len */
+ case 7:
+ info->data_len=info->block_len=mi_uint2korr(header+1);
+ info->filepos=filepos+3;
+ return return_val | BLOCK_LAST;
+ case 8:
+ info->data_len=info->block_len=mi_uint3korr(header+1);
+ info->filepos=filepos+4;
+ return return_val | BLOCK_LAST;
+
+ case 9:
+ info->data_len=mi_uint2korr(header+1);
+ info->block_len=info->data_len+ (uint) header[3];
+ info->filepos=filepos+4;
+ return return_val | BLOCK_LAST;
+ case 10:
+ info->data_len=mi_uint3korr(header+1);
+ info->block_len=info->data_len+ (uint) header[4];
+ info->filepos=filepos+5;
+ return return_val | BLOCK_LAST;
+
+ case 11:
+ info->data_len=info->block_len=mi_uint2korr(header+1);
+ info->next_filepos=mi_sizekorr(header+3);
+ info->second_read=1;
+ info->filepos=filepos+11;
+ return return_val;
+ case 12:
+ info->data_len=info->block_len=mi_uint3korr(header+1);
+ info->next_filepos=mi_sizekorr(header+4);
+ info->second_read=1;
+ info->filepos=filepos+12;
+ return return_val;
+ }
+
+err:
+ my_errno=HA_ERR_WRONG_IN_RECORD; /* Garbage */
+ return BLOCK_ERROR;
+}
diff --git a/storage/maria/ma_extra.c b/storage/maria/ma_extra.c
new file mode 100644
index 00000000000..4f1634756ab
--- /dev/null
+++ b/storage/maria/ma_extra.c
@@ -0,0 +1,623 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+#include "ma_blockrec.h"
+
+static void maria_extra_keyflag(MARIA_HA *info,
+ enum ha_extra_function function);
+
+/**
+ @brief Set options and buffers to optimize table handling
+
+ @param name table's name
+ @param info open table
+ @param function operation
+ @param extra_arg Pointer to extra argument (normally pointer to
+ ulong); used when function is one of:
+ HA_EXTRA_WRITE_CACHE
+ HA_EXTRA_CACHE
+
+ @return Operation status
+ @retval 0 ok
+ @retval !=0 error
+*/
+
+int maria_extra(MARIA_HA *info, enum ha_extra_function function,
+ void *extra_arg)
+{
+ int error=0;
+ ulong cache_size;
+ MARIA_SHARE *share=info->s;
+ my_bool block_records= share->data_file_type == BLOCK_RECORD;
+
+ DBUG_ENTER("maria_extra");
+ DBUG_PRINT("enter",("function: %d",(int) function));
+
+ switch (function) {
+ case HA_EXTRA_RESET_STATE: /* Reset state (don't free buffers) */
+ info->lastinx= 0; /* Use first index as def */
+ info->last_search_keypage= info->cur_row.lastpos= HA_OFFSET_ERROR;
+ info->page_changed=1;
+ /* Next/prev gives first/last */
+ if (info->opt_flag & READ_CACHE_USED)
+ {
+ reinit_io_cache(&info->rec_cache,READ_CACHE,0,
+ (pbool) (info->lock_type != F_UNLCK),
+ (pbool) test(info->update & HA_STATE_ROW_CHANGED)
+ );
+ }
+ info->update= ((info->update & HA_STATE_CHANGED) | HA_STATE_NEXT_FOUND |
+ HA_STATE_PREV_FOUND);
+ break;
+ case HA_EXTRA_CACHE:
+ if (block_records)
+ break; /* Not supported */
+
+ if (info->lock_type == F_UNLCK &&
+ (share->options & HA_OPTION_PACK_RECORD))
+ {
+ error=1; /* Not possibly if not locked */
+ my_errno=EACCES;
+ break;
+ }
+ if (info->s->file_map) /* Don't use cache if mmap */
+ break;
+#if defined(HAVE_MMAP) && defined(HAVE_MADVISE)
+ if ((share->options & HA_OPTION_COMPRESS_RECORD))
+ {
+ pthread_mutex_lock(&share->intern_lock);
+ if (_ma_memmap_file(info))
+ {
+ /* We don't nead MADV_SEQUENTIAL if small file */
+ madvise(share->file_map,share->state.state.data_file_length,
+ share->state.state.data_file_length <= RECORD_CACHE_SIZE*16 ?
+ MADV_RANDOM : MADV_SEQUENTIAL);
+ pthread_mutex_unlock(&share->intern_lock);
+ break;
+ }
+ pthread_mutex_unlock(&share->intern_lock);
+ }
+#endif
+ if (info->opt_flag & WRITE_CACHE_USED)
+ {
+ info->opt_flag&= ~WRITE_CACHE_USED;
+ if ((error=end_io_cache(&info->rec_cache)))
+ break;
+ }
+ if (!(info->opt_flag &
+ (READ_CACHE_USED | WRITE_CACHE_USED | MEMMAP_USED)))
+ {
+ cache_size= (extra_arg ? *(ulong*) extra_arg :
+ my_default_record_cache_size);
+ if (!(init_io_cache(&info->rec_cache, info->dfile.file,
+ (uint) min(info->state->data_file_length+1,
+ cache_size),
+ READ_CACHE,0L,(pbool) (info->lock_type != F_UNLCK),
+ MYF(share->write_flag & MY_WAIT_IF_FULL))))
+ {
+ info->opt_flag|=READ_CACHE_USED;
+ info->update&= ~HA_STATE_ROW_CHANGED;
+ }
+ if (share->concurrent_insert)
+ info->rec_cache.end_of_file=info->state->data_file_length;
+ }
+ break;
+ case HA_EXTRA_REINIT_CACHE:
+ if (info->opt_flag & READ_CACHE_USED)
+ {
+ reinit_io_cache(&info->rec_cache, READ_CACHE, info->cur_row.nextpos,
+ (pbool) (info->lock_type != F_UNLCK),
+ (pbool) test(info->update & HA_STATE_ROW_CHANGED));
+ info->update&= ~HA_STATE_ROW_CHANGED;
+ if (share->concurrent_insert)
+ info->rec_cache.end_of_file=info->state->data_file_length;
+ }
+ break;
+ case HA_EXTRA_WRITE_CACHE:
+ if (info->lock_type == F_UNLCK)
+ {
+ error=1; /* Not possibly if not locked */
+ break;
+ }
+ if (block_records)
+ break; /* Not supported */
+
+ cache_size= (extra_arg ? *(ulong*) extra_arg :
+ my_default_record_cache_size);
+ if (!(info->opt_flag &
+ (READ_CACHE_USED | WRITE_CACHE_USED | OPT_NO_ROWS)) &&
+ !share->state.header.uniques)
+ if (!(init_io_cache(&info->rec_cache, info->dfile.file, cache_size,
+ WRITE_CACHE,info->state->data_file_length,
+ (pbool) (info->lock_type != F_UNLCK),
+ MYF(share->write_flag & MY_WAIT_IF_FULL))))
+ {
+ info->opt_flag|=WRITE_CACHE_USED;
+ info->update&= ~(HA_STATE_ROW_CHANGED |
+ HA_STATE_WRITE_AT_END |
+ HA_STATE_EXTEND_BLOCK);
+ }
+ break;
+ case HA_EXTRA_PREPARE_FOR_UPDATE:
+ if (info->s->data_file_type != DYNAMIC_RECORD)
+ break;
+ /* Remove read/write cache if dynamic rows */
+ case HA_EXTRA_NO_CACHE:
+ if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
+ {
+ info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+ error=end_io_cache(&info->rec_cache);
+ /* Sergei will insert full text index caching here */
+ }
+#if defined(HAVE_MMAP) && defined(HAVE_MADVISE)
+ if (info->opt_flag & MEMMAP_USED)
+ madvise(share->file_map,share->state.state.data_file_length,MADV_RANDOM);
+#endif
+ break;
+ case HA_EXTRA_FLUSH_CACHE:
+ if (info->opt_flag & WRITE_CACHE_USED)
+ {
+ if ((error=flush_io_cache(&info->rec_cache)))
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ maria_mark_crashed(info); /* Fatal error found */
+ }
+ }
+ break;
+ case HA_EXTRA_NO_READCHECK:
+ info->opt_flag&= ~READ_CHECK_USED; /* No readcheck */
+ break;
+ case HA_EXTRA_READCHECK:
+ info->opt_flag|= READ_CHECK_USED;
+ break;
+ case HA_EXTRA_KEYREAD: /* Read only keys to record */
+ case HA_EXTRA_REMEMBER_POS:
+ info->opt_flag |= REMEMBER_OLD_POS;
+ bmove((uchar*) info->lastkey+share->base.max_key_length*2,
+ (uchar*) info->lastkey,info->lastkey_length);
+ info->save_update= info->update;
+ info->save_lastinx= info->lastinx;
+ info->save_lastpos= info->cur_row.lastpos;
+ info->save_lastkey_length=info->lastkey_length;
+ if (function == HA_EXTRA_REMEMBER_POS)
+ break;
+ /* fall through */
+ case HA_EXTRA_KEYREAD_CHANGE_POS:
+ info->opt_flag |= KEY_READ_USED;
+ info->read_record= _ma_read_key_record;
+ break;
+ case HA_EXTRA_NO_KEYREAD:
+ case HA_EXTRA_RESTORE_POS:
+ if (info->opt_flag & REMEMBER_OLD_POS)
+ {
+ bmove((uchar*) info->lastkey,
+ (uchar*) info->lastkey+share->base.max_key_length*2,
+ info->save_lastkey_length);
+ info->update= info->save_update | HA_STATE_WRITTEN;
+ info->lastinx= info->save_lastinx;
+ info->cur_row.lastpos= info->save_lastpos;
+ info->lastkey_length=info->save_lastkey_length;
+ }
+ info->read_record= share->read_record;
+ info->opt_flag&= ~(KEY_READ_USED | REMEMBER_OLD_POS);
+ break;
+ case HA_EXTRA_NO_USER_CHANGE: /* Database is somehow locked agains changes */
+ info->lock_type= F_EXTRA_LCK; /* Simulate as locked */
+ break;
+ case HA_EXTRA_WAIT_LOCK:
+ info->lock_wait=0;
+ break;
+ case HA_EXTRA_NO_WAIT_LOCK:
+ info->lock_wait=MY_DONT_WAIT;
+ break;
+ case HA_EXTRA_NO_KEYS:
+ /* we're going to modify pieces of the state, stall Checkpoint */
+ pthread_mutex_lock(&share->intern_lock);
+ if (info->lock_type == F_UNLCK)
+ {
+ pthread_mutex_unlock(&share->intern_lock);
+ error=1; /* Not possibly if not lock */
+ break;
+ }
+ if (maria_is_any_key_active(share->state.key_map))
+ {
+ MARIA_KEYDEF *key=share->keyinfo;
+ uint i;
+ for (i=0 ; i < share->base.keys ; i++,key++)
+ {
+ if (!(key->flag & HA_NOSAME) && info->s->base.auto_key != i+1)
+ {
+ maria_clear_key_active(share->state.key_map, i);
+ info->update|= HA_STATE_CHANGED;
+ }
+ }
+
+ if (!share->changed)
+ {
+ share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED;
+ share->changed=1; /* Update on close */
+ if (!share->global_changed)
+ {
+ share->global_changed=1;
+ share->state.open_count++;
+ }
+ }
+ share->state.state= *info->state;
+ /*
+ That state write to disk must be done, even for transactional tables;
+ indeed the table's share is going to be lost (there was a
+ HA_EXTRA_FORCE_REOPEN before, which set share->last_version to
+ 0), and so the only way it leaves information (share->state.key_map)
+ for the posterity is by writing it to disk.
+ */
+ DBUG_ASSERT(!maria_in_recovery);
+ error= _ma_state_info_write(share, 1|2);
+ }
+ pthread_mutex_unlock(&share->intern_lock);
+ break;
+ case HA_EXTRA_FORCE_REOPEN:
+ /*
+ Normally MySQL uses this case when it is going to close all open
+ instances of the table, thus going to flush all data/index/state.
+ We however do a flush here for additional safety.
+ */
+ /** @todo consider porting these flush-es to MyISAM */
+ error= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+ FLUSH_FORCE_WRITE, FLUSH_FORCE_WRITE) ||
+ _ma_state_info_write(share, 1|2|4);
+#ifdef ASK_MONTY
+ || (share->changed= 0);
+#endif
+ /**
+ @todo RECOVERY BUG
+ Though we flushed the state, IF some other thread may have the same
+ table (same MARIA_SHARE) open at this time then it may have a
+ more recent state to flush when it closes, thus we don't set
+ share->changed to 0 here. On the other hand, this means that when our
+ thread closes its table, it will flush the state again, then it would
+ overwrite any state written by yet another thread which may have opened
+ the table (new MARIA_SHARE) and done some updates.
+ ASK_MONTY about the IF above. See also same tag in
+ HA_EXTRA_PREPARE_FOR_DROP|RENAME.
+ */
+ pthread_mutex_lock(&THR_LOCK_maria);
+ pthread_mutex_lock(&share->intern_lock); /* protect against Checkpoint */
+ /* this makes the share not be re-used next time the table is opened */
+ share->last_version= 0L; /* Impossible version */
+ pthread_mutex_unlock(&share->intern_lock);
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ break;
+ case HA_EXTRA_PREPARE_FOR_DROP:
+ case HA_EXTRA_PREPARE_FOR_RENAME:
+ {
+ my_bool do_flush= test(function != HA_EXTRA_PREPARE_FOR_DROP);
+ pthread_mutex_lock(&THR_LOCK_maria);
+ /*
+ This share, to have last_version=0, needs to save all its data/index
+ blocks to disk if this is not for a DROP TABLE. Otherwise they would be
+ invisible to future openers; and they could even go to disk late and
+ cancel the work of future openers.
+ On Windows, which cannot delete an open file (cannot drop an open table)
+ we have to close the table's files.
+ */
+ if (info->lock_type != F_UNLCK && !info->was_locked)
+ {
+ info->was_locked= info->lock_type;
+ if (maria_lock_database(info, F_UNLCK))
+ error= my_errno;
+ info->lock_type= F_UNLCK;
+ }
+ if (share->kfile.file >= 0)
+ _ma_decrement_open_count(info);
+ pthread_mutex_lock(&share->intern_lock);
+ enum flush_type type= do_flush ? FLUSH_RELEASE : FLUSH_IGNORE_CHANGED;
+ if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+ type, type))
+ {
+ error=my_errno;
+ share->changed=1;
+ }
+ if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
+ {
+ info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+ if (end_io_cache(&info->rec_cache))
+ error= 1;
+ }
+ if (share->kfile.file >= 0)
+ {
+ if (do_flush)
+ {
+ /*
+ Save the state so that others can find it from disk.
+ We have to sync now, as on Windows we are going to close the file
+ (so cannot sync later).
+ */
+ if (_ma_state_info_write(share, 1 | 2) ||
+ my_sync(share->kfile.file, MYF(0)))
+ error= my_errno;
+#ifdef ASK_MONTY /* see same tag in HA_EXTRA_FORCE_REOPEN */
+ else
+ share->changed= 0;
+#endif
+ }
+ else
+ {
+ /* be sure that state is not tried for write as file may be closed */
+ share->changed= 0;
+ }
+#ifdef __WIN__
+ if (my_close(share->kfile, MYF(0)))
+ error=my_errno;
+ share->kfile.file= -1;
+#endif
+ }
+ if (share->data_file_type == BLOCK_RECORD &&
+ share->bitmap.file.file >= 0)
+ {
+ if (do_flush && my_sync(share->bitmap.file.file, MYF(0)))
+ error= my_errno;
+#ifdef __WIN__
+ if (my_close(share->bitmap.file.file, MYF(0)))
+ error= my_errno;
+ share->bitmap.file.file= -1;
+#endif
+ }
+#ifdef __WIN__
+ {
+ LIST *list_element ;
+ for (list_element=maria_open_list ;
+ list_element ;
+ list_element=list_element->next)
+ {
+ MARIA_HA *tmpinfo=(MARIA_HA*) list_element->data;
+ if (tmpinfo->s == info->s)
+ {
+ if (share->data_file_type != BLOCK_RECORD &&
+ tmpinfo->dfile.file >= 0 &&
+ my_close(tmpinfo->dfile.file, MYF(0)))
+ error = my_errno;
+ tmpinfo->dfile.file= -1;
+ }
+ }
+ }
+#endif
+ /* For protection against Checkpoint, we set under intern_lock: */
+ share->last_version= 0L; /* Impossible version */
+ pthread_mutex_unlock(&share->intern_lock);
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ break;
+ }
+ case HA_EXTRA_FLUSH:
+ if (!share->temporary)
+ error= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+ FLUSH_KEEP, FLUSH_KEEP);
+#ifdef HAVE_PWRITE
+ _ma_decrement_open_count(info);
+#endif
+ if (share->not_flushed)
+ {
+ share->not_flushed=0;
+ if (_ma_sync_table_files(info))
+ error= my_errno;
+ if (error)
+ {
+ share->changed=1;
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ maria_mark_crashed(info); /* Fatal error found */
+ }
+ }
+ if (share->base.blobs && info->rec_buff_size >
+ share->base.default_rec_buff_size)
+ {
+ info->rec_buff_size= 1; /* Force realloc */
+ _ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
+ share->base.default_rec_buff_size);
+ }
+ break;
+ case HA_EXTRA_NORMAL: /* Theese isn't in use */
+ info->quick_mode=0;
+ break;
+ case HA_EXTRA_QUICK:
+ info->quick_mode=1;
+ break;
+ case HA_EXTRA_NO_ROWS:
+ if (!share->state.header.uniques)
+ info->opt_flag|= OPT_NO_ROWS;
+ break;
+ case HA_EXTRA_PRELOAD_BUFFER_SIZE:
+ info->preload_buff_size= *((ulong *) extra_arg);
+ break;
+ case HA_EXTRA_CHANGE_KEY_TO_UNIQUE:
+ case HA_EXTRA_CHANGE_KEY_TO_DUP:
+ maria_extra_keyflag(info, function);
+ break;
+ case HA_EXTRA_MMAP:
+#ifdef HAVE_MMAP
+ if (block_records)
+ break; /* Not supported */
+ pthread_mutex_lock(&share->intern_lock);
+ /*
+ Memory map the data file if it is not already mapped. It is safe
+ to memory map a file while other threads are using file I/O on it.
+ Assigning a new address to a function pointer is an atomic
+ operation. intern_lock prevents that two or more mappings are done
+ at the same time.
+ */
+ if (!share->file_map)
+ {
+ if (_ma_dynmap_file(info, share->state.state.data_file_length))
+ {
+ DBUG_PRINT("warning",("mmap failed: errno: %d",errno));
+ error= my_errno= errno;
+ }
+ else
+ {
+ share->file_read= _ma_mmap_pread;
+ share->file_write= _ma_mmap_pwrite;
+ }
+ }
+ pthread_mutex_unlock(&share->intern_lock);
+#endif
+ break;
+ case HA_EXTRA_MARK_AS_LOG_TABLE:
+ pthread_mutex_lock(&share->intern_lock);
+ share->is_log_table= TRUE;
+ pthread_mutex_unlock(&share->intern_lock);
+ break;
+ case HA_EXTRA_KEY_CACHE:
+ case HA_EXTRA_NO_KEY_CACHE:
+ default:
+ break;
+ }
+ {
+ char tmp[1];
+ tmp[0]=function;
+ }
+ DBUG_RETURN(error);
+} /* maria_extra */
+
+
+/*
+ Start/Stop Inserting Duplicates Into a Table, WL#1648.
+*/
+
+static void maria_extra_keyflag(MARIA_HA *info,
+ enum ha_extra_function function)
+{
+ uint idx;
+
+ for (idx= 0; idx< info->s->base.keys; idx++)
+ {
+ switch (function) {
+ case HA_EXTRA_CHANGE_KEY_TO_UNIQUE:
+ info->s->keyinfo[idx].flag|= HA_NOSAME;
+ break;
+ case HA_EXTRA_CHANGE_KEY_TO_DUP:
+ info->s->keyinfo[idx].flag&= ~(HA_NOSAME);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+
+int maria_reset(MARIA_HA *info)
+{
+ int error= 0;
+ MARIA_SHARE *share=info->s;
+ DBUG_ENTER("maria_reset");
+ /*
+ Free buffers and reset the following flags:
+ EXTRA_CACHE, EXTRA_WRITE_CACHE, EXTRA_KEYREAD, EXTRA_QUICK
+
+ If the row buffer cache is large (for dynamic tables), reduce it
+ to save memory.
+ */
+ if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
+ {
+ info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+ error= end_io_cache(&info->rec_cache);
+ }
+ if (share->base.blobs && info->rec_buff_size >
+ share->base.default_rec_buff_size)
+ {
+ info->rec_buff_size= 1; /* Force realloc */
+ _ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
+ share->base.default_rec_buff_size);
+ }
+#if defined(HAVE_MMAP) && defined(HAVE_MADVISE)
+ if (info->opt_flag & MEMMAP_USED)
+ madvise(share->file_map,share->state.state.data_file_length,MADV_RANDOM);
+#endif
+ info->opt_flag&= ~(KEY_READ_USED | REMEMBER_OLD_POS);
+ info->quick_mode=0;
+ info->lastinx= 0; /* Use first index as def */
+ info->last_search_keypage= info->cur_row.lastpos= HA_OFFSET_ERROR;
+ info->page_changed= 1;
+ info->update= ((info->update & HA_STATE_CHANGED) | HA_STATE_NEXT_FOUND |
+ HA_STATE_PREV_FOUND);
+ DBUG_RETURN(error);
+}
+
+
+int _ma_sync_table_files(const MARIA_HA *info)
+{
+ return (my_sync(info->dfile.file, MYF(MY_WME)) ||
+ my_sync(info->s->kfile.file, MYF(MY_WME)));
+}
+
+
+/**
+ @brief flushes the data and/or index file of a table
+
+ This is useful when one wants to read a table using OS syscalls (like
+ my_copy()) and first wants to be sure that MySQL-level caches go down to
+ the OS so that OS syscalls can see all data. It can flush rec_cache,
+ bitmap, pagecache of data file, pagecache of index file.
+
+ @param info table
+ @param flush_data_or_index one or two of these flags:
+ MARIA_FLUSH_DATA, MARIA_FLUSH_INDEX
+ @param flush_type_for_data
+ @param flush_type_for_index
+
+ @note does not sync files (@see _ma_sync_table_files()).
+ @note Progressively this function will be used in all places where we flush
+ the index but not the data file (probable bugs).
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index,
+ enum flush_type flush_type_for_data,
+ enum flush_type flush_type_for_index)
+{
+ MARIA_SHARE *share= info->s;
+ /* flush data file first because it's more critical */
+ if (flush_data_or_index & MARIA_FLUSH_DATA)
+ {
+ if (info->opt_flag & WRITE_CACHE_USED)
+ {
+ /* normally any code which creates a WRITE_CACHE destroys it later */
+ DBUG_ASSERT(0);
+ if (end_io_cache(&info->rec_cache))
+ goto err;
+ info->opt_flag&= ~WRITE_CACHE_USED;
+ }
+ if (share->data_file_type == BLOCK_RECORD)
+ {
+ if(_ma_flush_bitmap(share) ||
+ flush_pagecache_blocks(share->pagecache, &info->dfile,
+ flush_type_for_data))
+ goto err;
+ }
+ }
+ if ((flush_data_or_index & MARIA_FLUSH_INDEX) &&
+ flush_pagecache_blocks(share->pagecache, &share->kfile,
+ flush_type_for_index))
+ goto err;
+ return 0;
+err:
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ maria_mark_crashed(info);
+ return 1;
+}
diff --git a/storage/maria/ma_ft_boolean_search.c b/storage/maria/ma_ft_boolean_search.c
new file mode 100644
index 00000000000..e09a076ceaa
--- /dev/null
+++ b/storage/maria/ma_ft_boolean_search.c
@@ -0,0 +1,975 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/* TODO: add caching - pre-read several index entries at once */
+
+/*
+ Added optimization for full-text queries with plus-words. It was
+ implemented by sharing maximal document id (max_docid) variable
+ inside plus subtree. max_docid could be used by any word in plus
+ subtree, but it could be updated by plus-word only.
+
+ The idea is: there is no need to search for docid smaller than
+ biggest docid inside current plus subtree.
+
+ Examples:
+ +word1 word2
+ share same max_docid
+ max_docid updated by word1
+ +word1 +(word2 word3)
+ share same max_docid
+ max_docid updated by word1
+ +(word1 -word2) +(+word3 word4)
+ share same max_docid
+ max_docid updated by word3
+*/
+
+#define FT_CORE
+#include "ma_ftdefs.h"
+
+/* search with boolean queries */
+
+static double _wghts[11]=
+{
+ 0.131687242798354,
+ 0.197530864197531,
+ 0.296296296296296,
+ 0.444444444444444,
+ 0.666666666666667,
+ 1.000000000000000,
+ 1.500000000000000,
+ 2.250000000000000,
+ 3.375000000000000,
+ 5.062500000000000,
+ 7.593750000000000};
+static double *wghts=_wghts+5; /* wghts[i] = 1.5**i */
+
+static double _nwghts[11]=
+{
+ -0.065843621399177,
+ -0.098765432098766,
+ -0.148148148148148,
+ -0.222222222222222,
+ -0.333333333333334,
+ -0.500000000000000,
+ -0.750000000000000,
+ -1.125000000000000,
+ -1.687500000000000,
+ -2.531250000000000,
+ -3.796875000000000};
+static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */
+
+#define FTB_FLAG_TRUNC 1
+/* At most one of the following flags can be set */
+#define FTB_FLAG_YES 2
+#define FTB_FLAG_NO 4
+#define FTB_FLAG_WONLY 8
+
+typedef struct st_ftb_expr FTB_EXPR;
+struct st_ftb_expr
+{
+ FTB_EXPR *up;
+ uint flags;
+/* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */
+ my_off_t docid[2];
+ my_off_t max_docid;
+ float weight;
+ float cur_weight;
+ LIST *phrase; /* phrase words */
+ LIST *document; /* for phrase search */
+ uint yesses; /* number of "yes" words matched */
+ uint nos; /* number of "no" words matched */
+ uint ythresh; /* number of "yes" words in expr */
+ uint yweaks; /* number of "yes" words for scan only */
+};
+
+typedef struct st_ftb_word
+{
+ FTB_EXPR *up;
+ uint flags;
+/* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */
+ my_off_t docid[2]; /* for index search and for scan */
+ my_off_t key_root;
+ my_off_t *max_docid;
+ MARIA_KEYDEF *keyinfo;
+ struct st_ftb_word *prev;
+ float weight;
+ uint ndepth;
+ uint len;
+ uchar off;
+ uchar word[1];
+} FTB_WORD;
+
+typedef struct st_ft_info
+{
+ struct _ft_vft *please;
+ MARIA_HA *info;
+ CHARSET_INFO *charset;
+ FTB_EXPR *root;
+ FTB_WORD **list;
+ FTB_WORD *last_word;
+ MEM_ROOT mem_root;
+ QUEUE queue;
+ TREE no_dupes;
+ my_off_t lastpos;
+ uint keynr;
+ uchar with_scan;
+ enum { UNINITIALIZED, READY, INDEX_SEARCH, INDEX_DONE } state;
+} FTB;
+
+static int FTB_WORD_cmp(my_off_t *v, FTB_WORD *a, FTB_WORD *b)
+{
+ int i;
+
+ /* if a==curdoc, take it as a < b */
+ if (v && a->docid[0] == *v)
+ return -1;
+
+ /* ORDER BY docid, ndepth DESC */
+ i=CMP_NUM(a->docid[0], b->docid[0]);
+ if (!i)
+ i=CMP_NUM(b->ndepth,a->ndepth);
+ return i;
+}
+
+static int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b)
+{
+ /* ORDER BY word DESC, ndepth DESC */
+ int i= ha_compare_text(cs, (uchar*) (*b)->word+1,(*b)->len-1,
+ (uchar*) (*a)->word+1,(*a)->len-1,0,0);
+ if (!i)
+ i=CMP_NUM((*b)->ndepth,(*a)->ndepth);
+ return i;
+}
+
+
+typedef struct st_my_ftb_param
+{
+ FTB *ftb;
+ FTB_EXPR *ftbe;
+ uchar *up_quot;
+ uint depth;
+} MY_FTB_PARAM;
+
+
+static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param,
+ char *word, int word_len,
+ MYSQL_FTPARSER_BOOLEAN_INFO *info)
+{
+ MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
+ FTB_WORD *ftbw;
+ FTB_EXPR *ftbe, *tmp_expr;
+ FT_WORD *phrase_word;
+ LIST *tmp_element;
+ int r= info->weight_adjust;
+ float weight= (float)
+ (info->wasign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)];
+
+ switch (info->type) {
+ case FT_TOKEN_WORD:
+ ftbw= (FTB_WORD *)alloc_root(&ftb_param->ftb->mem_root,
+ sizeof(FTB_WORD) +
+ (info->trunc ? HA_MAX_KEY_BUFF :
+ word_len * ftb_param->ftb->charset->mbmaxlen +
+ HA_FT_WLEN +
+ ftb_param->ftb->info->s->rec_reflength));
+ ftbw->len= word_len + 1;
+ ftbw->flags= 0;
+ ftbw->off= 0;
+ if (info->yesno > 0) ftbw->flags|= FTB_FLAG_YES;
+ if (info->yesno < 0) ftbw->flags|= FTB_FLAG_NO;
+ if (info->trunc) ftbw->flags|= FTB_FLAG_TRUNC;
+ ftbw->weight= weight;
+ ftbw->up= ftb_param->ftbe;
+ ftbw->docid[0]= ftbw->docid[1]= HA_OFFSET_ERROR;
+ ftbw->ndepth= (info->yesno < 0) + ftb_param->depth;
+ ftbw->key_root= HA_OFFSET_ERROR;
+ memcpy(ftbw->word + 1, word, word_len);
+ ftbw->word[0]= word_len;
+ if (info->yesno > 0) ftbw->up->ythresh++;
+ ftb_param->ftb->queue.max_elements++;
+ ftbw->prev= ftb_param->ftb->last_word;
+ ftb_param->ftb->last_word= ftbw;
+ ftb_param->ftb->with_scan|= (info->trunc & FTB_FLAG_TRUNC);
+ for (tmp_expr= ftb_param->ftbe; tmp_expr->up; tmp_expr= tmp_expr->up)
+ if (! (tmp_expr->flags & FTB_FLAG_YES))
+ break;
+ ftbw->max_docid= &tmp_expr->max_docid;
+ /* fall through */
+ case FT_TOKEN_STOPWORD:
+ if (! ftb_param->up_quot) break;
+ phrase_word= (FT_WORD *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD));
+ tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST));
+ phrase_word->pos= word;
+ phrase_word->len= word_len;
+ tmp_element->data= (void *)phrase_word;
+ ftb_param->ftbe->phrase= list_add(ftb_param->ftbe->phrase, tmp_element);
+ /* Allocate document list at this point.
+ It allows to avoid huge amount of allocs/frees for each row.*/
+ tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST));
+ tmp_element->data= alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD));
+ ftb_param->ftbe->document=
+ list_add(ftb_param->ftbe->document, tmp_element);
+ break;
+ case FT_TOKEN_LEFT_PAREN:
+ ftbe=(FTB_EXPR *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FTB_EXPR));
+ ftbe->flags= 0;
+ if (info->yesno > 0) ftbe->flags|= FTB_FLAG_YES;
+ if (info->yesno < 0) ftbe->flags|= FTB_FLAG_NO;
+ ftbe->weight= weight;
+ ftbe->up= ftb_param->ftbe;
+ ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0;
+ ftbe->docid[0]= ftbe->docid[1]= HA_OFFSET_ERROR;
+ ftbe->phrase= NULL;
+ ftbe->document= 0;
+ if (info->quot) ftb_param->ftb->with_scan|= 2;
+ if (info->yesno > 0) ftbe->up->ythresh++;
+ ftb_param->ftbe= ftbe;
+ ftb_param->depth++;
+ ftb_param->up_quot= info->quot;
+ break;
+ case FT_TOKEN_RIGHT_PAREN:
+ if (ftb_param->ftbe->document)
+ {
+ /* Circuit document list */
+ for (tmp_element= ftb_param->ftbe->document;
+ tmp_element->next; tmp_element= tmp_element->next) /* no-op */;
+ tmp_element->next= ftb_param->ftbe->document;
+ ftb_param->ftbe->document->prev= tmp_element;
+ }
+ info->quot= 0;
+ if (ftb_param->ftbe->up)
+ {
+ DBUG_ASSERT(ftb_param->depth);
+ ftb_param->ftbe= ftb_param->ftbe->up;
+ ftb_param->depth--;
+ ftb_param->up_quot= 0;
+ }
+ break;
+ case FT_TOKEN_EOF:
+ default:
+ break;
+ }
+ return(0);
+}
+
+
+static int ftb_parse_query_internal(MYSQL_FTPARSER_PARAM *param,
+ char *query, int len)
+{
+ MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
+ MYSQL_FTPARSER_BOOLEAN_INFO info;
+ CHARSET_INFO *cs= ftb_param->ftb->charset;
+ uchar **start= (uchar**) &query;
+ char *end= query + len;
+ FT_WORD w;
+
+ info.prev= ' ';
+ info.quot= 0;
+ while (maria_ft_get_word(cs, start, end, &w, &info))
+ param->mysql_add_word(param, w.pos, w.len, &info);
+ return(0);
+}
+
+
+static int _ftb_parse_query(FTB *ftb, uchar *query, uint len,
+ struct st_mysql_ftparser *parser)
+{
+ MYSQL_FTPARSER_PARAM *param;
+ MY_FTB_PARAM ftb_param;
+ DBUG_ENTER("_ftb_parse_query");
+ DBUG_ASSERT(parser);
+
+ if (ftb->state != UNINITIALIZED)
+ DBUG_RETURN(0);
+ if (! (param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 0)))
+ DBUG_RETURN(1);
+
+ ftb_param.ftb= ftb;
+ ftb_param.depth= 0;
+ ftb_param.ftbe= ftb->root;
+ ftb_param.up_quot= 0;
+
+ param->mysql_parse= ftb_parse_query_internal;
+ param->mysql_add_word= ftb_query_add_word;
+ param->mysql_ftparam= (void *)&ftb_param;
+ param->cs= ftb->charset;
+ param->doc= query;
+ param->length= len;
+ param->flags= 0;
+ param->mode= MYSQL_FTPARSER_FULL_BOOLEAN_INFO;
+ DBUG_RETURN(parser->parse(param));
+}
+
+
+static int _ftb_no_dupes_cmp(void* not_used __attribute__((unused)),
+ const void *a,const void *b)
+{
+ return CMP_NUM((*((my_off_t*)a)), (*((my_off_t*)b)));
+}
+
+/* returns 1 if the search was finished (must-word wasn't found) */
+static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search)
+{
+ int r;
+ int subkeys=1;
+ my_bool can_go_down;
+ MARIA_HA *info=ftb->info;
+ uint off= 0, extra=HA_FT_WLEN+info->s->base.rec_reflength;
+ uchar *lastkey_buf= ftbw->word+ftbw->off;
+
+ if (ftbw->flags & FTB_FLAG_TRUNC)
+ lastkey_buf+=ftbw->len;
+
+ if (init_search)
+ {
+ ftbw->key_root=info->s->state.key_root[ftb->keynr];
+ ftbw->keyinfo=info->s->keyinfo+ftb->keynr;
+
+ r= _ma_search(info, ftbw->keyinfo, ftbw->word, ftbw->len,
+ SEARCH_FIND | SEARCH_BIGGER, ftbw->key_root);
+ }
+ else
+ {
+ uint sflag= SEARCH_BIGGER;
+ if (ftbw->docid[0] < *ftbw->max_docid)
+ {
+ sflag|= SEARCH_SAME;
+ _ma_dpointer(info, (ftbw->word + ftbw->len + HA_FT_WLEN),
+ *ftbw->max_docid);
+ }
+ r= _ma_search(info, ftbw->keyinfo, lastkey_buf,
+ USE_WHOLE_KEY, sflag, ftbw->key_root);
+ }
+
+ can_go_down=(!ftbw->off && (init_search || (ftbw->flags & FTB_FLAG_TRUNC)));
+ /* Skip rows inserted by concurrent insert */
+ while (!r)
+ {
+ if (can_go_down)
+ {
+ /* going down ? */
+ off=info->lastkey_length-extra;
+ subkeys=ft_sintXkorr(info->lastkey+off);
+ }
+ if (subkeys<0 || info->cur_row.lastpos < info->state->data_file_length)
+ break;
+ r= _ma_search_next(info, ftbw->keyinfo, info->lastkey,
+ info->lastkey_length,
+ SEARCH_BIGGER, ftbw->key_root);
+ }
+
+ if (!r && !ftbw->off)
+ {
+ r= ha_compare_text(ftb->charset,
+ (uchar*) info->lastkey+1,
+ info->lastkey_length-extra-1,
+ (uchar*) ftbw->word+1,
+ ftbw->len-1,
+ (my_bool) (ftbw->flags & FTB_FLAG_TRUNC), 0);
+ }
+
+ if (r) /* not found */
+ {
+ if (!ftbw->off || !(ftbw->flags & FTB_FLAG_TRUNC))
+ {
+ ftbw->docid[0]=HA_OFFSET_ERROR;
+ if ((ftbw->flags & FTB_FLAG_YES) && ftbw->up->up==0)
+ {
+ /*
+ This word MUST BE present in every document returned,
+ so we can stop the search right now
+ */
+ ftb->state=INDEX_DONE;
+ return 1; /* search is done */
+ }
+ else
+ return 0;
+ }
+
+ /* going up to the first-level tree to continue search there */
+ _ma_dpointer(info, (lastkey_buf+HA_FT_WLEN), ftbw->key_root);
+ ftbw->key_root=info->s->state.key_root[ftb->keynr];
+ ftbw->keyinfo=info->s->keyinfo+ftb->keynr;
+ ftbw->off=0;
+ return _ft2_search(ftb, ftbw, 0);
+ }
+
+ /* matching key found */
+ memcpy(lastkey_buf, info->lastkey, info->lastkey_length);
+ if (lastkey_buf == ftbw->word)
+ ftbw->len=info->lastkey_length-extra;
+
+ /* going down ? */
+ if (subkeys<0)
+ {
+ /*
+ yep, going down, to the second-level tree
+ TODO here: subkey-based optimization
+ */
+ ftbw->off=off;
+ ftbw->key_root= info->cur_row.lastpos;
+ ftbw->keyinfo=& info->s->ft2_keyinfo;
+ r= _ma_search_first(info, ftbw->keyinfo, ftbw->key_root);
+ DBUG_ASSERT(r==0); /* found something */
+ memcpy(lastkey_buf+off, info->lastkey, info->lastkey_length);
+ }
+ ftbw->docid[0]= info->cur_row.lastpos;
+ if (ftbw->flags & FTB_FLAG_YES)
+ *ftbw->max_docid= info->cur_row.lastpos;
+ return 0;
+}
+
+static void _ftb_init_index_search(FT_INFO *ftb)
+{
+ int i;
+ FTB_WORD *ftbw;
+
+ if ((ftb->state != READY && ftb->state !=INDEX_DONE) ||
+ ftb->keynr == NO_SUCH_KEY)
+ return;
+ ftb->state=INDEX_SEARCH;
+
+ for (i=ftb->queue.elements; i; i--)
+ {
+ ftbw=(FTB_WORD *)(ftb->queue.root[i]);
+
+ if (ftbw->flags & FTB_FLAG_TRUNC)
+ {
+ /*
+ special treatment for truncation operator
+ 1. there are some (besides this) +words
+ | no need to search in the index, it can never ADD new rows
+ | to the result, and to remove half-matched rows we do scan anyway
+ 2. -trunc*
+ | same as 1.
+ 3. in 1 and 2, +/- need not be on the same expr. level,
+ but can be on any upper level, as in +word +(trunc1* trunc2*)
+ 4. otherwise
+ | We have to index-search for this prefix.
+ | It may cause duplicates, as in the index (sorted by <word,docid>)
+ | <aaaa,row1>
+ | <aabb,row2>
+ | <aacc,row1>
+ | Searching for "aa*" will find row1 twice...
+ */
+ FTB_EXPR *ftbe;
+ for (ftbe=(FTB_EXPR*)ftbw;
+ ftbe->up && !(ftbe->up->flags & FTB_FLAG_TRUNC);
+ ftbe->up->flags|= FTB_FLAG_TRUNC, ftbe=ftbe->up)
+ {
+ if (ftbe->flags & FTB_FLAG_NO || /* 2 */
+ ftbe->up->ythresh - ftbe->up->yweaks >1) /* 1 */
+ {
+ FTB_EXPR *top_ftbe=ftbe->up;
+ ftbw->docid[0]=HA_OFFSET_ERROR;
+ for (ftbe=(FTB_EXPR *)ftbw;
+ ftbe != top_ftbe && !(ftbe->flags & FTB_FLAG_NO);
+ ftbe=ftbe->up)
+ ftbe->up->yweaks++;
+ ftbe=0;
+ break;
+ }
+ }
+ if (!ftbe)
+ continue;
+ /* 4 */
+ if (!is_tree_inited(& ftb->no_dupes))
+ init_tree(& ftb->no_dupes,0,0,sizeof(my_off_t),
+ _ftb_no_dupes_cmp,0,0,0);
+ else
+ reset_tree(& ftb->no_dupes);
+ }
+
+ ftbw->off=0; /* in case of reinit */
+ if (_ft2_search(ftb, ftbw, 1))
+ return;
+ }
+ queue_fix(& ftb->queue);
+}
+
+
+FT_INFO * maria_ft_init_boolean_search(MARIA_HA *info, uint keynr, uchar *query,
+ uint query_len, CHARSET_INFO *cs)
+{
+ FTB *ftb;
+ FTB_EXPR *ftbe;
+ FTB_WORD *ftbw;
+
+ if (!(ftb=(FTB *)my_malloc(sizeof(FTB), MYF(MY_WME))))
+ return 0;
+ ftb->please= (struct _ft_vft *) & _ma_ft_vft_boolean;
+ ftb->state=UNINITIALIZED;
+ ftb->info=info;
+ ftb->keynr=keynr;
+ ftb->charset=cs;
+ DBUG_ASSERT(keynr==NO_SUCH_KEY || cs == info->s->keyinfo[keynr].seg->charset);
+ ftb->with_scan=0;
+ ftb->lastpos=HA_OFFSET_ERROR;
+ bzero(& ftb->no_dupes, sizeof(TREE));
+ ftb->last_word= 0;
+
+ init_alloc_root(&ftb->mem_root, 1024, 1024);
+ ftb->queue.max_elements= 0;
+ if (!(ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR))))
+ goto err;
+ ftbe->weight=1;
+ ftbe->flags=FTB_FLAG_YES;
+ ftbe->nos=1;
+ ftbe->up=0;
+ ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0;
+ ftbe->docid[0]=ftbe->docid[1]=HA_OFFSET_ERROR;
+ ftbe->phrase= NULL;
+ ftbe->document= 0;
+ ftb->root=ftbe;
+ if (unlikely(_ftb_parse_query(ftb, query, query_len,
+ keynr == NO_SUCH_KEY ? &ft_default_parser :
+ info->s->keyinfo[keynr].parser)))
+ goto err;
+ /*
+ Hack: instead of init_queue, we'll use reinit queue to be able
+ to alloc queue with alloc_root()
+ */
+ if (! (ftb->queue.root= (uchar **)alloc_root(&ftb->mem_root,
+ (ftb->queue.max_elements + 1) *
+ sizeof(void *))))
+ goto err;
+ reinit_queue(&ftb->queue, ftb->queue.max_elements, 0, 0,
+ (int (*)(void*, uchar*, uchar*))FTB_WORD_cmp, 0);
+ for (ftbw= ftb->last_word; ftbw; ftbw= ftbw->prev)
+ queue_insert(&ftb->queue, (uchar *)ftbw);
+ ftb->list=(FTB_WORD **)alloc_root(&ftb->mem_root,
+ sizeof(FTB_WORD *)*ftb->queue.elements);
+ memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements);
+ qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *),
+ (qsort2_cmp)FTB_WORD_cmp_list, ftb->charset);
+ if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC;
+ ftb->state=READY;
+ return ftb;
+err:
+ free_root(& ftb->mem_root, MYF(0));
+ my_free((uchar*)ftb,MYF(0));
+ return 0;
+}
+
+
+typedef struct st_my_ftb_phrase_param
+{
+ LIST *phrase;
+ LIST *document;
+ CHARSET_INFO *cs;
+ uint phrase_length;
+ uint document_length;
+ uint match;
+} MY_FTB_PHRASE_PARAM;
+
+
+static int ftb_phrase_add_word(MYSQL_FTPARSER_PARAM *param,
+ char *word, int word_len,
+ MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused)))
+{
+ MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam;
+ FT_WORD *w= (FT_WORD *)phrase_param->document->data;
+ LIST *phrase, *document;
+ w->pos= word;
+ w->len= word_len;
+ phrase_param->document= phrase_param->document->prev;
+ if (phrase_param->phrase_length > phrase_param->document_length)
+ {
+ phrase_param->document_length++;
+ return 0;
+ }
+ /* TODO: rewrite phrase search to avoid
+ comparing the same word twice. */
+ for (phrase= phrase_param->phrase, document= phrase_param->document->next;
+ phrase; phrase= phrase->next, document= document->next)
+ {
+ FT_WORD *phrase_word= (FT_WORD *)phrase->data;
+ FT_WORD *document_word= (FT_WORD *)document->data;
+ if (my_strnncoll(phrase_param->cs,
+ (uchar*) phrase_word->pos, phrase_word->len,
+ (uchar*) document_word->pos, document_word->len))
+ return 0;
+ }
+ phrase_param->match++;
+ return 0;
+}
+
+
+static int ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM *param,
+ char *document, int len)
+{
+ FT_WORD word;
+ MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam;
+ const char *docend= document + len;
+ while (maria_ft_simple_get_word(phrase_param->cs, (uchar**) &document,
+ docend, &word, FALSE))
+ {
+ param->mysql_add_word(param, word.pos, word.len, 0);
+ if (phrase_param->match)
+ break;
+ }
+ return 0;
+}
+
+
+/*
+ Checks if given buffer matches phrase list.
+
+ SYNOPSIS
+ _ftb_check_phrase()
+ s0 start of buffer
+ e0 end of buffer
+ phrase broken into list phrase
+ cs charset info
+
+ RETURN VALUE
+ 1 is returned if phrase found, 0 else.
+ -1 is returned if error occurs.
+*/
+
+static int _ftb_check_phrase(FTB *ftb, const uchar *document, uint len,
+ FTB_EXPR *ftbe, struct st_mysql_ftparser *parser)
+{
+ MY_FTB_PHRASE_PARAM ftb_param;
+ MYSQL_FTPARSER_PARAM *param;
+ DBUG_ENTER("_ftb_check_phrase");
+ DBUG_ASSERT(parser);
+
+ if (! (param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 1)))
+ DBUG_RETURN(0);
+ ftb_param.phrase= ftbe->phrase;
+ ftb_param.document= ftbe->document;
+ ftb_param.cs= ftb->charset;
+ ftb_param.phrase_length= list_length(ftbe->phrase);
+ ftb_param.document_length= 1;
+ ftb_param.match= 0;
+
+ param->mysql_parse= ftb_check_phrase_internal;
+ param->mysql_add_word= ftb_phrase_add_word;
+ param->mysql_ftparam= (void *)&ftb_param;
+ param->cs= ftb->charset;
+ param->doc= (uchar *)document;
+ param->length= len;
+ param->flags= 0;
+ param->mode= MYSQL_FTPARSER_WITH_STOPWORDS;
+ if (unlikely(parser->parse(param)))
+ return -1;
+ DBUG_RETURN(ftb_param.match ? 1 : 0);
+}
+
+
+static int _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig)
+{
+ FT_SEG_ITERATOR ftsi;
+ FTB_EXPR *ftbe;
+ float weight=ftbw->weight;
+ int yn_flag= ftbw->flags, ythresh, mode=(ftsi_orig != 0);
+ my_off_t curdoc=ftbw->docid[mode];
+ struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ?
+ &ft_default_parser :
+ ftb->info->s->keyinfo[ftb->keynr].parser;
+
+ for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up)
+ {
+ ythresh = ftbe->ythresh - (mode ? 0 : ftbe->yweaks);
+ if (ftbe->docid[mode] != curdoc)
+ {
+ ftbe->cur_weight=0;
+ ftbe->yesses=ftbe->nos=0;
+ ftbe->docid[mode]=curdoc;
+ }
+ if (ftbe->nos)
+ break;
+ if (yn_flag & FTB_FLAG_YES)
+ {
+ weight /= ftbe->ythresh;
+ ftbe->cur_weight += weight;
+ if ((int) ++ftbe->yesses == ythresh)
+ {
+ yn_flag=ftbe->flags;
+ weight=ftbe->cur_weight*ftbe->weight;
+ if (mode && ftbe->phrase)
+ {
+ int found= 0;
+
+ memcpy(&ftsi, ftsi_orig, sizeof(ftsi));
+ while (_ma_ft_segiterator(&ftsi) && !found)
+ {
+ if (!ftsi.pos)
+ continue;
+ found= _ftb_check_phrase(ftb, ftsi.pos, ftsi.len, ftbe, parser);
+ if (unlikely(found < 0))
+ return 1;
+ }
+ if (!found)
+ break;
+ } /* ftbe->quot */
+ }
+ else
+ break;
+ }
+ else
+ if (yn_flag & FTB_FLAG_NO)
+ {
+ /*
+ NOTE: special sort function of queue assures that all
+ (yn_flag & FTB_FLAG_NO) != 0
+ events for every particular subexpression will
+ "auto-magically" happen BEFORE all the
+ (yn_flag & FTB_FLAG_YES) != 0 events. So no
+ already matched expression can become not-matched again.
+ */
+ ++ftbe->nos;
+ break;
+ }
+ else
+ {
+ if (ftbe->ythresh)
+ weight/=3;
+ ftbe->cur_weight += weight;
+ if ((int) ftbe->yesses < ythresh)
+ break;
+ if (!(yn_flag & FTB_FLAG_WONLY))
+ yn_flag= ((int) ftbe->yesses++ == ythresh) ? ftbe->flags : FTB_FLAG_WONLY ;
+ weight*= ftbe->weight;
+ }
+ }
+ return 0;
+}
+
+
+int maria_ft_boolean_read_next(FT_INFO *ftb, char *record)
+{
+ FTB_EXPR *ftbe;
+ FTB_WORD *ftbw;
+ MARIA_HA *info=ftb->info;
+ my_off_t curdoc;
+
+ if (ftb->state != INDEX_SEARCH && ftb->state != INDEX_DONE)
+ return -1;
+
+ /* black magic ON */
+ if ((int) _ma_check_index(info, ftb->keynr) < 0)
+ return my_errno;
+ if (_ma_readinfo(info, F_RDLCK, 1))
+ return my_errno;
+ /* black magic OFF */
+
+ if (!ftb->queue.elements)
+ return my_errno=HA_ERR_END_OF_FILE;
+
+ /* Attention!!! Address of a local variable is used here! See err: label */
+ ftb->queue.first_cmp_arg=(void *)&curdoc;
+
+ while (ftb->state == INDEX_SEARCH &&
+ (curdoc=((FTB_WORD *)queue_top(& ftb->queue))->docid[0]) !=
+ HA_OFFSET_ERROR)
+ {
+ while (curdoc == (ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0])
+ {
+ if (unlikely(_ftb_climb_the_tree(ftb, ftbw, 0)))
+ {
+ my_errno= HA_ERR_OUT_OF_MEM;
+ goto err;
+ }
+
+ /* update queue */
+ _ft2_search(ftb, ftbw, 0);
+ queue_replaced(& ftb->queue);
+ }
+
+ ftbe=ftb->root;
+ if (ftbe->docid[0]==curdoc && ftbe->cur_weight>0 &&
+ ftbe->yesses>=(ftbe->ythresh-ftbe->yweaks) && !ftbe->nos)
+ {
+ /* curdoc matched ! */
+ if (is_tree_inited(&ftb->no_dupes) &&
+ tree_insert(&ftb->no_dupes, &curdoc, 0,
+ ftb->no_dupes.custom_arg)->count >1)
+ /* but it managed already to get past this line once */
+ continue;
+
+ info->cur_row.lastpos= curdoc;
+ /* Clear all states, except that the table was updated */
+ info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+
+ if (!(*info->read_record)(info, record, curdoc))
+ {
+ info->update|= HA_STATE_AKTIV; /* Record is read */
+ if (ftb->with_scan && maria_ft_boolean_find_relevance(ftb,record,0)==0)
+ continue; /* no match */
+ my_errno=0;
+ goto err;
+ }
+ goto err;
+ }
+ }
+ ftb->state=INDEX_DONE;
+ my_errno=HA_ERR_END_OF_FILE;
+err:
+ ftb->queue.first_cmp_arg=(void *)0;
+ return my_errno;
+}
+
+
+typedef struct st_my_ftb_find_param
+{
+ FT_INFO *ftb;
+ FT_SEG_ITERATOR *ftsi;
+} MY_FTB_FIND_PARAM;
+
+
+static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param,
+ char *word, int len,
+ MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused)))
+{
+ MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam;
+ FT_INFO *ftb= ftb_param->ftb;
+ FTB_WORD *ftbw;
+ int a, b, c;
+ for (a= 0, b= ftb->queue.elements, c= (a+b)/2; b-a>1; c= (a+b)/2)
+ {
+ ftbw= ftb->list[c];
+ if (ha_compare_text(ftb->charset, (uchar*)word, len,
+ (uchar*)ftbw->word+1, ftbw->len-1,
+ (my_bool)(ftbw->flags&FTB_FLAG_TRUNC), 0) > 0)
+ b= c;
+ else
+ a= c;
+ }
+ for (; c >= 0; c--)
+ {
+ ftbw= ftb->list[c];
+ if (ha_compare_text(ftb->charset, (uchar*)word, len,
+ (uchar*)ftbw->word + 1,ftbw->len - 1,
+ (my_bool)(ftbw->flags & FTB_FLAG_TRUNC), 0))
+ break;
+ if (ftbw->docid[1] == ftb->info->cur_row.lastpos)
+ continue;
+ ftbw->docid[1]= ftb->info->cur_row.lastpos;
+ if (unlikely(_ftb_climb_the_tree(ftb, ftbw, ftb_param->ftsi)))
+ return 1;
+ }
+ return(0);
+}
+
+
+static int ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM *param,
+ char *doc, int len)
+{
+ MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam;
+ FT_INFO *ftb= ftb_param->ftb;
+ char *end= doc + len;
+ FT_WORD w;
+ while (maria_ft_simple_get_word(ftb->charset, (uchar**) &doc, end, &w, TRUE))
+ param->mysql_add_word(param, w.pos, w.len, 0);
+ return(0);
+}
+
+
+float maria_ft_boolean_find_relevance(FT_INFO *ftb, uchar *record, uint length)
+{
+ FTB_EXPR *ftbe;
+ FT_SEG_ITERATOR ftsi, ftsi2;
+ MARIA_RECORD_POS docid= ftb->info->cur_row.lastpos;
+ MY_FTB_FIND_PARAM ftb_param;
+ MYSQL_FTPARSER_PARAM *param;
+ struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ?
+ &ft_default_parser :
+ ftb->info->s->keyinfo[ftb->keynr].parser;
+
+ if (docid == HA_OFFSET_ERROR)
+ return -2.0;
+ if (!ftb->queue.elements)
+ return 0;
+ if (! (param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 0)))
+ return 0;
+
+ if (ftb->state != INDEX_SEARCH && docid <= ftb->lastpos)
+ {
+ FTB_EXPR *x;
+ uint i;
+
+ for (i=0; i < ftb->queue.elements; i++)
+ {
+ ftb->list[i]->docid[1]=HA_OFFSET_ERROR;
+ for (x=ftb->list[i]->up; x; x=x->up)
+ x->docid[1]=HA_OFFSET_ERROR;
+ }
+ }
+
+ ftb->lastpos=docid;
+
+ if (ftb->keynr==NO_SUCH_KEY)
+ _ma_ft_segiterator_dummy_init(record, length, &ftsi);
+ else
+ _ma_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi);
+ memcpy(&ftsi2, &ftsi, sizeof(ftsi));
+
+ ftb_param.ftb= ftb;
+ ftb_param.ftsi= &ftsi2;
+ param->mysql_parse= ftb_find_relevance_parse;
+ param->mysql_add_word= ftb_find_relevance_add_word;
+ param->mysql_ftparam= (void *)&ftb_param;
+ param->flags= 0;
+ param->cs= ftb->charset;
+ param->mode= MYSQL_FTPARSER_SIMPLE_MODE;
+
+ while (_ma_ft_segiterator(&ftsi))
+ {
+ if (!ftsi.pos)
+ continue;
+ param->doc= (uchar *)ftsi.pos;
+ param->length= ftsi.len;
+ if (unlikely(parser->parse(param)))
+ return 0;
+ }
+ ftbe=ftb->root;
+ if (ftbe->docid[1]==docid && ftbe->cur_weight>0 &&
+ ftbe->yesses>=ftbe->ythresh && !ftbe->nos)
+ { /* row matched ! */
+ return ftbe->cur_weight;
+ }
+ else
+ { /* match failed ! */
+ return 0.0;
+ }
+}
+
+
+void maria_ft_boolean_close_search(FT_INFO *ftb)
+{
+ if (is_tree_inited(& ftb->no_dupes))
+ {
+ delete_tree(& ftb->no_dupes);
+ }
+ free_root(& ftb->mem_root, MYF(0));
+ my_free((uchar*)ftb,MYF(0));
+}
+
+
+float maria_ft_boolean_get_relevance(FT_INFO *ftb)
+{
+ return ftb->root->cur_weight;
+}
+
+
+void maria_ft_boolean_reinit_search(FT_INFO *ftb)
+{
+ _ftb_init_index_search(ftb);
+}
diff --git a/storage/maria/ma_ft_eval.c b/storage/maria/ma_ft_eval.c
new file mode 100644
index 00000000000..5fc67c6c664
--- /dev/null
+++ b/storage/maria/ma_ft_eval.c
@@ -0,0 +1,254 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code
+ added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */
+
+#include "ma_ftdefs.h"
+#include "maria_ft_eval.h"
+#include <stdarg.h>
+#include <my_getopt.h>
+
+static void print_error(int exit_code, const char *fmt,...);
+static void get_options(int argc, char *argv[]);
+static int create_record(char *pos, FILE *file);
+static void usage();
+
+static struct my_option my_long_options[] =
+{
+ {"", 's', "", 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"", 'q', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", 'S', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", '#', "", 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"", 'V', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", '?', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", 'h', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+int main(int argc, char *argv[])
+{
+ MARIA_HA *file;
+ int i,j;
+
+ MY_INIT(argv[0]);
+ get_options(argc,argv);
+ bzero((char*)recinfo,sizeof(recinfo));
+
+ maria_init();
+ /* First define 2 columns */
+ recinfo[0].type=FIELD_SKIP_ENDSPACE;
+ recinfo[0].length=docid_length;
+ recinfo[1].type=FIELD_BLOB;
+ recinfo[1].length= 4+portable_sizeof_char_ptr;
+
+ /* Define a key over the first column */
+ keyinfo[0].seg=keyseg;
+ keyinfo[0].keysegs=1;
+ keyinfo[0].block_length= 0; /* Default block length */
+ keyinfo[0].seg[0].type= HA_KEYTYPE_TEXT;
+ keyinfo[0].seg[0].flag= HA_BLOB_PART;
+ keyinfo[0].seg[0].start=recinfo[0].length;
+ keyinfo[0].seg[0].length=key_length;
+ keyinfo[0].seg[0].null_bit=0;
+ keyinfo[0].seg[0].null_pos=0;
+ keyinfo[0].seg[0].bit_start=4;
+ keyinfo[0].seg[0].language=MY_CHARSET_CURRENT;
+ keyinfo[0].flag = HA_FULLTEXT;
+
+ if (!silent)
+ printf("- Creating isam-file\n");
+ if (maria_create(filename,1,keyinfo,2,recinfo,0,NULL,(MARIA_CREATE_INFO*) 0,0))
+ goto err;
+ if (!(file=maria_open(filename,2,0)))
+ goto err;
+ if (!silent)
+ printf("Initializing stopwords\n");
+ maria_ft_init_stopwords(stopwordlist);
+
+ if (!silent)
+ printf("- Writing key:s\n");
+
+ my_errno=0;
+ i=0;
+ while (create_record(record,df))
+ {
+ error=maria_write(file,record);
+ if (error)
+ printf("I= %2d maria_write: %d errno: %d\n",i,error,my_errno);
+ i++;
+ }
+ fclose(df);
+
+ if (maria_close(file)) goto err;
+ if (!silent)
+ printf("- Reopening file\n");
+ if (!(file=maria_open(filename,2,0))) goto err;
+ if (!silent)
+ printf("- Reading rows with key\n");
+ for (i=1;create_record(record,qf);i++)
+ {
+ FT_DOCLIST *result;
+ double w;
+ int t, err;
+
+ result=maria_ft_nlq_init_search(file,0,blob_record,(uint) strlen(blob_record),1);
+ if (!result)
+ {
+ printf("Query %d failed with errno %3d\n",i,my_errno);
+ goto err;
+ }
+ if (!silent)
+ printf("Query %d. Found: %d.\n",i,result->ndocs);
+ for (j=0;(err=maria_ft_nlq_read_next(result, read_record))==0;j++)
+ {
+ t=uint2korr(read_record);
+ w=maria_ft_nlq_get_relevance(result);
+ printf("%d %.*s %f\n",i,t,read_record+2,w);
+ }
+ if (err != HA_ERR_END_OF_FILE)
+ {
+ printf("maria_ft_read_next %d failed with errno %3d\n",j,my_errno);
+ goto err;
+ }
+ maria_ft_nlq_close_search(result);
+ }
+
+ if (maria_close(file)) goto err;
+ maria_end();
+ my_end(MY_CHECK_ERROR);
+
+ return (0);
+
+ err:
+ printf("got error: %3d when using maria-database\n",my_errno);
+ return 1; /* skip warning */
+
+}
+
+
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+ char *argument)
+{
+ switch (optid) {
+ case 's':
+ if (stopwordlist && stopwordlist != maria_ft_precompiled_stopwords)
+ break;
+ {
+ FILE *f; char s[HA_FT_MAXLEN]; int i=0,n=SWL_INIT;
+
+ if (!(stopwordlist=(const char**) malloc(n*sizeof(char *))))
+ print_error(1,"malloc(%d)",n*sizeof(char *));
+ if (!(f=fopen(argument,"r")))
+ print_error(1,"fopen(%s)",argument);
+ while (!feof(f))
+ {
+ if (!(fgets(s,HA_FT_MAXLEN,f)))
+ print_error(1,"fgets(s,%d,%s)",HA_FT_MAXLEN,argument);
+ if (!(stopwordlist[i++]=strdup(s)))
+ print_error(1,"strdup(%s)",s);
+ if (i >= n)
+ {
+ n+=SWL_PLUS;
+ if (!(stopwordlist=(const char**) realloc((char*) stopwordlist,
+ n*sizeof(char *))))
+ print_error(1,"realloc(%d)",n*sizeof(char *));
+ }
+ }
+ fclose(f);
+ stopwordlist[i]=NULL;
+ break;
+ }
+ case 'q': silent=1; break;
+ case 'S': if (stopwordlist==maria_ft_precompiled_stopwords) stopwordlist=NULL; break;
+ case '#':
+ DBUG_PUSH (argument);
+ break;
+ case 'V':
+ case '?':
+ case 'h':
+ usage();
+ exit(1);
+ }
+ return 0;
+}
+
+
+static void get_options(int argc, char *argv[])
+{
+ int ho_error;
+
+ if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option)))
+ exit(ho_error);
+
+ if (!(d_file=argv[optind])) print_error(1,"No d_file");
+ if (!(df=fopen(d_file,"r")))
+ print_error(1,"fopen(%s)",d_file);
+ if (!(q_file=argv[optind+1])) print_error(1,"No q_file");
+ if (!(qf=fopen(q_file,"r")))
+ print_error(1,"fopen(%s)",q_file);
+ return;
+} /* get options */
+
+
+static int create_record(char *pos, FILE *file)
+{
+ uint tmp; char *ptr;
+
+ bzero((char *)pos,MAX_REC_LENGTH);
+
+ /* column 1 - VARCHAR */
+ if (!(fgets(pos+2,MAX_REC_LENGTH-32,file)))
+ {
+ if (feof(file))
+ return 0;
+ else
+ print_error(1,"fgets(docid) - 1");
+ }
+ tmp=(uint) strlen(pos+2)-1;
+ int2store(pos,tmp);
+ pos+=recinfo[0].length;
+
+ /* column 2 - BLOB */
+
+ if (!(fgets(blob_record,MAX_BLOB_LENGTH,file)))
+ print_error(1,"fgets(docid) - 2");
+ tmp=(uint) strlen(blob_record);
+ int4store(pos,tmp);
+ ptr=blob_record;
+ memcpy_fixed(pos+4,&ptr,sizeof(char*));
+ return 1;
+}
+
+/* VARARGS */
+
+static void print_error(int exit_code, const char *fmt,...)
+{
+ va_list args;
+
+ va_start(args,fmt);
+ fprintf(stderr,"%s: error: ",my_progname);
+ VOID(vfprintf(stderr, fmt, args));
+ VOID(fputc('\n',stderr));
+ fflush(stderr);
+ va_end(args);
+ exit(exit_code);
+}
+
+
+static void usage()
+{
+ printf("%s [options]\n", my_progname);
+ my_print_help(my_long_options);
+ my_print_variables(my_long_options);
+}
diff --git a/storage/maria/ma_ft_eval.h b/storage/maria/ma_ft_eval.h
new file mode 100644
index 00000000000..481943dfb0b
--- /dev/null
+++ b/storage/maria/ma_ft_eval.h
@@ -0,0 +1,41 @@
+/* Copyright (C) 2006 MySQL AB & Sergei A. Golubchik
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+const char **stopwordlist=maria_ft_precompiled_stopwords;
+
+#define MAX_REC_LENGTH 128
+#define MAX_BLOB_LENGTH 60000
+char record[MAX_REC_LENGTH], read_record[MAX_REC_LENGTH+MAX_BLOB_LENGTH];
+char blob_record[MAX_BLOB_LENGTH+20*20];
+
+char *filename= (char*) "EVAL";
+
+int silent=0, error=0;
+
+uint key_length=MAX_BLOB_LENGTH,docid_length=32;
+char *d_file, *q_file;
+FILE *df,*qf;
+
+MARIA_COLUMNDEF recinfo[3];
+MARIA_KEYDEF keyinfo[2];
+HA_KEYSEG keyseg[10];
+
+#define SWL_INIT 500
+#define SWL_PLUS 50
+
+#define MAX_LINE_LENGTH 128
+char line[MAX_LINE_LENGTH];
diff --git a/storage/maria/ma_ft_nlq_search.c b/storage/maria/ma_ft_nlq_search.c
new file mode 100644
index 00000000000..18b101f0e05
--- /dev/null
+++ b/storage/maria/ma_ft_nlq_search.c
@@ -0,0 +1,374 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+#define FT_CORE
+#include "ma_ftdefs.h"
+
+/* search with natural language queries */
+
+typedef struct ft_doc_rec
+{
+ my_off_t dpos;
+ double weight;
+} FT_DOC;
+
+struct st_ft_info
+{
+ struct _ft_vft *please;
+ MARIA_HA *info;
+ int ndocs;
+ int curdoc;
+ FT_DOC doc[1];
+};
+
+typedef struct st_all_in_one
+{
+ MARIA_HA *info;
+ uint keynr;
+ CHARSET_INFO *charset;
+ uchar *keybuff;
+ TREE dtree;
+} ALL_IN_ONE;
+
+typedef struct st_ft_superdoc
+{
+ FT_DOC doc;
+ FT_WORD *word_ptr;
+ double tmp_weight;
+} FT_SUPERDOC;
+
+static int FT_SUPERDOC_cmp(void* cmp_arg __attribute__((unused)),
+ FT_SUPERDOC *p1, FT_SUPERDOC *p2)
+{
+ if (p1->doc.dpos < p2->doc.dpos)
+ return -1;
+ if (p1->doc.dpos == p2->doc.dpos)
+ return 0;
+ return 1;
+}
+
+static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio)
+{
+ int subkeys, r;
+ uint keylen, doc_cnt;
+ FT_SUPERDOC sdoc, *sptr;
+ TREE_ELEMENT *selem;
+ double gweight=1;
+ MARIA_HA *info= aio->info;
+ uchar *keybuff= (uchar*) aio->keybuff;
+ MARIA_KEYDEF *keyinfo=info->s->keyinfo+aio->keynr;
+ my_off_t key_root=info->s->state.key_root[aio->keynr];
+ uint extra=HA_FT_WLEN+info->s->base.rec_reflength;
+#if HA_FT_WTYPE == HA_KEYTYPE_FLOAT
+ float tmp_weight;
+#else
+#error
+#endif
+
+ DBUG_ENTER("walk_and_match");
+
+ word->weight=LWS_FOR_QUERY;
+
+ keylen= _ma_ft_make_key(info,aio->keynr,(char*) keybuff,word,0);
+ keylen-=HA_FT_WLEN;
+ doc_cnt=0;
+
+ /* Skip rows inserted by current inserted */
+ for (r= _ma_search(info, keyinfo, keybuff, keylen, SEARCH_FIND, key_root) ;
+ !r &&
+ (subkeys=ft_sintXkorr(info->lastkey+info->lastkey_length-extra)) > 0 &&
+ info->cur_row.lastpos >= info->state->data_file_length ;
+ r= _ma_search_next(info, keyinfo, info->lastkey,
+ info->lastkey_length, SEARCH_BIGGER, key_root))
+ ;
+
+ info->update|= HA_STATE_AKTIV; /* for _ma_test_if_changed() */
+
+ /* The following should be safe, even if we compare doubles */
+ while (!r && gweight)
+ {
+
+ if (keylen &&
+ ha_compare_text(aio->charset,
+ (uchar*) info->lastkey+1, info->lastkey_length-extra-1,
+ (uchar*) keybuff+1, keylen-1, 0, 0))
+ break;
+
+ if (subkeys<0)
+ {
+ if (doc_cnt)
+ DBUG_RETURN(1); /* index is corrupted */
+ /*
+ TODO here: unsafe optimization, should this word
+ be skipped (based on subkeys) ?
+ */
+ keybuff+=keylen;
+ keyinfo=& info->s->ft2_keyinfo;
+ key_root= info->cur_row.lastpos;
+ keylen=0;
+ r= _ma_search_first(info, keyinfo, key_root);
+ goto do_skip;
+ }
+#if HA_FT_WTYPE == HA_KEYTYPE_FLOAT
+ tmp_weight=*(float*)&subkeys;
+#else
+#error
+#endif
+ /* The following should be safe, even if we compare doubles */
+ if (tmp_weight==0)
+ DBUG_RETURN(doc_cnt); /* stopword, doc_cnt should be 0 */
+
+ sdoc.doc.dpos= info->cur_row.lastpos;
+
+ /* saving document matched into dtree */
+ if (!(selem=tree_insert(&aio->dtree, &sdoc, 0, aio->dtree.custom_arg)))
+ DBUG_RETURN(1);
+
+ sptr=(FT_SUPERDOC *)ELEMENT_KEY((&aio->dtree), selem);
+
+ if (selem->count==1) /* document's first match */
+ sptr->doc.weight=0;
+ else
+ sptr->doc.weight+=sptr->tmp_weight*sptr->word_ptr->weight;
+
+ sptr->word_ptr=word;
+ sptr->tmp_weight=tmp_weight;
+
+ doc_cnt++;
+
+ gweight=word->weight*GWS_IN_USE;
+ if (gweight < 0 || doc_cnt > 2000000)
+ gweight=0;
+
+ if (_ma_test_if_changed(info) == 0)
+ r= _ma_search_next(info, keyinfo, info->lastkey, info->lastkey_length,
+ SEARCH_BIGGER, key_root);
+ else
+ r= _ma_search(info, keyinfo, info->lastkey, info->lastkey_length,
+ SEARCH_BIGGER, key_root);
+do_skip:
+ while ((subkeys=ft_sintXkorr(info->lastkey+info->lastkey_length-extra)) > 0 &&
+ !r && info->cur_row.lastpos >= info->state->data_file_length)
+ r= _ma_search_next(info, keyinfo, info->lastkey, info->lastkey_length,
+ SEARCH_BIGGER, key_root);
+
+ }
+ word->weight=gweight;
+
+ DBUG_RETURN(0);
+}
+
+
+static int walk_and_copy(FT_SUPERDOC *from,
+ uint32 count __attribute__((unused)), FT_DOC **to)
+{
+ DBUG_ENTER("walk_and_copy");
+ from->doc.weight+=from->tmp_weight*from->word_ptr->weight;
+ (*to)->dpos=from->doc.dpos;
+ (*to)->weight=from->doc.weight;
+ (*to)++;
+ DBUG_RETURN(0);
+}
+
+static int walk_and_push(FT_SUPERDOC *from,
+ uint32 count __attribute__((unused)), QUEUE *best)
+{
+ DBUG_ENTER("walk_and_copy");
+ from->doc.weight+=from->tmp_weight*from->word_ptr->weight;
+ set_if_smaller(best->elements, ft_query_expansion_limit-1);
+ queue_insert(best, (uchar *)& from->doc);
+ DBUG_RETURN(0);
+}
+
+
+static int FT_DOC_cmp(void *unused __attribute__((unused)),
+ FT_DOC *a, FT_DOC *b)
+{
+ return sgn(b->weight - a->weight);
+}
+
+
+FT_INFO *maria_ft_init_nlq_search(MARIA_HA *info, uint keynr, uchar *query,
+ uint query_len, uint flags, uchar *record)
+{
+ TREE wtree;
+ ALL_IN_ONE aio;
+ FT_DOC *dptr;
+ FT_INFO *dlist=NULL;
+ MARIA_RECORD_POS saved_lastpos= info->cur_row.lastpos;
+ struct st_mysql_ftparser *parser;
+ MYSQL_FTPARSER_PARAM *ftparser_param;
+ DBUG_ENTER("maria_ft_init_nlq_search");
+
+ /* black magic ON */
+ if ((int) (keynr = _ma_check_index(info,keynr)) < 0)
+ DBUG_RETURN(NULL);
+ if (_ma_readinfo(info,F_RDLCK,1))
+ DBUG_RETURN(NULL);
+ /* black magic OFF */
+
+ aio.info=info;
+ aio.keynr=keynr;
+ aio.charset=info->s->keyinfo[keynr].seg->charset;
+ aio.keybuff= (uchar*) info->lastkey+info->s->base.max_key_length;
+ parser= info->s->keyinfo[keynr].parser;
+ if (! (ftparser_param= maria_ftparser_call_initializer(info, keynr, 0)))
+ goto err;
+
+ bzero(&wtree,sizeof(wtree));
+
+ init_tree(&aio.dtree,0,0,sizeof(FT_SUPERDOC),(qsort_cmp2)&FT_SUPERDOC_cmp,0,
+ NULL, NULL);
+
+ maria_ft_parse_init(&wtree, aio.charset);
+ ftparser_param->flags= 0;
+ if (maria_ft_parse(&wtree, query, query_len, parser, ftparser_param,
+ &wtree.mem_root))
+ goto err;
+
+ if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio,
+ left_root_right))
+ goto err;
+
+ if (flags & FT_EXPAND && ft_query_expansion_limit)
+ {
+ QUEUE best;
+ init_queue(&best,ft_query_expansion_limit,0,0, (queue_compare) &FT_DOC_cmp,
+ 0);
+ tree_walk(&aio.dtree, (tree_walk_action) &walk_and_push,
+ &best, left_root_right);
+ while (best.elements)
+ {
+ my_off_t docid=((FT_DOC *)queue_remove(& best, 0))->dpos;
+ if (!(*info->read_record)(info, record, docid))
+ {
+ info->update|= HA_STATE_AKTIV;
+ ftparser_param->flags= MYSQL_FTFLAGS_NEED_COPY;
+ if (unlikely(_ma_ft_parse(&wtree, info, keynr, record, ftparser_param,
+ &wtree.mem_root)))
+ {
+ delete_queue(&best);
+ goto err;
+ }
+ }
+ }
+ delete_queue(&best);
+ reset_tree(&aio.dtree);
+ if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio,
+ left_root_right))
+ goto err;
+
+ }
+
+ /*
+ If ndocs == 0, this will not allocate RAM for FT_INFO.doc[],
+ so if ndocs == 0, FT_INFO.doc[] must not be accessed.
+ */
+ dlist=(FT_INFO *)my_malloc(sizeof(FT_INFO)+
+ sizeof(FT_DOC)*
+ (int)(aio.dtree.elements_in_tree-1),
+ MYF(0));
+ if (!dlist)
+ goto err;
+
+ dlist->please= (struct _ft_vft *) & _ma_ft_vft_nlq;
+ dlist->ndocs=aio.dtree.elements_in_tree;
+ dlist->curdoc=-1;
+ dlist->info=aio.info;
+ dptr=dlist->doc;
+
+ tree_walk(&aio.dtree, (tree_walk_action) &walk_and_copy,
+ &dptr, left_root_right);
+
+ if (flags & FT_SORTED)
+ qsort2(dlist->doc, dlist->ndocs, sizeof(FT_DOC), (qsort2_cmp)&FT_DOC_cmp, 0);
+
+err:
+ delete_tree(&aio.dtree);
+ delete_tree(&wtree);
+ info->cur_row.lastpos= saved_lastpos;
+ DBUG_RETURN(dlist);
+}
+
+
+int maria_ft_nlq_read_next(FT_INFO *handler, char *record)
+{
+ MARIA_HA *info= (MARIA_HA *) handler->info;
+
+ if (++handler->curdoc >= handler->ndocs)
+ {
+ --handler->curdoc;
+ return HA_ERR_END_OF_FILE;
+ }
+
+ info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+
+ info->cur_row.lastpos= handler->doc[handler->curdoc].dpos;
+ if (!(*info->read_record)(info, record, info->cur_row.lastpos))
+ {
+ info->update|= HA_STATE_AKTIV; /* Record is read */
+ return 0;
+ }
+ return my_errno;
+}
+
+
+float maria_ft_nlq_find_relevance(FT_INFO *handler,
+ uchar *record __attribute__((unused)),
+ uint length __attribute__((unused)))
+{
+ int a,b,c;
+ FT_DOC *docs=handler->doc;
+ MARIA_RECORD_POS docid= handler->info->cur_row.lastpos;
+
+ if (docid == HA_POS_ERROR)
+ return -5.0;
+
+ /* Assuming docs[] is sorted by dpos... */
+
+ for (a=0, b=handler->ndocs, c=(a+b)/2; b-a>1; c=(a+b)/2)
+ {
+ if (docs[c].dpos > docid)
+ b=c;
+ else
+ a=c;
+ }
+ /* bounds check to avoid accessing unallocated handler->doc */
+ if (a < handler->ndocs && docs[a].dpos == docid)
+ return (float) docs[a].weight;
+ else
+ return 0.0;
+}
+
+
+void maria_ft_nlq_close_search(FT_INFO *handler)
+{
+ my_free((uchar*)handler,MYF(0));
+}
+
+
+float maria_ft_nlq_get_relevance(FT_INFO *handler)
+{
+ return (float) handler->doc[handler->curdoc].weight;
+}
+
+
+void maria_ft_nlq_reinit_search(FT_INFO *handler)
+{
+ handler->curdoc=-1;
+}
+
diff --git a/storage/maria/ma_ft_parser.c b/storage/maria/ma_ft_parser.c
new file mode 100644
index 00000000000..2cbbb2dc5f7
--- /dev/null
+++ b/storage/maria/ma_ft_parser.c
@@ -0,0 +1,426 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+#include "ma_ftdefs.h"
+
+typedef struct st_maria_ft_docstat {
+ FT_WORD *list;
+ uint uniq;
+ double sum;
+} FT_DOCSTAT;
+
+
+typedef struct st_my_maria_ft_parser_param
+{
+ TREE *wtree;
+ MEM_ROOT *mem_root;
+} MY_FT_PARSER_PARAM;
+
+
+static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2)
+{
+ return ha_compare_text(cs, (uchar*) w1->pos, w1->len,
+ (uchar*) w2->pos, w2->len, 0, 0);
+}
+
+static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat)
+{
+ word->weight=LWS_IN_USE;
+ docstat->sum+=word->weight;
+ memcpy_fixed((docstat->list)++,word,sizeof(FT_WORD));
+ return 0;
+}
+
+/* transforms tree of words into the array, applying normalization */
+
+FT_WORD * maria_ft_linearize(TREE *wtree, MEM_ROOT *mem_root)
+{
+ FT_WORD *wlist,*p;
+ FT_DOCSTAT docstat;
+ DBUG_ENTER("maria_ft_linearize");
+
+ if ((wlist=(FT_WORD *) alloc_root(mem_root, sizeof(FT_WORD)*
+ (1+wtree->elements_in_tree))))
+ {
+ docstat.list=wlist;
+ docstat.uniq=wtree->elements_in_tree;
+ docstat.sum=0;
+ tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right);
+ }
+ delete_tree(wtree);
+ if (!wlist)
+ DBUG_RETURN(NULL);
+
+ docstat.list->pos=NULL;
+
+ for (p=wlist;p->pos;p++)
+ {
+ p->weight=PRENORM_IN_USE;
+ }
+
+ for (p=wlist;p->pos;p++)
+ {
+ p->weight/=NORM_IN_USE;
+ }
+
+ DBUG_RETURN(wlist);
+}
+
+my_bool maria_ft_boolean_check_syntax_string(const uchar *str)
+{
+ uint i, j;
+
+ if (!str ||
+ (strlen(str)+1 != sizeof(ft_boolean_syntax)) ||
+ (str[0] != ' ' && str[1] != ' '))
+ return 1;
+ for (i=0; i<sizeof(ft_boolean_syntax); i++)
+ {
+ /* limiting to 7-bit ascii only */
+ if ((unsigned char)(str[i]) > 127 ||
+ my_isalnum(default_charset_info, str[i]))
+ return 1;
+ for (j=0; j<i; j++)
+ if (str[i] == str[j] && (i != 11 || j != 10))
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ RETURN VALUE
+ 0 - eof
+ 1 - word found
+ 2 - left bracket
+ 3 - right bracket
+ 4 - stopword found
+*/
+uchar maria_ft_get_word(CHARSET_INFO *cs, uchar **start, uchar *end,
+ FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
+{
+ uchar *doc=*start;
+ int ctype;
+ uint mwc, length, mbl;
+
+ param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
+ param->weight_adjust= param->wasign= 0;
+ param->type= FT_TOKEN_EOF;
+
+ while (doc<end)
+ {
+ for (; doc < end; doc+= (mbl > 0 ? mbl : 1))
+ {
+ mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+ if (true_word_char(ctype, *doc))
+ break;
+ if (*doc == FTB_RQUOT && param->quot)
+ {
+ param->quot=doc;
+ *start=doc+1;
+ param->type= FT_TOKEN_RIGHT_PAREN;
+ goto ret;
+ }
+ if (!param->quot)
+ {
+ if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
+ {
+ /* param->prev=' '; */
+ *start=doc+1;
+ if (*doc == FTB_LQUOT) param->quot=*start;
+ param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN);
+ goto ret;
+ }
+ if (param->prev == ' ')
+ {
+ if (*doc == FTB_YES ) { param->yesno=+1; continue; } else
+ if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else
+ if (*doc == FTB_NO ) { param->yesno=-1; continue; } else
+ if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else
+ if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else
+ if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; }
+ }
+ }
+ param->prev=*doc;
+ param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
+ param->weight_adjust= param->wasign= 0;
+ }
+
+ mwc=length=0;
+ for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : 1))
+ {
+ mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+ if (true_word_char(ctype, *doc))
+ mwc=0;
+ else if (!misc_word_char(*doc) || mwc)
+ break;
+ else
+ mwc++;
+ }
+ param->prev='A'; /* be sure *prev is true_word_char */
+ word->len= (uint)(doc-word->pos) - mwc;
+ if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
+ doc++;
+
+ if (((length >= ft_min_word_len && !is_stopword(word->pos, word->len))
+ || param->trunc) && length < ft_max_word_len)
+ {
+ *start=doc;
+ param->type= FT_TOKEN_WORD;
+ goto ret;
+ }
+ else if (length) /* make sure length > 0 (if start contains spaces only) */
+ {
+ *start= doc;
+ param->type= FT_TOKEN_STOPWORD;
+ goto ret;
+ }
+ }
+ if (param->quot)
+ {
+ param->quot=*start=doc;
+ param->type= 3; /* FT_RBR */
+ goto ret;
+ }
+ret:
+ return param->type;
+}
+
+uchar maria_ft_simple_get_word(CHARSET_INFO *cs, uchar **start,
+ const uchar *end, FT_WORD *word,
+ my_bool skip_stopwords)
+{
+ uchar *doc= *start;
+ uint mwc, length, mbl;
+ int ctype;
+ DBUG_ENTER("maria_ft_simple_get_word");
+
+ do
+ {
+ for (;; doc+= (mbl > 0 ? mbl : 1))
+ {
+ if (doc >= end)
+ DBUG_RETURN(0);
+ mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+ if (true_word_char(ctype, *doc))
+ break;
+ }
+
+ mwc= length= 0;
+ for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : 1))
+ {
+ mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+ if (true_word_char(ctype, *doc))
+ mwc= 0;
+ else if (!misc_word_char(*doc) || mwc)
+ break;
+ else
+ mwc++;
+ }
+
+ word->len= (uint)(doc-word->pos) - mwc;
+
+ if (skip_stopwords == FALSE ||
+ (length >= ft_min_word_len && length < ft_max_word_len &&
+ !is_stopword(word->pos, word->len)))
+ {
+ *start= doc;
+ DBUG_RETURN(1);
+ }
+ } while (doc < end);
+ DBUG_RETURN(0);
+}
+
+void maria_ft_parse_init(TREE *wtree, CHARSET_INFO *cs)
+{
+ DBUG_ENTER("maria_ft_parse_init");
+ if (!is_tree_inited(wtree))
+ init_tree(wtree,0,0,sizeof(FT_WORD),(qsort_cmp2)&FT_WORD_cmp,0,NULL, cs);
+ DBUG_VOID_RETURN;
+}
+
+
+static int maria_ft_add_word(MYSQL_FTPARSER_PARAM *param,
+ char *word, int word_len,
+ MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused)))
+{
+ TREE *wtree;
+ FT_WORD w;
+ MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam;
+ DBUG_ENTER("maria_ft_add_word");
+ wtree= ft_param->wtree;
+ if (param->flags & MYSQL_FTFLAGS_NEED_COPY)
+ {
+ uchar *ptr;
+ DBUG_ASSERT(wtree->with_delete == 0);
+ ptr= (uchar *)alloc_root(ft_param->mem_root, word_len);
+ memcpy(ptr, word, word_len);
+ w.pos= ptr;
+ }
+ else
+ w.pos= word;
+ w.len= word_len;
+ if (!tree_insert(wtree, &w, 0, wtree->custom_arg))
+ {
+ delete_tree(wtree);
+ DBUG_RETURN(1);
+ }
+ DBUG_RETURN(0);
+}
+
+
+static int maria_ft_parse_internal(MYSQL_FTPARSER_PARAM *param,
+ char *doc_arg, int doc_len)
+{
+ uchar *doc= (uchar*) doc_arg;
+ uchar *end= doc + doc_len;
+ MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam;
+ TREE *wtree= ft_param->wtree;
+ FT_WORD w;
+ DBUG_ENTER("maria_ft_parse_internal");
+
+ while (maria_ft_simple_get_word(wtree->custom_arg, &doc, end, &w, TRUE))
+ if (param->mysql_add_word(param, w.pos, w.len, 0))
+ DBUG_RETURN(1);
+ DBUG_RETURN(0);
+}
+
+
+int maria_ft_parse(TREE *wtree, uchar *doc, int doclen,
+ struct st_mysql_ftparser *parser,
+ MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root)
+{
+ MY_FT_PARSER_PARAM my_param;
+ DBUG_ENTER("maria_ft_parse");
+ DBUG_ASSERT(parser);
+ my_param.wtree= wtree;
+ my_param.mem_root= mem_root;
+
+ param->mysql_parse= maria_ft_parse_internal;
+ param->mysql_add_word= maria_ft_add_word;
+ param->mysql_ftparam= &my_param;
+ param->cs= wtree->custom_arg;
+ param->doc= doc;
+ param->length= doclen;
+ param->mode= MYSQL_FTPARSER_SIMPLE_MODE;
+ DBUG_RETURN(parser->parse(param));
+}
+
+
+#define MAX_PARAM_NR 2
+MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info,
+ uint keynr, uint paramnr)
+{
+ uint32 ftparser_nr;
+ struct st_mysql_ftparser *parser;
+ if (! info->ftparser_param)
+ {
+ /* info->ftparser_param can not be zero after the initialization,
+ because it always includes built-in fulltext parser. And built-in
+ parser can be called even if the table has no fulltext indexes and
+ no varchar/text fields. */
+ if (! info->s->ftparsers)
+ {
+ /* It's ok that modification to shared structure is done w/o mutex
+ locks, because all threads would set the same variables to the
+ same values. */
+ uint i, j, keys= info->s->state.header.keys, ftparsers= 1;
+ for (i= 0; i < keys; i++)
+ {
+ MARIA_KEYDEF *keyinfo= &info->s->keyinfo[i];
+ if (keyinfo->flag & HA_FULLTEXT)
+ {
+ for (j= 0;; j++)
+ {
+ if (j == i)
+ {
+ keyinfo->ftparser_nr= ftparsers++;
+ break;
+ }
+ if (info->s->keyinfo[j].flag & HA_FULLTEXT &&
+ keyinfo->parser == info->s->keyinfo[j].parser)
+ {
+ keyinfo->ftparser_nr= info->s->keyinfo[j].ftparser_nr;
+ break;
+ }
+ }
+ }
+ }
+ info->s->ftparsers= ftparsers;
+ }
+ /*
+ We have to allocate two MYSQL_FTPARSER_PARAM structures per plugin
+ because in a boolean search a parser is called recursively
+ ftb_find_relevance* calls ftb_check_phrase*
+ (MAX_PARAM_NR=2)
+ */
+ info->ftparser_param= (MYSQL_FTPARSER_PARAM *)
+ my_malloc(MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) *
+ info->s->ftparsers, MYF(MY_WME|MY_ZEROFILL));
+ init_alloc_root(&info->ft_memroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0);
+ if (! info->ftparser_param)
+ return 0;
+ }
+ if (keynr == NO_SUCH_KEY)
+ {
+ ftparser_nr= 0;
+ parser= &ft_default_parser;
+ }
+ else
+ {
+ ftparser_nr= info->s->keyinfo[keynr].ftparser_nr;
+ parser= info->s->keyinfo[keynr].parser;
+ }
+ DBUG_ASSERT(paramnr < MAX_PARAM_NR);
+ ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr;
+ if (! info->ftparser_param[ftparser_nr].mysql_add_word)
+ {
+ /* Note, that mysql_add_word is used here as a flag:
+ mysql_add_word == 0 - parser is not initialized
+ mysql_add_word != 0 - parser is initialized, or no
+ initialization needed. */
+ info->ftparser_param[ftparser_nr].mysql_add_word= (void *)1;
+ if (parser->init && parser->init(&info->ftparser_param[ftparser_nr]))
+ return 0;
+ }
+ return &info->ftparser_param[ftparser_nr];
+}
+
+
+void maria_ftparser_call_deinitializer(MARIA_HA *info)
+{
+ uint i, j, keys= info->s->state.header.keys;
+ free_root(&info->ft_memroot, MYF(0));
+ if (! info->ftparser_param)
+ return;
+ for (i= 0; i < keys; i++)
+ {
+ MARIA_KEYDEF *keyinfo= &info->s->keyinfo[i];
+ for (j=0; j < MAX_PARAM_NR; j++)
+ {
+ MYSQL_FTPARSER_PARAM *ftparser_param=
+ &info->ftparser_param[keyinfo->ftparser_nr*MAX_PARAM_NR + j];
+ if (keyinfo->flag & HA_FULLTEXT && ftparser_param->mysql_add_word)
+ {
+ if (keyinfo->parser->deinit)
+ keyinfo->parser->deinit(ftparser_param);
+ ftparser_param->mysql_add_word= 0;
+ }
+ else
+ break;
+ }
+ }
+}
diff --git a/storage/maria/ma_ft_stem.c b/storage/maria/ma_ft_stem.c
new file mode 100644
index 00000000000..06fc0b2df6c
--- /dev/null
+++ b/storage/maria/ma_ft_stem.c
@@ -0,0 +1,18 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/* mulitingual stem */
diff --git a/storage/maria/ma_ft_test1.c b/storage/maria/ma_ft_test1.c
new file mode 100644
index 00000000000..4c98e766234
--- /dev/null
+++ b/storage/maria/ma_ft_test1.c
@@ -0,0 +1,317 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code
+ added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */
+
+#include "ma_ftdefs.h"
+#include "maria_ft_test1.h"
+#include <my_getopt.h>
+
+static int key_field=FIELD_VARCHAR,extra_field=FIELD_SKIP_ENDSPACE;
+static uint key_length=200,extra_length=50;
+static int key_type=HA_KEYTYPE_TEXT;
+static int verbose=0,silent=0,skip_update=0,
+ no_keys=0,no_stopwords=0,no_search=0,no_fulltext=0;
+static int create_flag=0,error=0;
+
+#define MAX_REC_LENGTH 300
+static char record[MAX_REC_LENGTH],read_record[MAX_REC_LENGTH];
+
+static int run_test(const char *filename);
+static void get_options(int argc, char *argv[]);
+static void create_record(char *, int);
+static void usage();
+
+static struct my_option my_long_options[] =
+{
+ {"", 'v', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", '?', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", 'h', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", 'V', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", 'v', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", 's', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", 'N', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", 'S', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", 'K', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", 'F', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", 'U', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", '#', "", 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+ { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+int main(int argc, char *argv[])
+{
+ MY_INIT(argv[0]);
+
+ get_options(argc,argv);
+ maria_init();
+
+ exit(run_test("FT1"));
+}
+
+static MARIA_COLUMNDEF recinfo[3];
+static MARIA_KEYDEF keyinfo[2];
+static HA_KEYSEG keyseg[10];
+
+static int run_test(const char *filename)
+{
+ MARIA_HA *file;
+ int i,j;
+ my_off_t pos;
+
+ bzero((char*) recinfo,sizeof(recinfo));
+
+ /* First define 2 columns */
+ recinfo[0].type=extra_field;
+ recinfo[0].length= (extra_field == FIELD_BLOB ? 4 + portable_sizeof_char_ptr :
+ extra_length);
+ if (extra_field == FIELD_VARCHAR)
+ recinfo[0].length+= HA_VARCHAR_PACKLENGTH(extra_length);
+ recinfo[1].type=key_field;
+ recinfo[1].length= (key_field == FIELD_BLOB ? 4+portable_sizeof_char_ptr :
+ key_length);
+ if (key_field == FIELD_VARCHAR)
+ recinfo[1].length+= HA_VARCHAR_PACKLENGTH(key_length);
+
+ /* Define a key over the first column */
+ keyinfo[0].seg=keyseg;
+ keyinfo[0].keysegs=1;
+ keyinfo[0].block_length= 0; /* Default block length */
+ keyinfo[0].seg[0].type= key_type;
+ keyinfo[0].seg[0].flag= (key_field == FIELD_BLOB) ? HA_BLOB_PART:
+ (key_field == FIELD_VARCHAR) ? HA_VAR_LENGTH_PART:0;
+ keyinfo[0].seg[0].start=recinfo[0].length;
+ keyinfo[0].seg[0].length=key_length;
+ keyinfo[0].seg[0].null_bit= 0;
+ keyinfo[0].seg[0].null_pos=0;
+ keyinfo[0].seg[0].language= default_charset_info->number;
+ keyinfo[0].flag = (no_fulltext?HA_PACK_KEY:HA_FULLTEXT);
+
+ if (!silent)
+ printf("- Creating isam-file\n");
+ if (maria_create(filename,(no_keys?0:1),keyinfo,2,recinfo,0,NULL,
+ (MARIA_CREATE_INFO*) 0, create_flag))
+ goto err;
+ if (!(file=maria_open(filename,2,0)))
+ goto err;
+
+ if (!silent)
+ printf("- %s stopwords\n",no_stopwords?"Skipping":"Initializing");
+ maria_ft_init_stopwords(no_stopwords?NULL:maria_ft_precompiled_stopwords);
+
+ if (!silent)
+ printf("- Writing key:s\n");
+
+ my_errno=0;
+ for (i=NUPD ; i<NDATAS; i++ )
+ {
+ create_record(record,i);
+ error=maria_write(file,record);
+ if (verbose || error)
+ printf("I= %2d maria_write: %d errno: %d, record: %s\n",
+ i,error,my_errno,data[i].f0);
+ }
+
+ if (!skip_update)
+ {
+ if (!silent)
+ printf("- Updating rows\n");
+
+ /* Read through all rows and update them */
+ pos=(ha_rows) 0;
+ i=0;
+ while ((error=maria_rrnd(file,read_record,pos)) == 0)
+ {
+ create_record(record,NUPD-i-1);
+ if (maria_update(file,read_record,record))
+ {
+ printf("Can't update row: %.*s, error: %d\n",
+ keyinfo[0].seg[0].length,record,my_errno);
+ }
+ if(++i == NUPD) break;
+ pos=HA_OFFSET_ERROR;
+ }
+ if (i != NUPD)
+ printf("Found %d of %d rows\n", i,NUPD);
+ }
+
+ if (maria_close(file)) goto err;
+ if(no_search) return 0;
+ if (!silent)
+ printf("- Reopening file\n");
+ if (!(file=maria_open(filename,2,0))) goto err;
+ if (!silent)
+ printf("- Reading rows with key\n");
+ for (i=0 ; i < NQUERIES ; i++)
+ {
+ FT_DOCLIST *result;
+ result=maria_ft_nlq_init_search(file,0,(char*) query[i],strlen(query[i]),1);
+ if(!result)
+ {
+ printf("Query %d: `%s' failed with errno %3d\n",i,query[i],my_errno);
+ continue;
+ }
+ printf("Query %d: `%s'. Found: %d. Top five documents:\n",
+ i,query[i],result->ndocs);
+ for (j=0;j<5;j++)
+ {
+ double w; int err;
+ err= maria_ft_nlq_read_next(result, read_record);
+ if (err==HA_ERR_END_OF_FILE)
+ {
+ printf("No more matches!\n");
+ break;
+ }
+ else if (err)
+ {
+ printf("maria_ft_read_next %d failed with errno %3d\n",j,my_errno);
+ break;
+ }
+ w=maria_ft_nlq_get_relevance(result);
+ if (key_field == FIELD_VARCHAR)
+ {
+ uint l;
+ char *p;
+ p=recinfo[0].length+read_record;
+ l=uint2korr(p);
+ printf("%10.7f: %.*s\n",w,(int) l,p+2);
+ }
+ else
+ printf("%10.7f: %.*s\n",w,recinfo[1].length,
+ recinfo[0].length+read_record);
+ }
+ maria_ft_nlq_close_search(result);
+ }
+
+ if (maria_close(file)) goto err;
+ maria_end();
+ my_end(MY_CHECK_ERROR);
+
+ return (0);
+err:
+ printf("got error: %3d when using maria-database\n",my_errno);
+ return 1; /* skip warning */
+}
+
+static char blob_key[MAX_REC_LENGTH];
+/* static char blob_record[MAX_REC_LENGTH+20*20]; */
+
+void create_record(char *pos, int n)
+{
+ bzero((char*) pos,MAX_REC_LENGTH);
+ if (recinfo[0].type == FIELD_BLOB)
+ {
+ uint tmp;
+ char *ptr;
+ strnmov(blob_key,data[n].f0,keyinfo[0].seg[0].length);
+ tmp=strlen(blob_key);
+ int4store(pos,tmp);
+ ptr=blob_key;
+ memcpy_fixed(pos+4,&ptr,sizeof(char*));
+ pos+=recinfo[0].length;
+ }
+ else if (recinfo[0].type == FIELD_VARCHAR)
+ {
+ uint tmp;
+ /* -1 is here because pack_length is stored in seg->length */
+ uint pack_length= HA_VARCHAR_PACKLENGTH(keyinfo[0].seg[0].length-1);
+ strnmov(pos+pack_length,data[n].f0,keyinfo[0].seg[0].length);
+ tmp=strlen(pos+pack_length);
+ if (pack_length == 1)
+ *pos= (char) tmp;
+ else
+ int2store(pos,tmp);
+ pos+=recinfo[0].length;
+ }
+ else
+ {
+ strnmov(pos,data[n].f0,keyinfo[0].seg[0].length);
+ pos+=recinfo[0].length;
+ }
+ if (recinfo[1].type == FIELD_BLOB)
+ {
+ uint tmp;
+ char *ptr;
+ strnmov(blob_key,data[n].f2,keyinfo[0].seg[0].length);
+ tmp=strlen(blob_key);
+ int4store(pos,tmp);
+ ptr=blob_key;
+ memcpy_fixed(pos+4,&ptr,sizeof(char*));
+ pos+=recinfo[1].length;
+ }
+ else if (recinfo[1].type == FIELD_VARCHAR)
+ {
+ uint tmp;
+ /* -1 is here because pack_length is stored in seg->length */
+ uint pack_length= HA_VARCHAR_PACKLENGTH(keyinfo[0].seg[0].length-1);
+ strnmov(pos+pack_length,data[n].f2,keyinfo[0].seg[0].length);
+ tmp=strlen(pos+1);
+ if (pack_length == 1)
+ *pos= (char) tmp;
+ else
+ int2store(pos,tmp);
+ pos+=recinfo[1].length;
+ }
+ else
+ {
+ strnmov(pos,data[n].f2,keyinfo[0].seg[0].length);
+ pos+=recinfo[1].length;
+ }
+}
+
+
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+ char *argument)
+{
+ switch(optid) {
+ case 'v': verbose=1; break;
+ case 's': silent=1; break;
+ case 'F': no_fulltext=1; no_search=1;
+ case 'U': skip_update=1; break;
+ case 'K': no_keys=no_search=1; break;
+ case 'N': no_search=1; break;
+ case 'S': no_stopwords=1; break;
+ case '#':
+ DBUG_PUSH (argument);
+ break;
+ case 'V':
+ case '?':
+ case 'h':
+ usage();
+ exit(1);
+ }
+ return 0;
+}
+
+/* Read options */
+
+static void get_options(int argc,char *argv[])
+{
+ int ho_error;
+
+ if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option)))
+ exit(ho_error);
+ return;
+} /* get options */
+
+
+static void usage()
+{
+ printf("%s [options]\n", my_progname);
+ my_print_help(my_long_options);
+ my_print_variables(my_long_options);
+}
diff --git a/storage/maria/ma_ft_test1.h b/storage/maria/ma_ft_test1.h
new file mode 100644
index 00000000000..5883c42f5c5
--- /dev/null
+++ b/storage/maria/ma_ft_test1.h
@@ -0,0 +1,420 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+#define NUPD 20
+#define NDATAS 389
+struct { const char *f0, *f2; } data[NDATAS] = {
+ {"1", "General Information about MySQL"},
+ {"1.1", "What is MySQL?"},
+ {"1.2", "About this manual"},
+ {"1.3", "History of MySQL"},
+ {"1.4", "The main features of MySQL"},
+ {"1.5", "General SQL information and tutorials"},
+ {"1.6", "Useful MySQL-related links"},
+ {"1.7", "What are stored procedures and triggers and so on?"},
+ {"2", "MySQL mailing lists and how to ask questions/give error (bug) reports"},
+ {"2.1", "Subscribing to/un-subscribing from the MySQL mailing list"},
+ {"2.2", "Asking questions or reporting bugs"},
+ {"2.3", "I think I have found a bug. What information do you need to help me?"},
+ {"2.3.1", "MySQL keeps crashing"},
+ {"2.4", "Guidelines for answering questions on the mailing list"},
+ {"3", "Licensing or When do I have/want to pay for MySQL?"},
+ {"3.1", "How much does MySQL cost?"},
+ {"3.2", "How do I get commercial support?"},
+ {"3.2.1", "Types of commercial support"},
+ {"3.2.1.1", "Basic email support"},
+ {"3.2.1.2", "Extended email support"},
+/*------------------------------- NUPD=20 -------------------------------*/
+ {"3.2.1.3", "Asking: Login support"},
+ {"3.2.1.4", "Extended login support"},
+ {"3.3", "How do I pay for licenses/support?"},
+ {"3.4", "Who do I contact when I want more information about licensing/support?"},
+ {"3.5", "What Copyright does MySQL use?"},
+ {"3.6", "When may I distribute MySQL commercially without a fee?"},
+ {"3.7", "I want to sell a product that can be configured to use MySQL"},
+ {"3.8", "I am running a commercial web server using MySQL"},
+ {"3.9", "Do I need a license to sell commercial Perl/tcl/PHP/Web+ etc applications?"},
+ {"3.10", "Possible future changes in the licensing"},
+ {"4", "Compiling and installing MySQL"},
+ {"4.1", "How do I get MySQL?"},
+ {"4.2", "Which MySQL version should I use?"},
+ {"4.3", "How/when will you release updates?"},
+ {"4.4", "What operating systems does MySQL support?"},
+ {"4.5", "Compiling MySQL from source code"},
+ {"4.5.1", "Quick installation overview"},
+ {"4.5.2", "Usual configure switches"},
+ {"4.5.3", "Applying a patch"},
+ {"4.6", "Problems compiling?"},
+ {"4.7", "General compilation notes"},
+ {"4.8", "MIT-pthreads notes (FreeBSD)"},
+ {"4.9", "Perl installation comments"},
+ {"4.10", "Special things to consider for some machine/OS combinations"},
+ {"4.10.1", "Solaris notes"},
+ {"4.10.2", "SunOS 4 notes"},
+ {"4.10.3", "Linux notes for all versions"},
+ {"4.10.3.1", "Linux-x86 notes"},
+ {"4.10.3.2", "RedHat 5.0"},
+ {"4.10.3.3", "RedHat 5.1"},
+ {"4.10.3.4", "Linux-Sparc notes"},
+ {"4.10.3.5", "Linux-Alpha notes"},
+ {"4.10.3.6", "MkLinux notes"},
+ {"4.10.4", "Alpha-DEC-Unix notes"},
+ {"4.10.5", "Alpha-DEC-OSF1 notes"},
+ {"4.10.6", "SGI-IRIX notes"},
+ {"4.10.7", "FreeBSD notes"},
+ {"4.10.7.1", "FreeBSD-3.0 notes"},
+ {"4.10.8", "BSD/OS 2.# notes"},
+ {"4.10.8.1", "BSD/OS 3.# notes"},
+ {"4.10.9", "SCO notes"},
+ {"4.10.10", "SCO Unixware 7.0 notes"},
+ {"4.10.11", "IBM-AIX notes"},
+ {"4.10.12", "HP-UX notes"},
+ {"4.11", "TcX binaries"},
+ {"4.12", "Win32 notes"},
+ {"4.13", "Installation instructions for MySQL binary releases"},
+ {"4.13.1", "How to get MySQL Perl support working"},
+ {"4.13.2", "Linux notes"},
+ {"4.13.3", "HP-UX notes"},
+ {"4.13.4", "Linking client libraries"},
+ {"4.14", "Problems running mysql_install_db"},
+ {"4.15", "Problems starting MySQL"},
+ {"4.16", "Automatic start/stop of MySQL"},
+ {"4.17", "Option files"},
+ {"5", "How standards-compatible is MySQL?"},
+ {"5.1", "What extensions has MySQL to ANSI SQL92?"},
+ {"5.2", "What functionality is missing in MySQL?"},
+ {"5.2.1", "Sub-selects"},
+ {"5.2.2", "SELECT INTO TABLE"},
+ {"5.2.3", "Transactions"},
+ {"5.2.4", "Triggers"},
+ {"5.2.5", "Foreign Keys"},
+ {"5.2.5.1", "Some reasons NOT to use FOREIGN KEYS"},
+ {"5.2.6", "Views"},
+ {"5.2.7", "-- as start of a comment"},
+ {"5.3", "What standards does MySQL follow?"},
+ {"5.4", "What functions exist only for compatibility?"},
+ {"5.5", "Limitations of BLOB and TEXT types"},
+ {"5.6", "How to cope without COMMIT-ROLLBACK"},
+ {"6", "The MySQL access privilege system"},
+ {"6.1", "What the privilege system does"},
+ {"6.2", "Connecting to the MySQL server"},
+ {"6.2.1", "Keeping your password secure"},
+ {"6.3", "Privileges provided by MySQL"},
+ {"6.4", "How the privilege system works"},
+ {"6.5", "The privilege tables"},
+ {"6.6", "Setting up the initial MySQL privileges"},
+ {"6.7", "Adding new user privileges to MySQL"},
+ {"6.8", "An example permission setup"},
+ {"6.9", "Causes of Access denied errors"},
+ {"6.10", "How to make MySQL secure against crackers"},
+ {"7", "MySQL language reference"},
+ {"7.1", "Literals: how to write strings and numbers"},
+ {"7.1.1", "Strings"},
+ {"7.1.2", "Numbers"},
+ {"7.1.3", "NULL values"},
+ {"7.1.4", "Database, table, index, column and alias names"},
+ {"7.1.4.1", "Case sensitivity in names"},
+ {"7.2", "Column types"},
+ {"7.2.1", "Column type storage requirements"},
+ {"7.2.5", "Numeric types"},
+ {"7.2.6", "Date and time types"},
+ {"7.2.6.1", "The DATE type"},
+ {"7.2.6.2", "The TIME type"},
+ {"7.2.6.3", "The DATETIME type"},
+ {"7.2.6.4", "The TIMESTAMP type"},
+ {"7.2.6.5", "The YEAR type"},
+ {"7.2.6.6", "Miscellaneous date and time properties"},
+ {"7.2.7", "String types"},
+ {"7.2.7.1", "The CHAR and VARCHAR types"},
+ {"7.2.7.2", "The BLOB and TEXT types"},
+ {"7.2.7.3", "The ENUM type"},
+ {"7.2.7.4", "The SET type"},
+ {"7.2.8", "Choosing the right type for a column"},
+ {"7.2.9", "Column indexes"},
+ {"7.2.10", "Multiple-column indexes"},
+ {"7.2.11", "Using column types from other database engines"},
+ {"7.3", "Functions for use in SELECT and WHERE clauses"},
+ {"7.3.1", "Grouping functions"},
+ {"7.3.2", "Normal arithmetic operations"},
+ {"7.3.3", "Bit functions"},
+ {"7.3.4", "Logical operations"},
+ {"7.3.5", "Comparison operators"},
+ {"7.3.6", "String comparison functions"},
+ {"7.3.7", "Control flow functions"},
+ {"7.3.8", "Mathematical functions"},
+ {"7.3.9", "String functions"},
+ {"7.3.10", "Date and time functions"},
+ {"7.3.11", "Miscellaneous functions"},
+ {"7.3.12", "Functions for use with GROUP BY clauses"},
+ {"7.4", "CREATE DATABASE syntax"},
+ {"7.5", "DROP DATABASE syntax"},
+ {"7.6", "CREATE TABLE syntax"},
+ {"7.7", "ALTER TABLE syntax"},
+ {"7.8", "OPTIMIZE TABLE syntax"},
+ {"7.9", "DROP TABLE syntax"},
+ {"7.10", "DELETE syntax"},
+ {"7.11", "SELECT syntax"},
+ {"7.12", "JOIN syntax"},
+ {"7.13", "INSERT syntax"},
+ {"7.14", "REPLACE syntax"},
+ {"7.15", "LOAD DATA INFILE syntax"},
+ {"7.16", "UPDATE syntax"},
+ {"7.17", "USE syntax"},
+ {"7.18", "SHOW syntax (Get information about tables, columns...)"},
+ {"7.19", "EXPLAIN syntax (Get information about a SELECT)"},
+ {"7.20", "DESCRIBE syntax (Get information about columns)"},
+ {"7.21", "LOCK TABLES/UNLOCK TABLES syntax"},
+ {"7.22", "SET OPTION syntax"},
+ {"7.23", "GRANT syntax (Compatibility function)"},
+ {"7.24", "CREATE INDEX syntax (Compatibility function)"},
+ {"7.25", "DROP INDEX syntax (Compatibility function)"},
+ {"7.26", "Comment syntax"},
+ {"7.27", "CREATE FUNCTION/DROP FUNCTION syntax"},
+ {"7.28", "Is MySQL picky about reserved words?"},
+ {"8", "Example SQL queries"},
+ {"8.1", "Queries from twin project"},
+ {"8.1.1", "Find all non-distributed twins"},
+ {"8.1.2", "Show a table on twin pair status"},
+ {"9", "How safe/stable is MySQL?"},
+ {"9.1", "How stable is MySQL?"},
+ {"9.2", "Why are there is so many releases of MySQL?"},
+ {"9.3", "Checking a table for errors"},
+ {"9.4", "How to repair tables"},
+ {"9.5", "Is there anything special to do when upgrading/downgrading MySQL?"},
+ {"9.5.1", "Upgrading from a 3.21 version to 3.22"},
+ {"9.5.2", "Upgrading from a 3.20 version to 3.21"},
+ {"9.5.3", "Upgrading to another architecture"},
+ {"9.6", "Year 2000 compliance"},
+ {"10", "MySQL Server functions"},
+ {"10.1", "What languages are supported by MySQL?"},
+ {"10.1.1", "Character set used for data &#38; sorting"},
+ {"10.2", "The update log"},
+ {"10.3", "How big can MySQL tables be?"},
+ {"11", "Getting maximum performance from MySQL"},
+ {"11.1", "How does one change the size of MySQL buffers?"},
+ {"11.2", "How compiling and linking affects the speed of MySQL"},
+ {"11.3", "How does MySQL use memory?"},
+ {"11.4", "How does MySQL use indexes?"},
+ {"11.5", "What optimizations are done on WHERE clauses?"},
+ {"11.6", "How does MySQL open &#38; close tables?"},
+ {"11.6.0.1", "What are the drawbacks of creating possibly thousands of tables in a database?"},
+ {"11.7", "How does MySQL lock tables?"},
+ {"11.8", "How should I arrange my table to be as fast/small as possible?"},
+ {"11.9", "What affects the speed of INSERT statements?"},
+ {"11.10", "What affects the speed DELETE statements?"},
+ {"11.11", "How do I get MySQL to run at full speed?"},
+ {"11.12", "What are the different row formats? Or, when should VARCHAR/CHAR be used?"},
+ {"11.13", "Why so many open tables?"},
+ {"12", "MySQL benchmark suite"},
+ {"13", "MySQL Utilites"},
+ {"13.1", "Overview of the different MySQL programs"},
+ {"13.2", "The MySQL table check, optimize and repair program"},
+ {"13.2.1", "isamchk memory use"},
+ {"13.2.2", "Getting low-level table information"},
+ {"13.3", "The MySQL compressed read-only table generator"},
+ {"14", "Adding new functions to MySQL"},
+ {"15", "MySQL ODBC Support"},
+ {"15.1", "Operating systems supported by MyODBC"},
+ {"15.2", "How to report problems with MyODBC"},
+ {"15.3", "Programs known to work with MyODBC"},
+ {"15.4", "How to fill in the various fields in the ODBC administrator program"},
+ {"15.5", "How to get the value of an AUTO_INCREMENT column in ODBC"},
+ {"16", "Problems and common errors"},
+ {"16.1", "Some common errors when using MySQL"},
+ {"16.1.1", "MySQL server has gone away error"},
+ {"16.1.2", "Can't connect to local MySQL server error"},
+ {"16.1.3", "Out of memory error"},
+ {"16.1.4", "Packet too large error"},
+ {"16.1.5", "The table is full error"},
+ {"16.1.6", "Commands out of sync error in client"},
+ {"16.1.7", "Removing user error"},
+ {"16.2", "How MySQL handles a full disk"},
+ {"16.3", "How to run SQL commands from a text file"},
+ {"16.4", "Where MySQL stores temporary files"},
+ {"16.5", "Access denied error"},
+ {"16.6", "How to run MySQL as a normal user"},
+ {"16.7", "Problems with file permissions"},
+ {"16.8", "File not found"},
+ {"16.9", "Problems using DATE columns"},
+ {"16.10", "Case sensitivity in searches"},
+ {"16.11", "Problems with NULL values"},
+ {"17", "Solving some common problems with MySQL"},
+ {"17.1", "Database replication"},
+ {"17.2", "Database backups"},
+ {"18", "MySQL client tools and API's"},
+ {"18.1", "MySQL C API"},
+ {"18.2", "C API datatypes"},
+ {"18.3", "C API function overview"},
+ {"18.4", "C API function descriptions"},
+ {"18.4.1", "mysql_affected_rows()"},
+ {"18.4.2", "mysql_close()"},
+ {"18.4.3", "mysql_connect()"},
+ {"18.4.4", "mysql_create_db()"},
+ {"18.4.5", "mysql_data_seek()"},
+ {"18.4.6", "mysql_debug()"},
+ {"18.4.7", "mysql_drop_db()"},
+ {"18.4.8", "mysql_dump_debug_info()"},
+ {"18.4.9", "mysql_eof()"},
+ {"18.4.10", "mysql_errno()"},
+ {"18.4.11", "mysql_error()"},
+ {"18.4.12", "mysql_escape_string()"},
+ {"18.4.13", "mysql_fetch_field()"},
+ {"18.4.14", "mysql_fetch_fields()"},
+ {"18.4.15", "mysql_fetch_field_direct()"},
+ {"18.4.16", "mysql_fetch_lengths()"},
+ {"18.4.17", "mysql_fetch_row()"},
+ {"18.4.18", "mysql_field_seek()"},
+ {"18.4.19", "mysql_field_tell()"},
+ {"18.4.20", "mysql_free_result()"},
+ {"18.4.21", "mysql_get_client_info()"},
+ {"18.4.22", "mysql_get_host_info()"},
+ {"18.4.23", "mysql_get_proto_info()"},
+ {"18.4.24", "mysql_get_server_info()"},
+ {"18.4.25", "mysql_info()"},
+ {"18.4.26", "mysql_init()"},
+ {"18.4.27", "mysql_insert_id()"},
+ {"18.4.28", "mysql_kill()"},
+ {"18.4.29", "mysql_list_dbs()"},
+ {"18.4.30", "mysql_list_fields()"},
+ {"18.4.31", "mysql_list_processes()"},
+ {"18.4.32", "mysql_list_tables()"},
+ {"18.4.33", "mysql_num_fields()"},
+ {"18.4.34", "mysql_num_rows()"},
+ {"18.4.35", "mysql_query()"},
+ {"18.4.36", "mysql_real_connect()"},
+ {"18.4.37", "mysql_real_query()"},
+ {"18.4.38", "mysql_reload()"},
+ {"18.4.39", "mysql_row_tell()"},
+ {"18.4.40", "mysql_select_db()"},
+ {"18.4.41", "mysql_shutdown()"},
+ {"18.4.42", "mysql_stat()"},
+ {"18.4.43", "mysql_store_result()"},
+ {"18.4.44", "mysql_thread_id()"},
+ {"18.4.45", "mysql_use_result()"},
+ {"18.4.46", "Why is it that after mysql_query() returns success, mysql_store_result() sometimes returns NULL?"},
+ {"18.4.47", "What results can I get from a query?"},
+ {"18.4.48", "How can I get the unique ID for the last inserted row?"},
+ {"18.4.49", "Problems linking with the C API"},
+ {"18.4.50", "How to make a thread-safe client"},
+ {"18.5", "MySQL Perl API's"},
+ {"18.5.1", "DBI with DBD::mysql"},
+ {"18.5.1.1", "The DBI interface"},
+ {"18.5.1.2", "More DBI/DBD information"},
+ {"18.6", "MySQL Java connectivity (JDBC)"},
+ {"18.7", "MySQL PHP API's"},
+ {"18.8", "MySQL C++ API's"},
+ {"18.9", "MySQL Python API's"},
+ {"18.10", "MySQL TCL API's"},
+ {"19", "How MySQL compares to other databases"},
+ {"19.1", "How MySQL compares to mSQL"},
+ {"19.1.1", "How to convert mSQL tools for MySQL"},
+ {"19.1.2", "How mSQL and MySQL client/server communications protocols differ"},
+ {"19.1.3", "How mSQL 2.0 SQL syntax differs from MySQL"},
+ {"19.2", "How MySQL compares to PostgreSQL"},
+ {"A", "Some users of MySQL"},
+ {"B", "Contributed programs"},
+ {"C", "Contributors to MySQL"},
+ {"D", "MySQL change history"},
+ {"19.3", "Changes in release 3.22.x (Alpha version)"},
+ {"19.3.1", "Changes in release 3.22.7"},
+ {"19.3.2", "Changes in release 3.22.6"},
+ {"19.3.3", "Changes in release 3.22.5"},
+ {"19.3.4", "Changes in release 3.22.4"},
+ {"19.3.5", "Changes in release 3.22.3"},
+ {"19.3.6", "Changes in release 3.22.2"},
+ {"19.3.7", "Changes in release 3.22.1"},
+ {"19.3.8", "Changes in release 3.22.0"},
+ {"19.4", "Changes in release 3.21.x"},
+ {"19.4.1", "Changes in release 3.21.33"},
+ {"19.4.2", "Changes in release 3.21.32"},
+ {"19.4.3", "Changes in release 3.21.31"},
+ {"19.4.4", "Changes in release 3.21.30"},
+ {"19.4.5", "Changes in release 3.21.29"},
+ {"19.4.6", "Changes in release 3.21.28"},
+ {"19.4.7", "Changes in release 3.21.27"},
+ {"19.4.8", "Changes in release 3.21.26"},
+ {"19.4.9", "Changes in release 3.21.25"},
+ {"19.4.10", "Changes in release 3.21.24"},
+ {"19.4.11", "Changes in release 3.21.23"},
+ {"19.4.12", "Changes in release 3.21.22"},
+ {"19.4.13", "Changes in release 3.21.21a"},
+ {"19.4.14", "Changes in release 3.21.21"},
+ {"19.4.15", "Changes in release 3.21.20"},
+ {"19.4.16", "Changes in release 3.21.19"},
+ {"19.4.17", "Changes in release 3.21.18"},
+ {"19.4.18", "Changes in release 3.21.17"},
+ {"19.4.19", "Changes in release 3.21.16"},
+ {"19.4.20", "Changes in release 3.21.15"},
+ {"19.4.21", "Changes in release 3.21.14b"},
+ {"19.4.22", "Changes in release 3.21.14a"},
+ {"19.4.23", "Changes in release 3.21.13"},
+ {"19.4.24", "Changes in release 3.21.12"},
+ {"19.4.25", "Changes in release 3.21.11"},
+ {"19.4.26", "Changes in release 3.21.10"},
+ {"19.4.27", "Changes in release 3.21.9"},
+ {"19.4.28", "Changes in release 3.21.8"},
+ {"19.4.29", "Changes in release 3.21.7"},
+ {"19.4.30", "Changes in release 3.21.6"},
+ {"19.4.31", "Changes in release 3.21.5"},
+ {"19.4.32", "Changes in release 3.21.4"},
+ {"19.4.33", "Changes in release 3.21.3"},
+ {"19.4.34", "Changes in release 3.21.2"},
+ {"19.4.35", "Changes in release 3.21.0"},
+ {"19.5", "Changes in release 3.20.x"},
+ {"19.5.1", "Changes in release 3.20.18"},
+ {"19.5.2", "Changes in release 3.20.17"},
+ {"19.5.3", "Changes in release 3.20.16"},
+ {"19.5.4", "Changes in release 3.20.15"},
+ {"19.5.5", "Changes in release 3.20.14"},
+ {"19.5.6", "Changes in release 3.20.13"},
+ {"19.5.7", "Changes in release 3.20.11"},
+ {"19.5.8", "Changes in release 3.20.10"},
+ {"19.5.9", "Changes in release 3.20.9"},
+ {"19.5.10", "Changes in release 3.20.8"},
+ {"19.5.11", "Changes in release 3.20.7"},
+ {"19.5.12", "Changes in release 3.20.6"},
+ {"19.5.13", "Changes in release 3.20.3"},
+ {"19.5.14", "Changes in release 3.20.0"},
+ {"19.6", "Changes in release 3.19.x"},
+ {"19.6.1", "Changes in release 3.19.5"},
+ {"19.6.2", "Changes in release 3.19.4"},
+ {"19.6.3", "Changes in release 3.19.3"},
+ {"E", "Known errors and design deficiencies in MySQL"},
+ {"F", "List of things we want to add to MySQL in the future (The TODO)"},
+ {"19.7", "Things that must done in the real near future"},
+ {"19.8", "Things that have to be done sometime"},
+ {"19.9", "Some things we don't have any plans to do"},
+ {"G", "Comments on porting to other systems"},
+ {"19.10", "Debugging MySQL"},
+ {"19.11", "Comments about RTS threads"},
+ {"19.12", "What is the difference between different thread packages?"},
+ {"H", "Description of MySQL regular expression syntax"},
+ {"I", "What is Unireg?"},
+ {"J", "The MySQL server license"},
+ {"K", "The MySQL license for Microsoft operating systems"},
+ {"*", "SQL command, type and function index"},
+ {"*", "Concept Index"}
+};
+
+#define NQUERIES 5
+const char *query[NQUERIES]={
+ "mysql information and manual",
+ "upgrading from previous version",
+ "column indexes",
+ "against about after more right the with/without", /* stopwords test */
+ "mysql license and copyright"
+};
diff --git a/storage/maria/ma_ft_update.c b/storage/maria/ma_ft_update.c
new file mode 100644
index 00000000000..f36147ccde2
--- /dev/null
+++ b/storage/maria/ma_ft_update.c
@@ -0,0 +1,352 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/* functions to work with full-text indices */
+
+#include "ma_ftdefs.h"
+#include <math.h>
+
+void _ma_ft_segiterator_init(MARIA_HA *info, uint keynr, const uchar *record,
+ FT_SEG_ITERATOR *ftsi)
+{
+ DBUG_ENTER("_ma_ft_segiterator_init");
+
+ ftsi->num=info->s->keyinfo[keynr].keysegs;
+ ftsi->seg=info->s->keyinfo[keynr].seg;
+ ftsi->rec=record;
+ DBUG_VOID_RETURN;
+}
+
+void _ma_ft_segiterator_dummy_init(const uchar *record, uint len,
+ FT_SEG_ITERATOR *ftsi)
+{
+ DBUG_ENTER("_ma_ft_segiterator_dummy_init");
+
+ ftsi->num=1;
+ ftsi->seg=0;
+ ftsi->pos=record;
+ ftsi->len=len;
+ DBUG_VOID_RETURN;
+}
+
+/*
+ This function breaks convention "return 0 in success"
+ but it's easier to use like this
+
+ while(_ma_ft_segiterator())
+
+ so "1" means "OK", "0" means "EOF"
+*/
+
+uint _ma_ft_segiterator(register FT_SEG_ITERATOR *ftsi)
+{
+ DBUG_ENTER("_ma_ft_segiterator");
+
+ if (!ftsi->num)
+ DBUG_RETURN(0);
+
+ ftsi->num--;
+ if (!ftsi->seg)
+ DBUG_RETURN(1);
+
+ ftsi->seg--;
+
+ if (ftsi->seg->null_bit &&
+ (ftsi->rec[ftsi->seg->null_pos] & ftsi->seg->null_bit))
+ {
+ ftsi->pos=0;
+ DBUG_RETURN(1);
+ }
+ ftsi->pos= ftsi->rec+ftsi->seg->start;
+ if (ftsi->seg->flag & HA_VAR_LENGTH_PART)
+ {
+ uint pack_length= (ftsi->seg->bit_start);
+ ftsi->len= (pack_length == 1 ? (uint) *(uchar*) ftsi->pos :
+ uint2korr(ftsi->pos));
+ ftsi->pos+= pack_length; /* Skip VARCHAR length */
+ DBUG_RETURN(1);
+ }
+ if (ftsi->seg->flag & HA_BLOB_PART)
+ {
+ ftsi->len= _ma_calc_blob_length(ftsi->seg->bit_start,ftsi->pos);
+ memcpy_fixed((char*) &ftsi->pos, ftsi->pos+ftsi->seg->bit_start,
+ sizeof(char*));
+ DBUG_RETURN(1);
+ }
+ ftsi->len=ftsi->seg->length;
+ DBUG_RETURN(1);
+}
+
+
+/* parses a document i.e. calls maria_ft_parse for every keyseg */
+
+uint _ma_ft_parse(TREE *parsed, MARIA_HA *info, uint keynr, const uchar *record,
+ MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root)
+{
+ FT_SEG_ITERATOR ftsi;
+ struct st_mysql_ftparser *parser;
+ DBUG_ENTER("_ma_ft_parse");
+
+ _ma_ft_segiterator_init(info, keynr, record, &ftsi);
+
+ maria_ft_parse_init(parsed, info->s->keyinfo[keynr].seg->charset);
+ parser= info->s->keyinfo[keynr].parser;
+ while (_ma_ft_segiterator(&ftsi))
+ {
+ if (ftsi.pos)
+ if (maria_ft_parse(parsed, (uchar *)ftsi.pos, ftsi.len, parser, param,
+ mem_root))
+ DBUG_RETURN(1);
+ }
+ DBUG_RETURN(0);
+}
+
+FT_WORD * _ma_ft_parserecord(MARIA_HA *info, uint keynr, const uchar *record,
+ MEM_ROOT *mem_root)
+{
+ TREE ptree;
+ MYSQL_FTPARSER_PARAM *param;
+ DBUG_ENTER("_ma_ft_parserecord");
+ if (! (param= maria_ftparser_call_initializer(info, keynr, 0)))
+ DBUG_RETURN(NULL);
+ bzero((char*) &ptree, sizeof(ptree));
+ param->flags= 0;
+ if (_ma_ft_parse(&ptree, info, keynr, record, param, mem_root))
+ DBUG_RETURN(NULL);
+
+ DBUG_RETURN(maria_ft_linearize(&ptree, mem_root));
+}
+
+static int _ma_ft_store(MARIA_HA *info, uint keynr, uchar *keybuf,
+ FT_WORD *wlist, my_off_t filepos)
+{
+ uint key_length;
+ DBUG_ENTER("_ma_ft_store");
+
+ for (; wlist->pos; wlist++)
+ {
+ key_length= _ma_ft_make_key(info,keynr,keybuf,wlist,filepos);
+ if (_ma_ck_write(info, keynr, keybuf, key_length))
+ DBUG_RETURN(1);
+ }
+ DBUG_RETURN(0);
+}
+
+static int _ma_ft_erase(MARIA_HA *info, uint keynr, uchar *keybuf,
+ FT_WORD *wlist, my_off_t filepos)
+{
+ uint key_length, err=0;
+ DBUG_ENTER("_ma_ft_erase");
+
+ for (; wlist->pos; wlist++)
+ {
+ key_length= _ma_ft_make_key(info,keynr,keybuf,wlist,filepos);
+ if (_ma_ck_delete(info, keynr, keybuf, key_length))
+ err=1;
+ }
+ DBUG_RETURN(err);
+}
+
+/*
+ Compares an appropriate parts of two WORD_KEY keys directly out of records
+ returns 1 if they are different
+*/
+
+#define THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT 1
+#define GEE_THEY_ARE_ABSOLUTELY_IDENTICAL 0
+
+int _ma_ft_cmp(MARIA_HA *info, uint keynr, const uchar *rec1, const uchar *rec2)
+{
+ FT_SEG_ITERATOR ftsi1, ftsi2;
+ CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset;
+ DBUG_ENTER("_ma_ft_cmp");
+
+ _ma_ft_segiterator_init(info, keynr, rec1, &ftsi1);
+ _ma_ft_segiterator_init(info, keynr, rec2, &ftsi2);
+
+ while (_ma_ft_segiterator(&ftsi1) && _ma_ft_segiterator(&ftsi2))
+ {
+ if ((ftsi1.pos != ftsi2.pos) &&
+ (!ftsi1.pos || !ftsi2.pos ||
+ ha_compare_text(cs, (uchar*) ftsi1.pos,ftsi1.len,
+ (uchar*) ftsi2.pos,ftsi2.len,0,0)))
+ DBUG_RETURN(THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT);
+ }
+ DBUG_RETURN(GEE_THEY_ARE_ABSOLUTELY_IDENTICAL);
+}
+
+
+/* update a document entry */
+
+int _ma_ft_update(MARIA_HA *info, uint keynr, uchar *keybuf,
+ const uchar *oldrec, const uchar *newrec, my_off_t pos)
+{
+ int error= -1;
+ FT_WORD *oldlist,*newlist, *old_word, *new_word;
+ CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset;
+ uint key_length;
+ int cmp, cmp2;
+ DBUG_ENTER("_ma_ft_update");
+
+ if (!(old_word=oldlist=_ma_ft_parserecord(info, keynr, oldrec,
+ &info->ft_memroot)) ||
+ !(new_word=newlist=_ma_ft_parserecord(info, keynr, newrec,
+ &info->ft_memroot)))
+ goto err;
+
+ error=0;
+ while(old_word->pos && new_word->pos)
+ {
+ cmp= ha_compare_text(cs, (uchar*) old_word->pos,old_word->len,
+ (uchar*) new_word->pos,new_word->len,0,0);
+ cmp2= cmp ? 0 : (fabs(old_word->weight - new_word->weight) > 1.e-5);
+
+ if (cmp < 0 || cmp2)
+ {
+ key_length= _ma_ft_make_key(info,keynr,keybuf,old_word,pos);
+ if ((error= _ma_ck_delete(info,keynr, keybuf,key_length)))
+ goto err;
+ }
+ if (cmp > 0 || cmp2)
+ {
+ key_length= _ma_ft_make_key(info, keynr, keybuf, new_word,pos);
+ if ((error= _ma_ck_write(info, keynr, keybuf,key_length)))
+ goto err;
+ }
+ if (cmp<=0) old_word++;
+ if (cmp>=0) new_word++;
+ }
+ if (old_word->pos)
+ error= _ma_ft_erase(info,keynr,keybuf,old_word,pos);
+ else if (new_word->pos)
+ error= _ma_ft_store(info,keynr,keybuf,new_word,pos);
+
+err:
+ free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE));
+ DBUG_RETURN(error);
+}
+
+
+/* adds a document to the collection */
+
+int _ma_ft_add(MARIA_HA *info, uint keynr, uchar *keybuf, const uchar *record,
+ my_off_t pos)
+{
+ int error= -1;
+ FT_WORD *wlist;
+ DBUG_ENTER("_ma_ft_add");
+ DBUG_PRINT("enter",("keynr: %d",keynr));
+
+ if ((wlist= _ma_ft_parserecord(info, keynr, record, &info->ft_memroot)))
+ error= _ma_ft_store(info,keynr,keybuf,wlist,pos);
+ free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE));
+ DBUG_PRINT("exit",("Return: %d",error));
+ DBUG_RETURN(error);
+}
+
+
+/* removes a document from the collection */
+
+int _ma_ft_del(MARIA_HA *info, uint keynr, uchar *keybuf, const uchar *record,
+ my_off_t pos)
+{
+ int error= -1;
+ FT_WORD *wlist;
+ DBUG_ENTER("_ma_ft_del");
+ DBUG_PRINT("enter",("keynr: %d",keynr));
+
+ if ((wlist= _ma_ft_parserecord(info, keynr, record, &info->ft_memroot)))
+ error= _ma_ft_erase(info,keynr,keybuf,wlist,pos);
+ free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE));
+ DBUG_PRINT("exit",("Return: %d",error));
+ DBUG_RETURN(error);
+}
+
+
+uint _ma_ft_make_key(MARIA_HA *info, uint keynr, uchar *keybuf, FT_WORD *wptr,
+ my_off_t filepos)
+{
+ uchar buf[HA_FT_MAXBYTELEN+16];
+ DBUG_ENTER("_ma_ft_make_key");
+
+#if HA_FT_WTYPE == HA_KEYTYPE_FLOAT
+ {
+ float weight=(float) ((filepos==HA_OFFSET_ERROR) ? 0 : wptr->weight);
+ mi_float4store(buf,weight);
+ }
+#else
+#error
+#endif
+
+ int2store(buf+HA_FT_WLEN,wptr->len);
+ memcpy(buf+HA_FT_WLEN+2,wptr->pos,wptr->len);
+ DBUG_RETURN(_ma_make_key(info, keynr, keybuf, buf, filepos));
+}
+
+
+/*
+ convert key value to ft2
+*/
+
+uint _ma_ft_convert_to_ft2(MARIA_HA *info, uint keynr, uchar *key)
+{
+ my_off_t root;
+ DYNAMIC_ARRAY *da=info->ft1_to_ft2;
+ MARIA_KEYDEF *keyinfo=&info->s->ft2_keyinfo;
+ uchar *key_ptr= (uchar*) dynamic_array_ptr(da, 0), *end;
+ uint length, key_length;
+ DBUG_ENTER("_ma_ft_convert_to_ft2");
+
+ /* we'll generate one pageful at once, and insert the rest one-by-one */
+ /* calculating the length of this page ...*/
+ length=(keyinfo->block_length-2) / keyinfo->keylength;
+ set_if_smaller(length, da->elements);
+ length=length * keyinfo->keylength;
+
+ get_key_full_length_rdonly(key_length, key);
+ while (_ma_ck_delete(info, keynr, key, key_length) == 0)
+ {
+ /*
+ nothing to do here.
+ _ma_ck_delete() will populate info->ft1_to_ft2 with deleted keys
+ */
+ }
+
+ /* creating pageful of keys */
+ maria_putint(info->buff,length+2,0);
+ memcpy(info->buff+2, key_ptr, length);
+ info->keyread_buff_used=info->page_changed=1; /* info->buff is used */
+ if ((root= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR ||
+ _ma_write_keypage(info,keyinfo,root,DFLT_INIT_HITS,info->buff))
+ DBUG_RETURN(-1);
+
+ /* inserting the rest of key values */
+ end= (uchar*) dynamic_array_ptr(da, da->elements);
+ for (key_ptr+=length; key_ptr < end; key_ptr+=keyinfo->keylength)
+ if(_ma_ck_real_write_btree(info, keyinfo, key_ptr, 0, &root, SEARCH_SAME))
+ DBUG_RETURN(-1);
+
+ /* now, writing the word key entry */
+ ft_intXstore(key+key_length, - (int) da->elements);
+ _ma_dpointer(info, key+key_length+HA_FT_WLEN, root);
+
+ DBUG_RETURN(_ma_ck_real_write_btree(info,
+ info->s->keyinfo+keynr,
+ key, 0,
+ &info->s->state.key_root[keynr],
+ SEARCH_SAME));
+}
diff --git a/storage/maria/ma_ftdefs.h b/storage/maria/ma_ftdefs.h
new file mode 100644
index 00000000000..5a7357e451c
--- /dev/null
+++ b/storage/maria/ma_ftdefs.h
@@ -0,0 +1,152 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/* some definitions for full-text indices */
+
+#include "ma_fulltext.h"
+#include <m_ctype.h>
+#include <my_tree.h>
+#include <queues.h>
+#include <mysql/plugin.h>
+
+#define true_word_char(ctype, character) \
+ ((ctype) & (_MY_U | _MY_L | _MY_NMR) || \
+ (character) == '_')
+#define misc_word_char(X) 0
+
+#define FT_MAX_WORD_LEN_FOR_SORT 31
+
+#define FTPARSER_MEMROOT_ALLOC_SIZE 65536
+
+#define COMPILE_STOPWORDS_IN
+
+/* Interested readers may consult SMART
+ (ftp://ftp.cs.cornell.edu/pub/smart/smart.11.0.tar.Z)
+ for an excellent implementation of vector space model we use.
+ It also demonstrate the usage of different weghting techniques.
+ This code, though, is completely original and is not based on the
+ SMART code but was in some cases inspired by it.
+
+ NORM_PIVOT was taken from the article
+ A.Singhal, C.Buckley, M.Mitra, "Pivoted Document Length Normalization",
+ ACM SIGIR'96, 21-29, 1996
+ */
+
+#define LWS_FOR_QUERY LWS_TF
+#define LWS_IN_USE LWS_LOG
+#define PRENORM_IN_USE PRENORM_AVG
+#define NORM_IN_USE NORM_PIVOT
+#define GWS_IN_USE GWS_PROB
+/*==============================================================*/
+#define LWS_TF (count)
+#define LWS_BINARY (count>0)
+#define LWS_SQUARE (count*count)
+#define LWS_LOG (count?(log( (double) count)+1):0)
+/*--------------------------------------------------------------*/
+#define PRENORM_NONE (p->weight)
+#define PRENORM_MAX (p->weight/docstat.max)
+#define PRENORM_AUG (0.4+0.6*p->weight/docstat.max)
+#define PRENORM_AVG (p->weight/docstat.sum*docstat.uniq)
+#define PRENORM_AVGLOG ((1+log(p->weight))/(1+log(docstat.sum/docstat.uniq)))
+/*--------------------------------------------------------------*/
+#define NORM_NONE (1)
+#define NORM_SUM (docstat.nsum)
+#define NORM_COS (sqrt(docstat.nsum2))
+
+#define PIVOT_VAL (0.0115)
+#define NORM_PIVOT (1+PIVOT_VAL*docstat.uniq)
+/*---------------------------------------------------------------*/
+#define GWS_NORM (1/sqrt(sum2))
+#define GWS_GFIDF (sum/doc_cnt)
+/* Mysterious, but w/o (double) GWS_IDF performs better :-o */
+#define GWS_IDF log(aio->info->state->records/doc_cnt)
+#define GWS_IDF1 log((double)aio->info->state->records/doc_cnt)
+#define GWS_PROB ((aio->info->state->records > doc_cnt) ? log(((double)(aio->info->state->records-doc_cnt))/doc_cnt) : 0 )
+#define GWS_FREQ (1.0/doc_cnt)
+#define GWS_SQUARED pow(log((double)aio->info->state->records/doc_cnt),2)
+#define GWS_CUBIC pow(log((double)aio->info->state->records/doc_cnt),3)
+#define GWS_ENTROPY (1-(suml/sum-log(sum))/log(aio->info->state->records))
+/*=================================================================*/
+
+/* Boolean search operators */
+#define FTB_YES (ft_boolean_syntax[0])
+#define FTB_EGAL (ft_boolean_syntax[1])
+#define FTB_NO (ft_boolean_syntax[2])
+#define FTB_INC (ft_boolean_syntax[3])
+#define FTB_DEC (ft_boolean_syntax[4])
+#define FTB_LBR (ft_boolean_syntax[5])
+#define FTB_RBR (ft_boolean_syntax[6])
+#define FTB_NEG (ft_boolean_syntax[7])
+#define FTB_TRUNC (ft_boolean_syntax[8])
+#define FTB_LQUOT (ft_boolean_syntax[10])
+#define FTB_RQUOT (ft_boolean_syntax[11])
+
+typedef struct st_maria_ft_word {
+ uchar * pos;
+ uint len;
+ double weight;
+} FT_WORD;
+
+int is_stopword(char *word, uint len);
+
+uint _ma_ft_make_key(MARIA_HA *, uint , uchar *, FT_WORD *, my_off_t);
+
+uchar maria_ft_get_word(CHARSET_INFO *, uchar **, uchar *, FT_WORD *,
+ MYSQL_FTPARSER_BOOLEAN_INFO *);
+uchar maria_ft_simple_get_word(CHARSET_INFO *, uchar **, const uchar *,
+ FT_WORD *, my_bool);
+
+typedef struct _st_maria_ft_seg_iterator {
+ uint num, len;
+ HA_KEYSEG *seg;
+ const uchar *rec, *pos;
+} FT_SEG_ITERATOR;
+
+void _ma_ft_segiterator_init(MARIA_HA *, uint, const uchar *, FT_SEG_ITERATOR *);
+void _ma_ft_segiterator_dummy_init(const uchar *, uint, FT_SEG_ITERATOR *);
+uint _ma_ft_segiterator(FT_SEG_ITERATOR *);
+
+void maria_ft_parse_init(TREE *, CHARSET_INFO *);
+int maria_ft_parse(TREE *, uchar *, int, struct st_mysql_ftparser *parser,
+ MYSQL_FTPARSER_PARAM *, MEM_ROOT *);
+FT_WORD * maria_ft_linearize(TREE *, MEM_ROOT *);
+FT_WORD * _ma_ft_parserecord(MARIA_HA *, uint, const uchar *, MEM_ROOT *);
+uint _ma_ft_parse(TREE *, MARIA_HA *, uint, const uchar *,
+ MYSQL_FTPARSER_PARAM *, MEM_ROOT *);
+
+FT_INFO *maria_ft_init_nlq_search(MARIA_HA *, uint, uchar *, uint, uint, uchar *);
+FT_INFO *maria_ft_init_boolean_search(MARIA_HA *, uint, uchar *, uint, CHARSET_INFO *);
+
+extern const struct _ft_vft _ma_ft_vft_nlq;
+int maria_ft_nlq_read_next(FT_INFO *, char *);
+float maria_ft_nlq_find_relevance(FT_INFO *, uchar *, uint);
+void maria_ft_nlq_close_search(FT_INFO *);
+float maria_ft_nlq_get_relevance(FT_INFO *);
+my_off_t maria_ft_nlq_get_docid(FT_INFO *);
+void maria_ft_nlq_reinit_search(FT_INFO *);
+
+extern const struct _ft_vft _ma_ft_vft_boolean;
+int maria_ft_boolean_read_next(FT_INFO *, char *);
+float maria_ft_boolean_find_relevance(FT_INFO *, uchar *, uint);
+void maria_ft_boolean_close_search(FT_INFO *);
+float maria_ft_boolean_get_relevance(FT_INFO *);
+my_off_t maria_ft_boolean_get_docid(FT_INFO *);
+void maria_ft_boolean_reinit_search(FT_INFO *);
+extern MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info,
+ uint keynr,
+ uint paramnr);
+extern void maria_ftparser_call_deinitializer(MARIA_HA *info);
diff --git a/storage/maria/ma_fulltext.h b/storage/maria/ma_fulltext.h
new file mode 100644
index 00000000000..dc6cf9d1204
--- /dev/null
+++ b/storage/maria/ma_fulltext.h
@@ -0,0 +1,27 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/* some definitions for full-text indices */
+
+#include "maria_def.h"
+#include "ft_global.h"
+
+int _ma_ft_cmp(MARIA_HA *, uint, const uchar *, const uchar *);
+int _ma_ft_add(MARIA_HA *, uint, uchar *, const uchar *, my_off_t);
+int _ma_ft_del(MARIA_HA *, uint, uchar *, const uchar *, my_off_t);
+
+uint _ma_ft_convert_to_ft2(MARIA_HA *, uint, uchar *);
diff --git a/storage/maria/ma_info.c b/storage/maria/ma_info.c
new file mode 100644
index 00000000000..4aecc33f816
--- /dev/null
+++ b/storage/maria/ma_info.c
@@ -0,0 +1,141 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Return useful base information for an open table */
+
+#include "maria_def.h"
+#ifdef __WIN__
+#include <sys/stat.h>
+#endif
+
+ /* Get position to last record */
+
+MARIA_RECORD_POS maria_position(MARIA_HA *info)
+{
+ return info->cur_row.lastpos;
+}
+
+
+/* Get information about the table */
+/* if flag == 2 one get current info (no sync from database */
+
+int maria_status(MARIA_HA *info, register MARIA_INFO *x, uint flag)
+{
+ MY_STAT state;
+ MARIA_SHARE *share=info->s;
+ DBUG_ENTER("maria_status");
+
+ x->recpos= info->cur_row.lastpos;
+ if (flag == HA_STATUS_POS)
+ DBUG_RETURN(0); /* Compatible with ISAM */
+ if (!(flag & HA_STATUS_NO_LOCK))
+ {
+ pthread_mutex_lock(&share->intern_lock);
+ VOID(_ma_readinfo(info,F_RDLCK,0));
+ fast_ma_writeinfo(info);
+ pthread_mutex_unlock(&share->intern_lock);
+ }
+ if (flag & HA_STATUS_VARIABLE)
+ {
+ x->records = info->state->records;
+ x->deleted = info->state->del;
+ x->delete_length = info->state->empty;
+ x->data_file_length =info->state->data_file_length;
+ x->index_file_length=info->state->key_file_length;
+
+ x->keys = share->state.header.keys;
+ x->check_time = share->state.check_time;
+ x->mean_reclength = x->records ?
+ (ulong) ((x->data_file_length - x->delete_length) /x ->records) :
+ (ulong) share->min_pack_length;
+ }
+ if (flag & HA_STATUS_ERRKEY)
+ {
+ x->errkey= info->errkey;
+ x->dup_key_pos= info->dup_key_pos;
+ }
+ if (flag & HA_STATUS_CONST)
+ {
+ x->reclength = share->base.reclength;
+ x->max_data_file_length=share->base.max_data_file_length;
+ x->max_index_file_length=info->s->base.max_key_file_length;
+ x->filenr = info->dfile.file;
+ x->options = share->options;
+ x->create_time=share->state.create_time;
+ x->reflength= maria_get_pointer_length(share->base.max_data_file_length,
+ maria_data_pointer_size);
+ x->record_offset= (info->s->data_file_type == STATIC_RECORD ?
+ share->base.pack_reclength: 0);
+ x->sortkey= -1; /* No clustering */
+ x->rec_per_key = share->state.rec_per_key_part;
+ x->key_map = share->state.key_map;
+ x->data_file_name = share->data_file_name;
+ x->index_file_name = share->index_file_name;
+ x->data_file_type = share->data_file_type;
+ }
+ if ((flag & HA_STATUS_TIME) && !my_fstat(info->dfile.file, &state, MYF(0)))
+ x->update_time=state.st_mtime;
+ else
+ x->update_time=0;
+ if (flag & HA_STATUS_AUTO)
+ {
+ x->auto_increment= share->state.auto_increment+1;
+ if (!x->auto_increment) /* This shouldn't happen */
+ x->auto_increment= ~(ulonglong) 0;
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Write a message to the error log.
+
+ SYNOPSIS
+ _ma_report_error()
+ file_name Name of table file (e.g. index_file_name).
+ errcode Error number.
+
+ DESCRIPTION
+ This function supplies my_error() with a table name. Most error
+ messages need one. Since string arguments in error messages are limited
+ to 64 characters by convention, we ensure that in case of truncation,
+ that the end of the index file path is in the message. This contains
+ the most valuable information (the table name and the database name).
+
+ RETURN
+ void
+*/
+
+void _ma_report_error(int errcode, const char *file_name)
+{
+ uint length;
+ DBUG_ENTER("_ma_report_error");
+ DBUG_PRINT("enter",("errcode %d, table '%s'", errcode, file_name));
+
+ if ((length= strlen(file_name)) > 64)
+ {
+ /* we first remove the directory */
+ uint dir_length= dirname_length(file_name);
+ file_name+= dir_length;
+ if ((length-= dir_length) > 64)
+ {
+ /* still too long, chop start of table name */
+ file_name+= length - 64;
+ }
+ }
+
+ my_error(errcode, MYF(ME_NOREFRESH), file_name);
+ DBUG_VOID_RETURN;
+}
diff --git a/storage/maria/ma_init.c b/storage/maria/ma_init.c
new file mode 100644
index 00000000000..fb8efddd778
--- /dev/null
+++ b/storage/maria/ma_init.c
@@ -0,0 +1,67 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Initialize an maria-database */
+
+#include "maria_def.h"
+#include <ft_global.h>
+#include "ma_blockrec.h"
+#include "trnman_public.h"
+#include "ma_checkpoint.h"
+
+my_bool maria_inited= FALSE;
+pthread_mutex_t THR_LOCK_maria;
+
+/*
+ Initialize maria
+
+ SYNOPSIS
+ maria_init()
+
+ TODO
+ Open log files and do recovery if need
+
+ RETURN
+ 0 ok
+ # error number
+*/
+
+int maria_init(void)
+{
+ if (!maria_inited)
+ {
+ maria_inited= TRUE;
+ pthread_mutex_init(&THR_LOCK_maria,MY_MUTEX_INIT_SLOW);
+ _ma_init_block_record_data();
+ my_handler_error_register();
+ }
+ return 0;
+}
+
+
+void maria_end(void)
+{
+ if (maria_inited)
+ {
+ maria_inited= maria_multi_threaded= FALSE;
+ ft_free_stopwords();
+ ma_checkpoint_end();
+ trnman_destroy();
+ translog_destroy();
+ end_pagecache(maria_log_pagecache, TRUE);
+ ma_control_file_end();
+ pthread_mutex_destroy(&THR_LOCK_maria);
+ }
+}
diff --git a/storage/maria/ma_key.c b/storage/maria/ma_key.c
new file mode 100644
index 00000000000..96b8d2af0eb
--- /dev/null
+++ b/storage/maria/ma_key.c
@@ -0,0 +1,569 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Functions to handle keys */
+
+#include "maria_def.h"
+#include "m_ctype.h"
+#include "ma_sp_defs.h"
+#ifdef HAVE_IEEEFP_H
+#include <ieeefp.h>
+#endif
+
+#define CHECK_KEYS /* Enable safety checks */
+
+#define FIX_LENGTH(cs, pos, length, char_length) \
+ do { \
+ if (length > char_length) \
+ char_length= my_charpos(cs, pos, pos+length, char_length); \
+ set_if_smaller(char_length,length); \
+ } while(0)
+
+static int _ma_put_key_in_record(MARIA_HA *info,uint keynr,uchar *record);
+
+/*
+ Make a intern key from a record
+
+ SYNOPSIS
+ _ma_make_key()
+ info MyiSAM handler
+ keynr key number
+ key Store created key here
+ record Record
+ filepos Position to record in the data file
+
+ RETURN
+ Length of key
+*/
+
+uint _ma_make_key(register MARIA_HA *info, uint keynr, uchar *key,
+ const uchar *record, MARIA_RECORD_POS filepos)
+{
+ const uchar *pos;
+ uchar *start;
+ reg1 HA_KEYSEG *keyseg;
+ my_bool is_ft= info->s->keyinfo[keynr].flag & HA_FULLTEXT;
+ DBUG_ENTER("_ma_make_key");
+
+ if (info->s->keyinfo[keynr].flag & HA_SPATIAL)
+ {
+ /*
+ TODO: nulls processing
+ */
+#ifdef HAVE_SPATIAL
+ DBUG_RETURN(_ma_sp_make_key(info,keynr, key,record,filepos));
+#else
+ DBUG_ASSERT(0); /* maria_open should check that this never happens*/
+#endif
+ }
+
+ start=key;
+ for (keyseg=info->s->keyinfo[keynr].seg ; keyseg->type ;keyseg++)
+ {
+ enum ha_base_keytype type=(enum ha_base_keytype) keyseg->type;
+ uint length=keyseg->length;
+ uint char_length;
+ CHARSET_INFO *cs=keyseg->charset;
+
+ if (keyseg->null_bit)
+ {
+ if (record[keyseg->null_pos] & keyseg->null_bit)
+ {
+ *key++= 0; /* NULL in key */
+ continue;
+ }
+ *key++=1; /* Not NULL */
+ }
+
+ char_length= ((!is_ft && cs && cs->mbmaxlen > 1) ? length/cs->mbmaxlen :
+ length);
+
+ pos= record+keyseg->start;
+ if (type == HA_KEYTYPE_BIT)
+ {
+ if (keyseg->bit_length)
+ {
+ uchar bits= get_rec_bits((uchar*) record + keyseg->bit_pos,
+ keyseg->bit_start, keyseg->bit_length);
+ *key++= (char) bits;
+ length--;
+ }
+ memcpy(key, pos, length);
+ key+= length;
+ continue;
+ }
+ if (keyseg->flag & HA_SPACE_PACK)
+ {
+ if (type != HA_KEYTYPE_NUM)
+ {
+ length= cs->cset->lengthsp(cs, pos, length);
+ }
+ else
+ {
+ const uchar *end= pos + length;
+ while (pos < end && pos[0] == ' ')
+ pos++;
+ length= (uint) (end-pos);
+ }
+ FIX_LENGTH(cs, pos, length, char_length);
+ store_key_length_inc(key,char_length);
+ memcpy(key, pos, (size_t) char_length);
+ key+=char_length;
+ continue;
+ }
+ if (keyseg->flag & HA_VAR_LENGTH_PART)
+ {
+ uint pack_length= (keyseg->bit_start == 1 ? 1 : 2);
+ uint tmp_length= (pack_length == 1 ? (uint) *(uchar*) pos :
+ uint2korr(pos));
+ pos+= pack_length; /* Skip VARCHAR length */
+ set_if_smaller(length,tmp_length);
+ FIX_LENGTH(cs, pos, length, char_length);
+ store_key_length_inc(key,char_length);
+ memcpy(key,pos,(size_t) char_length);
+ key+= char_length;
+ continue;
+ }
+ else if (keyseg->flag & HA_BLOB_PART)
+ {
+ uint tmp_length= _ma_calc_blob_length(keyseg->bit_start,pos);
+ memcpy_fixed(&pos,pos+keyseg->bit_start,sizeof(char*));
+ set_if_smaller(length,tmp_length);
+ FIX_LENGTH(cs, pos, length, char_length);
+ store_key_length_inc(key,char_length);
+ memcpy(key,pos,(size_t) char_length);
+ key+= char_length;
+ continue;
+ }
+ else if (keyseg->flag & HA_SWAP_KEY)
+ { /* Numerical column */
+#ifdef HAVE_ISNAN
+ if (type == HA_KEYTYPE_FLOAT)
+ {
+ float nr;
+ float4get(nr,pos);
+ if (isnan(nr))
+ {
+ /* Replace NAN with zero */
+ bzero(key,length);
+ key+=length;
+ continue;
+ }
+ }
+ else if (type == HA_KEYTYPE_DOUBLE)
+ {
+ double nr;
+ float8get(nr,pos);
+ if (isnan(nr))
+ {
+ bzero(key,length);
+ key+=length;
+ continue;
+ }
+ }
+#endif
+ pos+=length;
+ while (length--)
+ {
+ *key++ = *--pos;
+ }
+ continue;
+ }
+ FIX_LENGTH(cs, pos, length, char_length);
+ memcpy(key, pos, char_length);
+ if (length > char_length)
+ cs->cset->fill(cs, (char*) key+char_length, length-char_length, ' ');
+ key+= length;
+ }
+ _ma_dpointer(info,key,filepos);
+ DBUG_PRINT("exit",("keynr: %d",keynr));
+ DBUG_DUMP("key",start,(uint) (key-start)+keyseg->length);
+ DBUG_EXECUTE("key",
+ _ma_print_key(DBUG_FILE,info->s->keyinfo[keynr].seg,start,
+ (uint) (key-start)););
+ DBUG_RETURN((uint) (key-start)); /* Return keylength */
+} /* _ma_make_key */
+
+
+/*
+ Pack a key to intern format from given format (c_rkey)
+
+ SYNOPSIS
+ _ma_pack_key()
+ info MARIA handler
+ uint keynr key number
+ key Store packed key here
+ old Not packed key
+ keypart_map bitmap of used keyparts
+ last_used_keyseg out parameter. May be NULL
+
+ RETURN
+ length of packed key
+
+ last_use_keyseg Store pointer to the keyseg after the last used one
+*/
+
+uint _ma_pack_key(register MARIA_HA *info, uint keynr, uchar *key,
+ const uchar *old, key_part_map keypart_map,
+ HA_KEYSEG **last_used_keyseg)
+{
+ uchar *start_key=key;
+ HA_KEYSEG *keyseg;
+ my_bool is_ft= info->s->keyinfo[keynr].flag & HA_FULLTEXT;
+ DBUG_ENTER("_ma_pack_key");
+
+ /* "one part" rtree key is 2*SPDIMS part key in Maria */
+ if (info->s->keyinfo[keynr].key_alg == HA_KEY_ALG_RTREE)
+ keypart_map= (((key_part_map)1) << (2*SPDIMS)) - 1;
+
+ /* only key prefixes are supported */
+ DBUG_ASSERT(((keypart_map+1) & keypart_map) == 0);
+
+ for (keyseg=info->s->keyinfo[keynr].seg ; keyseg->type && keypart_map;
+ old+= keyseg->length, keyseg++)
+ {
+ enum ha_base_keytype type= (enum ha_base_keytype) keyseg->type;
+ uint length= keyseg->length;
+ uint char_length;
+ const uchar *pos;
+ CHARSET_INFO *cs=keyseg->charset;
+
+ keypart_map>>= 1;
+ if (keyseg->null_bit)
+ {
+ if (!(*key++= (char) 1-*old++)) /* Copy null marker */
+ {
+ if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART))
+ old+= 2;
+ continue; /* Found NULL */
+ }
+ }
+ char_length= ((!is_ft && cs && cs->mbmaxlen > 1) ? length/cs->mbmaxlen :
+ length);
+ pos= old;
+ if (keyseg->flag & HA_SPACE_PACK)
+ {
+ const uchar *end= pos + length;
+ if (type == HA_KEYTYPE_NUM)
+ {
+ while (pos < end && pos[0] == ' ')
+ pos++;
+ }
+ else if (type != HA_KEYTYPE_BINARY)
+ {
+ while (end > pos && end[-1] == ' ')
+ end--;
+ }
+ length=(uint) (end-pos);
+ FIX_LENGTH(cs, pos, length, char_length);
+ store_key_length_inc(key,char_length);
+ memcpy(key,pos,(size_t) char_length);
+ key+= char_length;
+ continue;
+ }
+ else if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART))
+ {
+ /* Length of key-part used with maria_rkey() always 2 */
+ uint tmp_length=uint2korr(pos);
+ pos+=2;
+ set_if_smaller(length,tmp_length); /* Safety */
+ FIX_LENGTH(cs, pos, length, char_length);
+ store_key_length_inc(key,char_length);
+ old+=2; /* Skip length */
+ memcpy(key, pos,(size_t) char_length);
+ key+= char_length;
+ continue;
+ }
+ else if (keyseg->flag & HA_SWAP_KEY)
+ { /* Numerical column */
+ pos+=length;
+ while (length--)
+ *key++ = *--pos;
+ continue;
+ }
+ FIX_LENGTH(cs, pos, length, char_length);
+ memcpy(key, pos, char_length);
+ if (length > char_length)
+ cs->cset->fill(cs, (char*) key+char_length, length-char_length, ' ');
+ key+= length;
+ }
+ if (last_used_keyseg)
+ *last_used_keyseg= keyseg;
+
+ DBUG_PRINT("exit", ("length: %u", (uint) (key-start_key)));
+ DBUG_RETURN((uint) (key-start_key));
+} /* _ma_pack_key */
+
+
+
+/*
+ Store found key in record
+
+ SYNOPSIS
+ _ma_put_key_in_record()
+ info MARIA handler
+ keynr Key number that was used
+ record Store key here
+
+ Last read key is in info->lastkey
+
+ NOTES
+ Used when only-keyread is wanted
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static int _ma_put_key_in_record(register MARIA_HA *info, uint keynr,
+ uchar *record)
+{
+ reg2 uchar *key;
+ uchar *pos,*key_end;
+ reg1 HA_KEYSEG *keyseg;
+ uchar *blob_ptr;
+ DBUG_ENTER("_ma_put_key_in_record");
+
+ blob_ptr= info->lastkey2; /* Place to put blob parts */
+ key=info->lastkey; /* KEy that was read */
+ key_end=key+info->lastkey_length;
+ for (keyseg=info->s->keyinfo[keynr].seg ; keyseg->type ;keyseg++)
+ {
+ if (keyseg->null_bit)
+ {
+ if (!*key++)
+ {
+ record[keyseg->null_pos]|= keyseg->null_bit;
+ continue;
+ }
+ record[keyseg->null_pos]&= ~keyseg->null_bit;
+ }
+ if (keyseg->type == HA_KEYTYPE_BIT)
+ {
+ uint length= keyseg->length;
+
+ if (keyseg->bit_length)
+ {
+ uchar bits= *key++;
+ set_rec_bits(bits, record + keyseg->bit_pos, keyseg->bit_start,
+ keyseg->bit_length);
+ length--;
+ }
+ else
+ {
+ clr_rec_bits(record + keyseg->bit_pos, keyseg->bit_start,
+ keyseg->bit_length);
+ }
+ memcpy(record + keyseg->start, key, length);
+ key+= length;
+ continue;
+ }
+ if (keyseg->flag & HA_SPACE_PACK)
+ {
+ uint length;
+ get_key_length(length,key);
+#ifdef CHECK_KEYS
+ if (length > keyseg->length || key+length > key_end)
+ goto err;
+#endif
+ pos= record+keyseg->start;
+ if (keyseg->type != (int) HA_KEYTYPE_NUM)
+ {
+ memcpy(pos,key,(size_t) length);
+ keyseg->charset->cset->fill(keyseg->charset,
+ pos + length, keyseg->length - length,
+ ' ');
+ }
+ else
+ {
+ bfill(pos,keyseg->length-length,' ');
+ memcpy(pos+keyseg->length-length,key,(size_t) length);
+ }
+ key+=length;
+ continue;
+ }
+
+ if (keyseg->flag & HA_VAR_LENGTH_PART)
+ {
+ uint length;
+ get_key_length(length,key);
+#ifdef CHECK_KEYS
+ if (length > keyseg->length || key+length > key_end)
+ goto err;
+#endif
+ /* Store key length */
+ if (keyseg->bit_start == 1)
+ *(uchar*) (record+keyseg->start)= (uchar) length;
+ else
+ int2store(record+keyseg->start, length);
+ /* And key data */
+ memcpy(record+keyseg->start + keyseg->bit_start, key, length);
+ key+= length;
+ }
+ else if (keyseg->flag & HA_BLOB_PART)
+ {
+ uint length;
+ get_key_length(length,key);
+#ifdef CHECK_KEYS
+ if (length > keyseg->length || key+length > key_end)
+ goto err;
+#endif
+ memcpy(record+keyseg->start+keyseg->bit_start,
+ (char*) &blob_ptr,sizeof(char*));
+ memcpy(blob_ptr,key,length);
+ blob_ptr+=length;
+
+ /* The above changed info->lastkey2. Inform maria_rnext_same(). */
+ info->update&= ~HA_STATE_RNEXT_SAME;
+
+ _ma_store_blob_length(record+keyseg->start,
+ (uint) keyseg->bit_start,length);
+ key+=length;
+ }
+ else if (keyseg->flag & HA_SWAP_KEY)
+ {
+ uchar *to= record+keyseg->start+keyseg->length;
+ uchar *end= key+keyseg->length;
+#ifdef CHECK_KEYS
+ if (end > key_end)
+ goto err;
+#endif
+ do
+ {
+ *--to= *key++;
+ } while (key != end);
+ continue;
+ }
+ else
+ {
+#ifdef CHECK_KEYS
+ if (key+keyseg->length > key_end)
+ goto err;
+#endif
+ memcpy(record+keyseg->start, key, (size_t) keyseg->length);
+ key+= keyseg->length;
+ }
+ }
+ DBUG_RETURN(0);
+
+err:
+ DBUG_RETURN(1); /* Crashed row */
+} /* _ma_put_key_in_record */
+
+
+ /* Here when key reads are used */
+
+int _ma_read_key_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos)
+{
+ fast_ma_writeinfo(info);
+ if (filepos != HA_OFFSET_ERROR)
+ {
+ if (info->lastinx >= 0)
+ { /* Read only key */
+ if (_ma_put_key_in_record(info,(uint) info->lastinx,buf))
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ return -1;
+ }
+ info->update|= HA_STATE_AKTIV; /* We should find a record */
+ return 0;
+ }
+ my_errno=HA_ERR_WRONG_INDEX;
+ }
+ return(-1); /* Wrong data to read */
+}
+
+
+/*
+ Retrieve auto_increment info
+
+ SYNOPSIS
+ retrieve_auto_increment()
+ info Maria handler
+ record Row to update
+
+ IMPLEMENTATION
+ For signed columns we don't retrieve the auto increment value if it's
+ less than zero.
+*/
+
+ulonglong ma_retrieve_auto_increment(MARIA_HA *info,const uchar *record)
+{
+ ulonglong value= 0; /* Store unsigned values here */
+ longlong s_value= 0; /* Store signed values here */
+ HA_KEYSEG *keyseg= info->s->keyinfo[info->s->base.auto_key-1].seg;
+ const uchar *key= record + keyseg->start;
+
+ switch (keyseg->type) {
+ case HA_KEYTYPE_INT8:
+ s_value= (longlong) *(char*)key;
+ break;
+ case HA_KEYTYPE_BINARY:
+ value=(ulonglong) *(uchar*) key;
+ break;
+ case HA_KEYTYPE_SHORT_INT:
+ s_value= (longlong) sint2korr(key);
+ break;
+ case HA_KEYTYPE_USHORT_INT:
+ value=(ulonglong) uint2korr(key);
+ break;
+ case HA_KEYTYPE_LONG_INT:
+ s_value= (longlong) sint4korr(key);
+ break;
+ case HA_KEYTYPE_ULONG_INT:
+ value=(ulonglong) uint4korr(key);
+ break;
+ case HA_KEYTYPE_INT24:
+ s_value= (longlong) sint3korr(key);
+ break;
+ case HA_KEYTYPE_UINT24:
+ value=(ulonglong) uint3korr(key);
+ break;
+ case HA_KEYTYPE_FLOAT: /* This shouldn't be used */
+ {
+ float f_1;
+ float4get(f_1,key);
+ /* Ignore negative values */
+ value = (f_1 < (float) 0.0) ? 0 : (ulonglong) f_1;
+ break;
+ }
+ case HA_KEYTYPE_DOUBLE: /* This shouldn't be used */
+ {
+ double f_1;
+ float8get(f_1,key);
+ /* Ignore negative values */
+ value = (f_1 < 0.0) ? 0 : (ulonglong) f_1;
+ break;
+ }
+ case HA_KEYTYPE_LONGLONG:
+ s_value= sint8korr(key);
+ break;
+ case HA_KEYTYPE_ULONGLONG:
+ value= uint8korr(key);
+ break;
+ default:
+ DBUG_ASSERT(0);
+ value=0; /* Error */
+ break;
+ }
+
+ /*
+ The following code works becasue if s_value < 0 then value is 0
+ and if s_value == 0 then value will contain either s_value or the
+ correct value.
+ */
+ return (s_value > 0) ? (ulonglong) s_value : value;
+}
diff --git a/storage/maria/ma_keycache.c b/storage/maria/ma_keycache.c
new file mode 100644
index 00000000000..7a2a56488e6
--- /dev/null
+++ b/storage/maria/ma_keycache.c
@@ -0,0 +1,163 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Key cache assignments
+*/
+
+#include "maria_def.h"
+
+/*
+ Assign pages of the index file for a table to a key cache
+
+ SYNOPSIS
+ maria_assign_to_pagecache()
+ info open table
+ key_map map of indexes to assign to the key cache
+ pagecache_ptr pointer to the key cache handle
+ assign_lock Mutex to lock during assignment
+
+ PREREQUESTS
+ One must have a READ lock or a WRITE lock on the table when calling
+ the function to ensure that there is no other writers to it.
+
+ The caller must also ensure that one doesn't call this function from
+ two different threads with the same table.
+
+ NOTES
+ At present pages for all indexes must be assigned to the same key cache.
+ In future only pages for indexes specified in the key_map parameter
+ of the table will be assigned to the specified key cache.
+
+ RETURN VALUE
+ 0 If a success
+ # Error code
+*/
+
+int maria_assign_to_pagecache(MARIA_HA *info,
+ ulonglong key_map __attribute__((unused)),
+ PAGECACHE *pagecache)
+{
+ int error= 0;
+ MARIA_SHARE* share= info->s;
+ DBUG_ENTER("maria_assign_to_pagecache");
+ DBUG_PRINT("enter",
+ ("old_pagecache_handle: 0x%lx new_pagecache_handle: 0x%lx",
+ (long) share->pagecache, (long) pagecache));
+
+ /*
+ Skip operation if we didn't change key cache. This can happen if we
+ call this for all open instances of the same table
+ */
+ if (share->pagecache == pagecache)
+ DBUG_RETURN(0);
+
+ /*
+ First flush all blocks for the table in the old key cache.
+ This is to ensure that the disk is consistent with the data pages
+ in memory (which may not be the case if the table uses delayed_key_write)
+
+ Note that some other read thread may still fill in the key cache with
+ new blocks during this call and after, but this doesn't matter as
+ all threads will start using the new key cache for their next call to
+ maria library and we know that there will not be any changed blocks
+ in the old key cache.
+ */
+
+ if (flush_pagecache_blocks(share->pagecache, &share->kfile, FLUSH_RELEASE))
+ {
+ error= my_errno;
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ maria_mark_crashed(info); /* Mark that table must be checked */
+ }
+
+ /*
+ Flush the new key cache for this file. This is needed to ensure
+ that there is no old blocks (with outdated data) left in the new key
+ cache from an earlier assign_to_keycache operation
+
+ (This can never fail as there is never any not written data in the
+ new key cache)
+ */
+ (void) flush_pagecache_blocks(pagecache, &share->kfile, FLUSH_RELEASE);
+
+ /*
+ ensure that setting the key cache and changing the multi_pagecache
+ is done atomicly
+ */
+ pthread_mutex_lock(&share->intern_lock);
+ /*
+ Tell all threads to use the new key cache
+ This should be seen at the lastes for the next call to an maria function.
+ */
+ share->pagecache= pagecache;
+
+ /* store the key cache in the global hash structure for future opens */
+ if (multi_pagecache_set(share->unique_file_name, share->unique_name_length,
+ share->pagecache))
+ error= my_errno;
+ pthread_mutex_unlock(&share->intern_lock);
+ DBUG_RETURN(error);
+}
+
+
+/*
+ Change all MARIA entries that uses one key cache to another key cache
+
+ SYNOPSIS
+ maria_change_pagecache()
+ old_pagecache Old key cache
+ new_pagecache New key cache
+
+ NOTES
+ This is used when we delete one key cache.
+
+ To handle the case where some other threads tries to open an MARIA
+ table associated with the to-be-deleted key cache while this operation
+ is running, we have to call 'multi_pagecache_change()' from this
+ function while we have a lock on the MARIA table list structure.
+
+ This is safe as long as it's only MARIA that is using this specific
+ key cache.
+*/
+
+
+void maria_change_pagecache(PAGECACHE *old_pagecache,
+ PAGECACHE *new_pagecache)
+{
+ LIST *pos;
+ DBUG_ENTER("maria_change_pagecache");
+
+ /*
+ Lock list to ensure that no one can close the table while we manipulate it
+ */
+ pthread_mutex_lock(&THR_LOCK_maria);
+ for (pos=maria_open_list ; pos ; pos=pos->next)
+ {
+ MARIA_HA *info= (MARIA_HA*) pos->data;
+ MARIA_SHARE *share= info->s;
+ if (share->pagecache == old_pagecache)
+ maria_assign_to_pagecache(info, (ulonglong) ~0, new_pagecache);
+ }
+
+ /*
+ We have to do the following call while we have the lock on the
+ MARIA list structure to ensure that another thread is not trying to
+ open a new table that will be associted with the old key cache
+ */
+ multi_pagecache_change(old_pagecache, new_pagecache);
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ DBUG_VOID_RETURN;
+}
diff --git a/storage/maria/ma_locking.c b/storage/maria/ma_locking.c
new file mode 100644
index 00000000000..01d59ed56df
--- /dev/null
+++ b/storage/maria/ma_locking.c
@@ -0,0 +1,570 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ locking of isam-tables.
+ reads info from a isam-table. Must be first request before doing any furter
+ calls to any isamfunktion. Is used to allow many process use the same
+ isamdatabase.
+*/
+
+#include "ma_ftdefs.h"
+
+ /* lock table by F_UNLCK, F_RDLCK or F_WRLCK */
+
+int maria_lock_database(MARIA_HA *info, int lock_type)
+{
+ int error;
+ uint count;
+ MARIA_SHARE *share=info->s;
+ DBUG_ENTER("maria_lock_database");
+ DBUG_PRINT("enter",("lock_type: %d old lock %d r_locks: %u w_locks: %u "
+ "global_changed: %d open_count: %u name: '%s'",
+ lock_type, info->lock_type, share->r_locks,
+ share->w_locks,
+ share->global_changed, share->state.open_count,
+ share->index_file_name));
+ if (share->options & HA_OPTION_READ_ONLY_DATA ||
+ info->lock_type == lock_type)
+ DBUG_RETURN(0);
+ if (lock_type == F_EXTRA_LCK) /* Used by TMP tables */
+ {
+ ++share->w_locks;
+ ++share->tot_locks;
+ info->lock_type= lock_type;
+ DBUG_RETURN(0);
+ }
+
+ error=0;
+ pthread_mutex_lock(&share->intern_lock);
+ if (share->kfile.file >= 0) /* May only be false on windows */
+ {
+ switch (lock_type) {
+ case F_UNLCK:
+ maria_ftparser_call_deinitializer(info);
+ if (info->lock_type == F_RDLCK)
+ {
+ count= --share->r_locks;
+ _ma_restore_status(info);
+ }
+ else
+ {
+ count= --share->w_locks;
+ _ma_update_status(info);
+ }
+ --share->tot_locks;
+ if (info->lock_type == F_WRLCK && !share->w_locks)
+ {
+ if (!share->delay_key_write &&
+ flush_pagecache_blocks(share->pagecache, &share->kfile,
+ FLUSH_KEEP))
+ {
+ error= my_errno;
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ /* Mark that table must be checked */
+ maria_mark_crashed(info);
+ }
+ /* pages of transactional tables get flushed at Checkpoint */
+ if (!share->base.born_transactional &&
+ _ma_flush_table_files(info, MARIA_FLUSH_DATA,
+ FLUSH_KEEP, FLUSH_KEEP))
+ error= my_errno;
+ }
+ if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
+ {
+ if (end_io_cache(&info->rec_cache))
+ {
+ error=my_errno;
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ maria_mark_crashed(info);
+ }
+ }
+ if (!count)
+ {
+ DBUG_PRINT("info",("changed: %u w_locks: %u",
+ (uint) share->changed, share->w_locks));
+ if (share->changed && !share->w_locks)
+ {
+#ifdef HAVE_MMAP
+ if ((info->s->mmaped_length !=
+ info->s->state.state.data_file_length) &&
+ (info->s->nonmmaped_inserts > MAX_NONMAPPED_INSERTS))
+ {
+ if (info->s->concurrent_insert)
+ rw_wrlock(&info->s->mmap_lock);
+ _ma_remap_file(info, info->s->state.state.data_file_length);
+ info->s->nonmmaped_inserts= 0;
+ if (info->s->concurrent_insert)
+ rw_unlock(&info->s->mmap_lock);
+ }
+#endif
+ share->state.process= share->last_process=share->this_process;
+ share->state.unique= info->last_unique= info->this_unique;
+ share->state.update_count= info->last_loop= ++info->this_loop;
+ /* transactional tables rather flush their state at Checkpoint */
+ if (!share->base.born_transactional)
+ {
+ if (_ma_state_info_write_sub(share->kfile.file, &share->state, 1))
+ error= my_errno;
+ else
+ {
+ /* A value of 0 means below means "state flushed" */
+ share->changed= 0;
+ }
+ }
+ if (maria_flush)
+ {
+ if (_ma_sync_table_files(info))
+ error= my_errno;
+ }
+ else
+ share->not_flushed=1;
+ if (error)
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ maria_mark_crashed(info);
+ }
+ }
+ }
+ info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+ info->lock_type= F_UNLCK;
+ /* verify that user of the table cleaned up after itself */
+ DBUG_ASSERT(share->now_transactional == share->base.born_transactional);
+ break;
+ case F_RDLCK:
+ if (info->lock_type == F_WRLCK)
+ {
+ /*
+ Change RW to READONLY
+
+ mysqld does not turn write locks to read locks,
+ so we're never here in mysqld.
+ */
+ share->w_locks--;
+ share->r_locks++;
+ info->lock_type=lock_type;
+ break;
+ }
+#ifdef MARIA_EXTERNAL_LOCKING
+ if (!share->r_locks && !share->w_locks)
+ {
+ /* note that a transactional table should not do this */
+ if (_ma_state_info_read_dsk(share->kfile.file, &share->state))
+ {
+ error=my_errno;
+ break;
+ }
+ }
+#endif
+ VOID(_ma_test_if_changed(info));
+ share->r_locks++;
+ share->tot_locks++;
+ info->lock_type=lock_type;
+ break;
+ case F_WRLCK:
+ if (info->lock_type == F_RDLCK)
+ { /* Change READONLY to RW */
+ if (share->r_locks == 1)
+ {
+ share->r_locks--;
+ share->w_locks++;
+ info->lock_type=lock_type;
+ break;
+ }
+ }
+#ifdef MARIA_EXTERNAL_LOCKING
+ if (!(share->options & HA_OPTION_READ_ONLY_DATA))
+ {
+ if (!share->w_locks)
+ {
+ if (!share->r_locks)
+ {
+ /*
+ Note that transactional tables should not do this.
+ If we enabled this code, we should make sure to skip it if
+ born_transactional is true. We should not test
+ now_transactional to decide if we can call
+ _ma_state_info_read_dsk(), because it can temporarily be 0
+ (TRUNCATE on a partitioned table) and thus it would make a state
+ modification below without mutex, confusing a concurrent
+ checkpoint running.
+ Even if this code was enabled only for non-transactional tables:
+ in scenario LOCK TABLE t1 WRITE; INSERT INTO t1; DELETE FROM t1;
+ state on disk read by DELETE is obsolete as it was not flushed
+ at the end of INSERT. MyISAM same. It however causes no issue as
+ maria_delete_all_rows() calls _ma_reset_status() thus is not
+ influenced by the obsolete read values.
+ */
+ if (_ma_state_info_read_dsk(share->kfile.file, &share->state))
+ {
+ error=my_errno;
+ break;
+ }
+ }
+ }
+ }
+#endif /* defined(MARIA_EXTERNAL_LOCKING) */
+ VOID(_ma_test_if_changed(info));
+
+ info->lock_type=lock_type;
+ info->invalidator=info->s->invalidator;
+ share->w_locks++;
+ share->tot_locks++;
+ break;
+ default:
+ DBUG_ASSERT(0);
+ break; /* Impossible */
+ }
+ }
+#ifdef __WIN__
+ else
+ {
+ /*
+ Check for bad file descriptors if this table is part
+ of a merge union. Failing to capture this may cause
+ a crash on windows if the table is renamed and
+ later on referenced by the merge table.
+ */
+ if( info->owned_by_merge && (info->s)->kfile.file < 0 )
+ {
+ error = HA_ERR_NO_SUCH_TABLE;
+ }
+ }
+#endif
+ pthread_mutex_unlock(&share->intern_lock);
+ DBUG_RETURN(error);
+} /* maria_lock_database */
+
+
+/****************************************************************************
+ The following functions are called by thr_lock() in threaded applications
+****************************************************************************/
+
+/*
+ Create a copy of the current status for the table
+
+ SYNOPSIS
+ _ma_get_status()
+ param Pointer to Myisam handler
+ concurrent_insert Set to 1 if we are going to do concurrent inserts
+ (THR_WRITE_CONCURRENT_INSERT was used)
+*/
+
+void _ma_get_status(void* param, int concurrent_insert)
+{
+ MARIA_HA *info=(MARIA_HA*) param;
+ DBUG_ENTER("_ma_get_status");
+ DBUG_PRINT("info",("key_file: %ld data_file: %ld concurrent_insert: %d",
+ (long) info->s->state.state.key_file_length,
+ (long) info->s->state.state.data_file_length,
+ concurrent_insert));
+#ifndef DBUG_OFF
+ if (info->state->key_file_length > info->s->state.state.key_file_length ||
+ info->state->data_file_length > info->s->state.state.data_file_length)
+ DBUG_PRINT("warning",("old info: key_file: %ld data_file: %ld",
+ (long) info->state->key_file_length,
+ (long) info->state->data_file_length));
+#endif
+ info->save_state=info->s->state.state;
+ info->state= &info->save_state;
+ info->append_insert_at_end= concurrent_insert;
+ DBUG_VOID_RETURN;
+}
+
+
+void _ma_update_status(void* param)
+{
+ MARIA_HA *info=(MARIA_HA*) param;
+ MARIA_SHARE *share= info->s;
+ /*
+ Because someone may have closed the table we point at, we only
+ update the state if its our own state. This isn't a problem as
+ we are always pointing at our own lock or at a read lock.
+ (This is enforced by thr_multi_lock.c)
+ */
+ if (info->state == &info->save_state)
+ {
+#ifndef DBUG_OFF
+ DBUG_PRINT("info",("updating status: key_file: %ld data_file: %ld",
+ (long) info->state->key_file_length,
+ (long) info->state->data_file_length));
+ if (info->state->key_file_length < share->state.state.key_file_length ||
+ info->state->data_file_length < share->state.state.data_file_length)
+ DBUG_PRINT("warning",("old info: key_file: %ld data_file: %ld",
+ (long) share->state.state.key_file_length,
+ (long) share->state.state.data_file_length));
+#endif
+ /*
+ we are going to modify the state without lock's log, this would break
+ recovery if done with a transactional table.
+ */
+ DBUG_ASSERT(!info->s->base.born_transactional);
+ share->state.state= *info->state;
+ info->state= &share->state.state;
+ }
+ info->append_insert_at_end= 0;
+}
+
+
+void _ma_restore_status(void *param)
+{
+ MARIA_HA *info= (MARIA_HA*) param;
+ info->state= &info->s->state.state;
+ info->append_insert_at_end= 0;
+}
+
+
+void _ma_copy_status(void* to,void *from)
+{
+ ((MARIA_HA*) to)->state= &((MARIA_HA*) from)->save_state;
+}
+
+
+/*
+ Check if should allow concurrent inserts
+
+ IMPLEMENTATION
+ Allow concurrent inserts if we don't have a hole in the table or
+ if there is no active write lock and there is active read locks and
+ maria_concurrent_insert == 2. In this last case the new
+ row('s) are inserted at end of file instead of filling up the hole.
+
+ The last case is to allow one to inserts into a heavily read-used table
+ even if there is holes.
+
+ NOTES
+ If there is a an rtree indexes in the table, concurrent inserts are
+ disabled in maria_open()
+
+ RETURN
+ 0 ok to use concurrent inserts
+ 1 not ok
+*/
+
+my_bool _ma_check_status(void *param)
+{
+ MARIA_HA *info=(MARIA_HA*) param;
+ /*
+ The test for w_locks == 1 is here because this thread has already done an
+ external lock (in other words: w_locks == 1 means no other threads has
+ a write lock)
+ */
+ DBUG_PRINT("info",("dellink: %ld r_locks: %u w_locks: %u",
+ (long) info->s->state.dellink, (uint) info->s->r_locks,
+ (uint) info->s->w_locks));
+ return (my_bool) !(info->s->state.dellink == HA_OFFSET_ERROR ||
+ (maria_concurrent_insert == 2 && info->s->r_locks &&
+ info->s->w_locks == 1));
+}
+
+
+/****************************************************************************
+ ** functions to read / write the state
+****************************************************************************/
+
+int _ma_readinfo(register MARIA_HA *info __attribute__ ((unused)),
+ int lock_type __attribute__ ((unused)),
+ int check_keybuffer __attribute__ ((unused)))
+{
+#ifdef MARIA_EXTERNAL_LOCKING
+ DBUG_ENTER("_ma_readinfo");
+
+ if (info->lock_type == F_UNLCK)
+ {
+ MARIA_SHARE *share=info->s;
+ if (!share->tot_locks)
+ {
+ /* should not be done for transactional tables */
+ if (_ma_state_info_read_dsk(share->kfile.file, &share->state))
+ {
+ int error=my_errno ? my_errno : -1;
+ my_errno=error;
+ DBUG_RETURN(1);
+ }
+ }
+ if (check_keybuffer)
+ VOID(_ma_test_if_changed(info));
+ info->invalidator=info->s->invalidator;
+ }
+ else if (lock_type == F_WRLCK && info->lock_type == F_RDLCK)
+ {
+ my_errno=EACCES; /* Not allowed to change */
+ DBUG_RETURN(-1); /* when have read_lock() */
+ }
+ DBUG_RETURN(0);
+#else
+ return 0;
+#endif /* defined(MARIA_EXTERNAL_LOCKING) */
+} /* _ma_readinfo */
+
+
+/*
+ Every isam-function that uppdates the isam-database MUST end with this
+ request
+
+ NOTES
+ my_errno is not changed if this succeeds!
+*/
+
+int _ma_writeinfo(register MARIA_HA *info, uint operation)
+{
+ int error,olderror;
+ MARIA_SHARE *share= info->s;
+ DBUG_ENTER("_ma_writeinfo");
+ DBUG_PRINT("info",("operation: %u tot_locks: %u", operation,
+ share->tot_locks));
+
+ error=0;
+ if (share->tot_locks == 0 && !share->base.born_transactional)
+ {
+ /* transactional tables flush their state at Checkpoint */
+ if (operation)
+ { /* Two threads can't be here */
+ olderror= my_errno; /* Remember last error */
+ share->state.process= share->last_process= share->this_process;
+ share->state.unique= info->last_unique= info->this_unique;
+ share->state.update_count= info->last_loop= ++info->this_loop;
+ if ((error= _ma_state_info_write_sub(share->kfile.file,
+ &share->state, 1)))
+ olderror=my_errno;
+#ifdef __WIN__
+ if (maria_flush)
+ {
+ _commit(share->kfile.file);
+ _commit(info->dfile.file);
+ }
+#endif
+ my_errno=olderror;
+ }
+ }
+ else if (operation)
+ share->changed= 1; /* Mark keyfile changed */
+ DBUG_RETURN(error);
+} /* _ma_writeinfo */
+
+
+ /* Test if someone has changed the database */
+ /* (Should be called after readinfo) */
+
+int _ma_test_if_changed(register MARIA_HA *info)
+{
+ MARIA_SHARE *share=info->s;
+ if (share->state.process != share->last_process ||
+ share->state.unique != info->last_unique ||
+ share->state.update_count != info->last_loop)
+ { /* Keyfile has changed */
+ DBUG_PRINT("info",("index file changed"));
+ if (share->state.process != share->this_process)
+ VOID(flush_pagecache_blocks(share->pagecache, &share->kfile,
+ FLUSH_RELEASE));
+ share->last_process=share->state.process;
+ info->last_unique= share->state.unique;
+ info->last_loop= share->state.update_count;
+ info->update|= HA_STATE_WRITTEN; /* Must use file on next */
+ info->data_changed= 1; /* For maria_is_changed */
+ return 1;
+ }
+ return (!(info->update & HA_STATE_AKTIV) ||
+ (info->update & (HA_STATE_WRITTEN | HA_STATE_DELETED |
+ HA_STATE_KEY_CHANGED)));
+} /* _ma_test_if_changed */
+
+
+/*
+ Put a mark in the .MYI file that someone is updating the table
+
+
+ DOCUMENTATION
+
+ state.open_count in the .MYI file is used the following way:
+ - For the first change of the .MYI file in this process open_count is
+ incremented by _ma_mark_file_changed(). (We have a write lock on the file
+ when this happens)
+ - In maria_close() it's decremented by _ma_decrement_open_count() if it
+ was incremented in the same process.
+
+ This mean that if we are the only process using the file, the open_count
+ tells us if the MARIA file wasn't properly closed. (This is true if
+ my_disable_locking is set).
+
+ open_count is not maintained on disk for transactional or temporary tables.
+*/
+
+
+int _ma_mark_file_changed(MARIA_HA *info)
+{
+ char buff[3];
+ register MARIA_SHARE *share=info->s;
+ DBUG_ENTER("_ma_mark_file_changed");
+
+ if (!(share->state.changed & STATE_CHANGED) || ! share->global_changed)
+ {
+ share->state.changed|=(STATE_CHANGED | STATE_NOT_ANALYZED |
+ STATE_NOT_OPTIMIZED_KEYS);
+ if (!share->global_changed)
+ {
+ share->global_changed=1;
+ share->state.open_count++;
+ }
+ /*
+ temp tables don't need an open_count as they are removed on crash;
+ transactional tables are fixed by log-based recovery, so don't need an
+ open_count either (and we thus avoid the disk write below).
+ */
+ if (!(share->temporary | share->base.born_transactional))
+ {
+ mi_int2store(buff,share->state.open_count);
+ buff[2]=1; /* Mark that it's changed */
+ DBUG_RETURN(my_pwrite(share->kfile.file, buff, sizeof(buff),
+ sizeof(share->state.header),
+ MYF(MY_NABP)));
+ }
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ This is only called by close or by extra(HA_FLUSH) if the OS has the pwrite()
+ call. In these context the following code should be safe!
+ */
+
+int _ma_decrement_open_count(MARIA_HA *info)
+{
+ char buff[2];
+ register MARIA_SHARE *share=info->s;
+ int lock_error=0,write_error=0;
+ if (share->global_changed)
+ {
+ uint old_lock=info->lock_type;
+ share->global_changed=0;
+ lock_error=maria_lock_database(info,F_WRLCK);
+ /* Its not fatal even if we couldn't get the lock ! */
+ if (share->state.open_count > 0)
+ {
+ share->state.open_count--;
+ if (!(share->temporary | share->base.born_transactional))
+ {
+ mi_int2store(buff,share->state.open_count);
+ write_error= my_pwrite(share->kfile.file, buff, sizeof(buff),
+ sizeof(share->state.header),
+ MYF(MY_NABP));
+ }
+ }
+ if (!lock_error)
+ lock_error=maria_lock_database(info,old_lock);
+ }
+ return test(lock_error || write_error);
+}
diff --git a/storage/maria/ma_loghandler.c b/storage/maria/ma_loghandler.c
new file mode 100644
index 00000000000..f3c90ceb1f5
--- /dev/null
+++ b/storage/maria/ma_loghandler.c
@@ -0,0 +1,6778 @@
+/* Copyright (C) 2007 MySQL AB & Sanja Belkin
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+#include "ma_blockrec.h"
+#include "trnman.h"
+
+/**
+ @file
+ @brief Module which writes and reads to a transaction log
+
+ @todo LOG: in functions where the log's lock is required, a
+ translog_assert_owner() could be added.
+*/
+
+/* number of opened log files in the pagecache (should be at least 2) */
+#define OPENED_FILES_NUM 3
+
+/* records buffer size (should be LOG_PAGE_SIZE * n) */
+#define TRANSLOG_WRITE_BUFFER (1024*1024)
+/* min chunk length */
+#define TRANSLOG_MIN_CHUNK 3
+/*
+ Number of buffers used by loghandler
+
+ Should be at least 4, because one thread can block up to 2 buffers in
+ normal circumstances (less then half of one and full other, or just
+ switched one and other), But if we met end of the file in the middle and
+ have to switch buffer it will be 3. + 1 buffer for flushing/writing.
+ We have a bigger number here for higher concurrency.
+*/
+#define TRANSLOG_BUFFERS_NO 5
+/* number of bytes (+ header) which can be unused on first page in sequence */
+#define TRANSLOG_MINCHUNK_CONTENT 1
+/* version of log file */
+#define TRANSLOG_VERSION_ID 10000 /* 1.00.00 */
+
+#define TRANSLOG_PAGE_FLAGS 6 /* transaction log page flags offset */
+
+/* QQ: For temporary debugging */
+#define UNRECOVERABLE_ERROR(E) \
+ do { \
+ DBUG_PRINT("error", E); \
+ printf E; \
+ putchar('\n'); \
+ } while(0);
+
+/* Maximum length of compressed LSNs (the worst case of whole LSN storing) */
+#define COMPRESSED_LSN_MAX_STORE_SIZE (2 + LSN_STORE_SIZE)
+#define MAX_NUMBER_OF_LSNS_PER_RECORD 2
+
+/* log write buffer descriptor */
+struct st_translog_buffer
+{
+ LSN last_lsn;
+ /* This buffer offset in the file */
+ TRANSLOG_ADDRESS offset;
+ /*
+ Next buffer offset in the file (it is not always offset + size,
+ in case of flush by LSN it can be offset + size - TRANSLOG_PAGE_SIZE)
+ */
+ TRANSLOG_ADDRESS next_buffer_offset;
+ /*
+ How much written (or will be written when copy_to_buffer_in_progress
+ become 0) to this buffer
+ */
+ translog_size_t size;
+ /* File handler for this buffer */
+ File file;
+ /* Threads which are waiting for buffer filling/freeing */
+ WQUEUE waiting_filling_buffer;
+ /* Number of record which are in copy progress */
+ uint copy_to_buffer_in_progress;
+ /* list of waiting buffer ready threads */
+ struct st_my_thread_var *waiting_flush;
+ struct st_translog_buffer *overlay;
+#ifndef DBUG_OFF
+ uint buffer_no;
+#endif
+ /* lock for the buffer. Current buffer also lock the handler */
+ pthread_mutex_t mutex;
+ /* IO cache for current log */
+ uchar buffer[TRANSLOG_WRITE_BUFFER];
+};
+
+
+struct st_buffer_cursor
+{
+ /* pointer on the buffer */
+ uchar *ptr;
+ /* current buffer */
+ struct st_translog_buffer *buffer;
+ /* current page fill */
+ uint16 current_page_fill;
+ /* how many times we finish this page to write it */
+ uint16 write_counter;
+ /* previous write offset */
+ uint16 previous_offset;
+ /* Number of current buffer */
+ uint8 buffer_no;
+ my_bool chaser, protected;
+};
+
+
+struct st_translog_descriptor
+{
+ /* *** Parameters of the log handler *** */
+
+ /* Page cache for the log reads */
+ PAGECACHE *pagecache;
+ /* Flags */
+ uint flags;
+ /* max size of one log size (for new logs creation) */
+ uint32 log_file_max_size;
+ /* server version */
+ uint32 server_version;
+ /* server ID */
+ uint32 server_id;
+ /* Loghandler's buffer capacity in case of chunk 2 filling */
+ uint32 buffer_capacity_chunk_2;
+ /* Half of the buffer capacity in case of chunk 2 filling */
+ uint32 half_buffer_capacity_chunk_2;
+ /* Page overhead calculated by flags */
+ uint16 page_overhead;
+ /* Page capacity calculated by flags (TRANSLOG_PAGE_SIZE-page_overhead-1) */
+ uint16 page_capacity_chunk_2;
+ /* Directory to store files */
+ char directory[FN_REFLEN];
+
+ /* *** Current state of the log handler *** */
+ /* Current and (OPENED_FILES_NUM-1) last logs number in page cache */
+ File log_file_num[OPENED_FILES_NUM];
+ File directory_fd;
+ /* buffers for log writing */
+ struct st_translog_buffer buffers[TRANSLOG_BUFFERS_NO];
+ /*
+ horizon - visible end of the log (here is absolute end of the log:
+ position where next chunk can start
+ */
+ TRANSLOG_ADDRESS horizon;
+ /* horizon buffer cursor */
+ struct st_buffer_cursor bc;
+ /* maximum LSN of the current (not finished) file */
+ LSN max_lsn;
+
+ /* Last flushed LSN */
+ LSN flushed;
+ /* Last LSN sent to the disk (but maybe not written yet) */
+ LSN sent_to_file;
+ /* All what is after this address is not sent to disk yet */
+ TRANSLOG_ADDRESS in_buffers_only;
+ pthread_mutex_t sent_to_file_lock;
+ pthread_mutex_t log_flush_lock;
+
+ /* Protects changing of headers of finished files (max_lsn) */
+ pthread_mutex_t file_header_lock;
+
+ /*
+ Sorted array (with protection) of files where we started writing process
+ and so we can't give last LSN yet
+ */
+ pthread_mutex_t unfinished_files_lock;
+ DYNAMIC_ARRAY unfinished_files;
+
+ /* Purger data: minimum file in the log (or 0 if unknown) */
+ uint32 min_file_number;
+ /* Protect purger from many calls and it's data */
+ pthread_mutex_t purger_lock;
+ /* last low water mark checked */
+ LSN last_lsn_checked;
+};
+
+static struct st_translog_descriptor log_descriptor;
+
+/* Marker for end of log */
+static uchar end_of_log= 0;
+
+my_bool translog_inited= 0;
+
+/* chunk types */
+#define TRANSLOG_CHUNK_LSN 0x00 /* 0 chunk refer as LSN (head or tail */
+#define TRANSLOG_CHUNK_FIXED (1 << 6) /* 1 (pseudo)fixed record (also LSN) */
+#define TRANSLOG_CHUNK_NOHDR (2 << 6) /* 2 no head chunk (till page end) */
+#define TRANSLOG_CHUNK_LNGTH (3 << 6) /* 3 chunk with chunk length */
+#define TRANSLOG_CHUNK_TYPE (3 << 6) /* Mask to get chunk type */
+#define TRANSLOG_REC_TYPE 0x3F /* Mask to get record type */
+
+/* compressed (relative) LSN constants */
+#define TRANSLOG_CLSN_LEN_BITS 0xC0 /* Mask to get compressed LSN length */
+
+
+
+#include <my_atomic.h>
+/* an array that maps id of a MARIA_SHARE to this MARIA_SHARE */
+static MARIA_SHARE **id_to_share= NULL;
+/* lock for id_to_share */
+static my_atomic_rwlock_t LOCK_id_to_share;
+
+static my_bool write_hook_for_redo(enum translog_record_type type,
+ TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
+ struct st_translog_parts *parts);
+static my_bool write_hook_for_undo(enum translog_record_type type,
+ TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
+ struct st_translog_parts *parts);
+static my_bool write_hook_for_redo_delete_all(enum translog_record_type type,
+ TRN *trn, MARIA_HA *tbl_info,
+ LSN *lsn,
+ struct st_translog_parts *parts);
+static my_bool write_hook_for_undo_row_insert(enum translog_record_type type,
+ TRN *trn, MARIA_HA *tbl_info,
+ LSN *lsn,
+ struct st_translog_parts *parts);
+static my_bool write_hook_for_undo_row_delete(enum translog_record_type type,
+ TRN *trn, MARIA_HA *tbl_info,
+ LSN *lsn,
+ struct st_translog_parts *parts);
+static my_bool write_hook_for_clr_end(enum translog_record_type type,
+ TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
+ struct st_translog_parts *parts);
+static my_bool write_hook_for_file_id(enum translog_record_type type,
+ TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
+ struct st_translog_parts *parts);
+
+static my_bool translog_page_validator(uchar *page_addr, uchar* data_ptr);
+
+/*
+ Initialize log_record_type_descriptors
+
+ NOTE that after first public Maria release, these can NOT be changed
+*/
+
+LOG_DESC log_record_type_descriptor[LOGREC_NUMBER_OF_TYPES];
+
+
+#ifndef DBUG_OFF
+/**
+ @brief check the description table validity
+
+ @param num how many records should be filled
+*/
+
+static void check_translog_description_table(int num)
+{
+ int i;
+ DBUG_ENTER("check_translog_description_table");
+ DBUG_PRINT("enter", ("last record: %d", num));
+ DBUG_ASSERT(num > 0);
+ /* last is reserved for extending the table */
+ DBUG_ASSERT(num < LOGREC_NUMBER_OF_TYPES - 1);
+ DBUG_PRINT("info", ("records number: OK"));
+ DBUG_PRINT("info",
+ ("record type: %d class: %d fixed: %u header: %u LSNs: %u "
+ "name: %s",
+ 0,
+ log_record_type_descriptor[0].class,
+ (uint)log_record_type_descriptor[0].fixed_length,
+ (uint)log_record_type_descriptor[0].read_header_len,
+ (uint)log_record_type_descriptor[0].compressed_LSN,
+ log_record_type_descriptor[0].name));
+ DBUG_ASSERT(log_record_type_descriptor[0].class == LOGRECTYPE_NOT_ALLOWED);
+ DBUG_PRINT("info", ("record type 0: OK"));
+ for (i= 1; i <= num; i++)
+ {
+ DBUG_PRINT("info",
+ ("record type: %d class: %d fixed: %u header: %u LSNs: %u "
+ "name: %s",
+ i, log_record_type_descriptor[i].class,
+ (uint)log_record_type_descriptor[i].fixed_length,
+ (uint)log_record_type_descriptor[i].read_header_len,
+ (uint)log_record_type_descriptor[i].compressed_LSN,
+ log_record_type_descriptor[i].name));
+ switch (log_record_type_descriptor[i].class) {
+ case LOGRECTYPE_NOT_ALLOWED:
+ DBUG_ASSERT(0);
+ break;
+ case LOGRECTYPE_VARIABLE_LENGTH:
+ DBUG_ASSERT(log_record_type_descriptor[i].fixed_length == 0);
+ DBUG_ASSERT((log_record_type_descriptor[i].compressed_LSN == 0) ||
+ ((log_record_type_descriptor[i].compressed_LSN == 1) &&
+ (log_record_type_descriptor[i].read_header_len >=
+ LSN_STORE_SIZE)) ||
+ ((log_record_type_descriptor[i].compressed_LSN == 2) &&
+ (log_record_type_descriptor[i].read_header_len >=
+ LSN_STORE_SIZE * 2)));
+ break;
+ case LOGRECTYPE_PSEUDOFIXEDLENGTH:
+ DBUG_ASSERT(log_record_type_descriptor[i].fixed_length ==
+ log_record_type_descriptor[i].read_header_len);
+ DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN > 0);
+ DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN <= 2);
+ break;
+ case LOGRECTYPE_FIXEDLENGTH:
+ DBUG_ASSERT(log_record_type_descriptor[i].fixed_length ==
+ log_record_type_descriptor[i].read_header_len);
+ DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN == 0);
+ break;
+ default:
+ DBUG_ASSERT(0);
+ }
+ DBUG_PRINT("info", ("record type %d: OK", i));
+ }
+ DBUG_PRINT("info", ("All filled records are OK"));
+ for (i= num + 1; i < LOGREC_NUMBER_OF_TYPES; i++)
+ {
+ DBUG_ASSERT(log_record_type_descriptor[i].class == LOGRECTYPE_NOT_ALLOWED);
+ DBUG_PRINT("info", ("record type %d: OK", i));
+ }
+ DBUG_VOID_RETURN;
+}
+#endif
+
+static LOG_DESC INIT_LOGREC_FIXED_RECORD_0LSN_EXAMPLE=
+{LOGRECTYPE_FIXEDLENGTH, 6, 6, NULL, NULL, NULL, 0,
+ "fixed0example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, NULL, NULL, 0,
+"variable0example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_FIXED_RECORD_1LSN_EXAMPLE=
+{LOGRECTYPE_PSEUDOFIXEDLENGTH, 7, 7, NULL, NULL, NULL, 1,
+"fixed1example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 12, NULL, NULL, NULL, 1,
+"variable1example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_FIXED_RECORD_2LSN_EXAMPLE=
+{LOGRECTYPE_PSEUDOFIXEDLENGTH, 23, 23, NULL, NULL, NULL, 2,
+"fixed2example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 19, NULL, NULL, NULL, 2,
+"variable2example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+
+void example_loghandler_init()
+{
+ int i;
+ log_record_type_descriptor[LOGREC_FIXED_RECORD_0LSN_EXAMPLE]=
+ INIT_LOGREC_FIXED_RECORD_0LSN_EXAMPLE;
+ log_record_type_descriptor[LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE]=
+ INIT_LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE;
+ log_record_type_descriptor[LOGREC_FIXED_RECORD_1LSN_EXAMPLE]=
+ INIT_LOGREC_FIXED_RECORD_1LSN_EXAMPLE;
+ log_record_type_descriptor[LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE]=
+ INIT_LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE;
+ log_record_type_descriptor[LOGREC_FIXED_RECORD_2LSN_EXAMPLE]=
+ INIT_LOGREC_FIXED_RECORD_2LSN_EXAMPLE;
+ log_record_type_descriptor[LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE]=
+ INIT_LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE;
+ for (i= LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE + 1;
+ i < LOGREC_NUMBER_OF_TYPES;
+ i++)
+ log_record_type_descriptor[i].class= LOGRECTYPE_NOT_ALLOWED;
+ DBUG_EXECUTE("info",
+ check_translog_description_table(LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE););
+}
+
+
+static LOG_DESC INIT_LOGREC_RESERVED_FOR_CHUNKS23=
+{LOGRECTYPE_NOT_ALLOWED, 0, 0, NULL, NULL, NULL, 0,
+ "reserved", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL };
+
+static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_HEAD=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL,
+ write_hook_for_redo, NULL, 0,
+ "redo_insert_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_TAIL=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL,
+ write_hook_for_redo, NULL, 0,
+ "redo_insert_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+/** @todo RECOVERY BUG unused, remove? */
+static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_BLOB=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 8, NULL, write_hook_for_redo, NULL, 0,
+ "redo_insert_row_blob", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+/** @todo RECOVERY BUG handle it in recovery */
+/*QQQ:TODO:header???*/
+static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_BLOBS=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, FILEID_STORE_SIZE, NULL,
+ write_hook_for_redo, NULL, 0,
+ "redo_insert_row_blobs", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_PURGE_ROW_HEAD=
+{LOGRECTYPE_FIXEDLENGTH,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ NULL, write_hook_for_redo, NULL, 0,
+ "redo_purge_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_PURGE_ROW_TAIL=
+{LOGRECTYPE_FIXEDLENGTH,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ NULL, write_hook_for_redo, NULL, 0,
+ "redo_purge_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_PURGE_BLOCKS=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
+ NULL, write_hook_for_redo, NULL, 0,
+ "redo_purge_blocks", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+/* not yet used; for when we have versioning */
+static LOG_DESC INIT_LOGREC_REDO_DELETE_ROW=
+{LOGRECTYPE_FIXEDLENGTH, 16, 16, NULL, write_hook_for_redo, NULL, 0,
+ "redo_delete_row", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+/** @todo RECOVERY BUG unused, remove? */
+static LOG_DESC INIT_LOGREC_REDO_UPDATE_ROW_HEAD=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, write_hook_for_redo, NULL, 0,
+ "redo_update_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_INDEX=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, write_hook_for_redo, NULL, 0,
+ "redo_index", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_UNDELETE_ROW=
+{LOGRECTYPE_FIXEDLENGTH, 16, 16, NULL, write_hook_for_redo, NULL, 0,
+ "redo_undelete_row", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_CLR_END=
+{LOGRECTYPE_PSEUDOFIXEDLENGTH, LSN_STORE_SIZE + FILEID_STORE_SIZE + 1,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + 1, NULL, write_hook_for_clr_end, NULL, 1,
+ "clr_end", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_PURGE_END=
+{LOGRECTYPE_PSEUDOFIXEDLENGTH, 5, 5, NULL, NULL, NULL, 1,
+ "purge_end", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_ROW_INSERT=
+{LOGRECTYPE_PSEUDOFIXEDLENGTH,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ NULL, write_hook_for_undo_row_insert, NULL, 1,
+ "undo_row_insert", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_ROW_DELETE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ NULL, write_hook_for_undo_row_delete, NULL, 1,
+ "undo_row_delete", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_ROW_UPDATE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ NULL, write_hook_for_undo, NULL, 1,
+ "undo_row_update", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_KEY_INSERT=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 10, NULL, write_hook_for_undo, NULL, 1,
+ "undo_key_insert", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_KEY_DELETE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 15, NULL, write_hook_for_undo, NULL, 1,
+ "undo_key_delete", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_PREPARE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "prepare", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_PREPARE_WITH_UNDO_PURGE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, LSN_STORE_SIZE, NULL, NULL, NULL, 1,
+ "prepare_with_undo_purge", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_COMMIT=
+{LOGRECTYPE_FIXEDLENGTH, 0, 0, NULL,
+ NULL, NULL, 0, "commit", LOGREC_IS_GROUP_ITSELF, NULL,
+ NULL};
+
+static LOG_DESC INIT_LOGREC_COMMIT_WITH_UNDO_PURGE=
+{LOGRECTYPE_PSEUDOFIXEDLENGTH, 5, 5, NULL, NULL, NULL, 1,
+ "commit_with_undo_purge", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_CHECKPOINT=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "checkpoint", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_CREATE_TABLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 1 + 2, NULL, NULL, NULL, 0,
+"redo_create_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_RENAME_TABLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "redo_rename_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_DROP_TABLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "redo_drop_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_DELETE_ALL=
+{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE, FILEID_STORE_SIZE,
+ NULL, write_hook_for_redo_delete_all, NULL, 0,
+ "redo_delete_all", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_REPAIR_TABLE=
+{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + 4, FILEID_STORE_SIZE + 4,
+ NULL, NULL, NULL, 0,
+ "redo_repair_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_FILE_ID=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 2, NULL, write_hook_for_file_id, NULL, 0,
+ "file_id", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_LONG_TRANSACTION_ID=
+{LOGRECTYPE_FIXEDLENGTH, 6, 6, NULL, NULL, NULL, 0,
+ "long_transaction_id", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+const myf log_write_flags= MY_WME | MY_NABP | MY_WAIT_IF_FULL;
+
+static void loghandler_init()
+{
+ int i;
+ log_record_type_descriptor[LOGREC_RESERVED_FOR_CHUNKS23]=
+ INIT_LOGREC_RESERVED_FOR_CHUNKS23;
+ log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_HEAD]=
+ INIT_LOGREC_REDO_INSERT_ROW_HEAD;
+ log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_TAIL]=
+ INIT_LOGREC_REDO_INSERT_ROW_TAIL;
+ log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_BLOB]=
+ INIT_LOGREC_REDO_INSERT_ROW_BLOB;
+ log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_BLOBS]=
+ INIT_LOGREC_REDO_INSERT_ROW_BLOBS;
+ log_record_type_descriptor[LOGREC_REDO_PURGE_ROW_HEAD]=
+ INIT_LOGREC_REDO_PURGE_ROW_HEAD;
+ log_record_type_descriptor[LOGREC_REDO_PURGE_ROW_TAIL]=
+ INIT_LOGREC_REDO_PURGE_ROW_TAIL;
+ log_record_type_descriptor[LOGREC_REDO_PURGE_BLOCKS]=
+ INIT_LOGREC_REDO_PURGE_BLOCKS;
+ log_record_type_descriptor[LOGREC_REDO_DELETE_ROW]=
+ INIT_LOGREC_REDO_DELETE_ROW;
+ log_record_type_descriptor[LOGREC_REDO_UPDATE_ROW_HEAD]=
+ INIT_LOGREC_REDO_UPDATE_ROW_HEAD;
+ log_record_type_descriptor[LOGREC_REDO_INDEX]=
+ INIT_LOGREC_REDO_INDEX;
+ log_record_type_descriptor[LOGREC_REDO_UNDELETE_ROW]=
+ INIT_LOGREC_REDO_UNDELETE_ROW;
+ log_record_type_descriptor[LOGREC_CLR_END]=
+ INIT_LOGREC_CLR_END;
+ log_record_type_descriptor[LOGREC_PURGE_END]=
+ INIT_LOGREC_PURGE_END;
+ log_record_type_descriptor[LOGREC_UNDO_ROW_INSERT]=
+ INIT_LOGREC_UNDO_ROW_INSERT;
+ log_record_type_descriptor[LOGREC_UNDO_ROW_DELETE]=
+ INIT_LOGREC_UNDO_ROW_DELETE;
+ log_record_type_descriptor[LOGREC_UNDO_ROW_UPDATE]=
+ INIT_LOGREC_UNDO_ROW_UPDATE;
+ log_record_type_descriptor[LOGREC_UNDO_KEY_INSERT]=
+ INIT_LOGREC_UNDO_KEY_INSERT;
+ log_record_type_descriptor[LOGREC_UNDO_KEY_DELETE]=
+ INIT_LOGREC_UNDO_KEY_DELETE;
+ log_record_type_descriptor[LOGREC_PREPARE]=
+ INIT_LOGREC_PREPARE;
+ log_record_type_descriptor[LOGREC_PREPARE_WITH_UNDO_PURGE]=
+ INIT_LOGREC_PREPARE_WITH_UNDO_PURGE;
+ log_record_type_descriptor[LOGREC_COMMIT]=
+ INIT_LOGREC_COMMIT;
+ log_record_type_descriptor[LOGREC_COMMIT_WITH_UNDO_PURGE]=
+ INIT_LOGREC_COMMIT_WITH_UNDO_PURGE;
+ log_record_type_descriptor[LOGREC_CHECKPOINT]=
+ INIT_LOGREC_CHECKPOINT;
+ log_record_type_descriptor[LOGREC_REDO_CREATE_TABLE]=
+ INIT_LOGREC_REDO_CREATE_TABLE;
+ log_record_type_descriptor[LOGREC_REDO_RENAME_TABLE]=
+ INIT_LOGREC_REDO_RENAME_TABLE;
+ log_record_type_descriptor[LOGREC_REDO_DROP_TABLE]=
+ INIT_LOGREC_REDO_DROP_TABLE;
+ log_record_type_descriptor[LOGREC_REDO_DELETE_ALL]=
+ INIT_LOGREC_REDO_DELETE_ALL;
+ log_record_type_descriptor[LOGREC_REDO_REPAIR_TABLE]=
+ INIT_LOGREC_REDO_REPAIR_TABLE;
+ log_record_type_descriptor[LOGREC_FILE_ID]=
+ INIT_LOGREC_FILE_ID;
+ log_record_type_descriptor[LOGREC_LONG_TRANSACTION_ID]=
+ INIT_LOGREC_LONG_TRANSACTION_ID;
+ for (i= LOGREC_LONG_TRANSACTION_ID + 1;
+ i < LOGREC_NUMBER_OF_TYPES;
+ i++)
+ log_record_type_descriptor[i].class= LOGRECTYPE_NOT_ALLOWED;
+ DBUG_EXECUTE("info",
+ check_translog_description_table(LOGREC_LONG_TRANSACTION_ID););
+};
+
+
+/* all possible flags page overheads */
+static uint page_overhead[TRANSLOG_FLAGS_NUM];
+
+typedef struct st_translog_validator_data
+{
+ TRANSLOG_ADDRESS *addr;
+ my_bool was_recovered;
+} TRANSLOG_VALIDATOR_DATA;
+
+
+const char *maria_data_root;
+
+
+/*
+ Check cursor/buffer consistence
+
+ SYNOPSIS
+ translog_check_cursor
+ cursor cursor which will be checked
+*/
+
+#ifndef DBUG_OFF
+static void translog_check_cursor(struct st_buffer_cursor *cursor)
+{
+ DBUG_ASSERT(cursor->chaser ||
+ ((ulong) (cursor->ptr - cursor->buffer->buffer) ==
+ cursor->buffer->size));
+ DBUG_ASSERT(cursor->buffer->buffer_no == cursor->buffer_no);
+ DBUG_ASSERT((cursor->ptr -cursor->buffer->buffer) %TRANSLOG_PAGE_SIZE ==
+ cursor->current_page_fill % TRANSLOG_PAGE_SIZE);
+ DBUG_ASSERT(cursor->current_page_fill <= TRANSLOG_PAGE_SIZE);
+}
+#endif
+
+/*
+ Get file name of the log by log number
+
+ SYNOPSIS
+ translog_filename_by_fileno()
+ file_no Number of the log we want to open
+ path Pointer to buffer where file name will be
+ stored (must be FN_REFLEN bytes at least
+ RETURN
+ pointer to path
+*/
+
+static char *translog_filename_by_fileno(uint32 file_no, char *path)
+{
+ char file_name[10 + 8 + 1]; /* See fallowing my_sprintf() call */
+ char *res;
+ DBUG_ENTER("translog_filename_by_fileno");
+ DBUG_ASSERT(file_no <= 0xfffffff);
+ my_sprintf(file_name, (file_name, "maria_log.%08u", file_no));
+ res= fn_format(path, file_name, log_descriptor.directory, "", MYF(MY_WME));
+ DBUG_PRINT("info", ("Path: '%s' path: 0x%lx res: 0x%lx",
+ res, (ulong) path, (ulong) res));
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Open log file with given number without cache
+
+ SYNOPSIS
+ open_logfile_by_number_no_cache()
+ file_no Number of the log we want to open
+
+ RETURN
+ -1 error
+ # file descriptor number
+*/
+
+static File open_logfile_by_number_no_cache(uint32 file_no)
+{
+ File file;
+ char path[FN_REFLEN];
+ DBUG_ENTER("open_logfile_by_number_no_cache");
+
+ /* TODO: add O_DIRECT to open flags (when buffer is aligned) */
+ /* TODO: use my_create() */
+ if ((file= my_open(translog_filename_by_fileno(file_no, path),
+ O_CREAT | O_BINARY | O_RDWR,
+ MYF(MY_WME))) < 0)
+ {
+ UNRECOVERABLE_ERROR(("Error %d during opening file '%s'", errno, path));
+ DBUG_RETURN(-1);
+ }
+ DBUG_PRINT("info", ("File: '%s' handler: %d", path, file));
+ DBUG_RETURN(file);
+}
+
+
+/*
+ Write log file page header in the just opened new log file
+
+ SYNOPSIS
+ translog_write_file_header();
+
+ NOTES
+ First page is just a marker page; We don't store any real log data in it.
+
+ RETURN
+ 0 OK
+ 1 ERROR
+*/
+
+uchar NEAR maria_trans_file_magic[]=
+{ (uchar) 254, (uchar) 254, (uchar) 11, '\001', 'M', 'A', 'R', 'I', 'A',
+ 'L', 'O', 'G' };
+
+static my_bool translog_write_file_header()
+{
+ ulonglong timestamp;
+ uchar page_buff[TRANSLOG_PAGE_SIZE], *page= page_buff;
+ DBUG_ENTER("translog_write_file_header");
+
+ /* file tag */
+ memcpy(page, maria_trans_file_magic, sizeof(maria_trans_file_magic));
+ page+= sizeof(maria_trans_file_magic);
+ /* timestamp */
+ timestamp= my_getsystime();
+ int8store(page, timestamp);
+ page+= 8;
+ /* maria version */
+ int4store(page, TRANSLOG_VERSION_ID);
+ page+= 4;
+ /* mysql version (MYSQL_VERSION_ID) */
+ int4store(page, log_descriptor.server_version);
+ page+= 4;
+ /* server ID */
+ int4store(page, log_descriptor.server_id);
+ page+= 4;
+ /* loghandler page_size/DISK_DRIVE_SECTOR_SIZE */
+ int2store(page, TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE);
+ page+= 2;
+ /* file number */
+ int3store(page, LSN_FILE_NO(log_descriptor.horizon));
+ page+= 3;
+ /*
+ Here should be max lsn storing for current file (which is LSN_IPOSSIBLE):
+ lsn_store(page, LSN_IPOSSIBLE);
+ page+= LSN_STORE_SIZE;
+ But it is zeros so we can rely on bzero() in this case
+ */
+ bzero(page, sizeof(page_buff) - (page- page_buff));
+
+ DBUG_RETURN(my_pwrite(log_descriptor.log_file_num[0], page_buff,
+ sizeof(page_buff), 0, log_write_flags) != 0);
+}
+
+/*
+ @brief write the new LSN on the given file header
+
+ @param file The file descriptor
+ @param lsn That LSN which should be written
+
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+static my_bool translog_max_lsn_to_header(File file, LSN lsn)
+{
+ uchar lsn_buff[LSN_STORE_SIZE];
+ DBUG_ENTER("translog_max_lsn_to_header");
+ DBUG_PRINT("enter", ("File descriptor: %ld "
+ "lsn: (%lu,0x%lx)",
+ (long) file,
+ LSN_IN_PARTS(lsn)));
+
+ lsn_store(lsn_buff, lsn);
+
+ DBUG_RETURN(my_pwrite(file, lsn_buff,
+ LSN_STORE_SIZE,
+ (sizeof(maria_trans_file_magic) +
+ 8 + 4 + 4 + 4 + 2 + 3),
+ log_write_flags) != 0 ||
+ my_sync(file, MYF(MY_WME)) != 0);
+}
+
+
+/*
+ Information from transaction log file header
+*/
+
+typedef struct st_loghandler_file_info
+{
+ /*
+ LSN_IMPOSSIBLE for current file and max LSN which parts stored in the
+ file for all other (finished) files.
+ */
+ LSN max_lsn;
+ ulonglong timestamp; /* Time stamp */
+ ulong maria_version; /* Version of maria loghandler */
+ ulong mysql_versiob; /* Version of mysql server */
+ ulong server_id; /* Server ID */
+ uint page_size; /* Loghandler page size */
+ uint file_number; /* Number of the file (from the file header) */
+} LOGHANDLER_FILE_INFO;
+
+/*
+ @brief Read hander file information from loghandler file
+
+ @param desc header information descriptor to be filled with information
+ @param file file descriptor to read
+
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+#define LOG_HEADER_DATA_SIZE (sizeof(maria_trans_file_magic) + \
+ 8 + 4 + 4 + 4 + 2 + 3 + \
+ LSN_STORE_SIZE)
+
+my_bool translog_read_file_header(LOGHANDLER_FILE_INFO *desc, File file)
+{
+ uchar page_buff[LOG_HEADER_DATA_SIZE], *ptr;
+ DBUG_ENTER("translog_read_file_header");
+
+ if (my_pread(file, page_buff,
+ sizeof(page_buff), 0, MYF(MY_FNABP | MY_WME)))
+ {
+ DBUG_PRINT("info", ("log read fail error: %d", my_errno));
+ DBUG_RETURN(1);
+ }
+ ptr= page_buff + sizeof(maria_trans_file_magic);
+ desc->timestamp= uint8korr(ptr);
+ ptr+= 8;
+ desc->maria_version= uint4korr(ptr);
+ ptr+= 4;
+ desc->mysql_versiob= uint4korr(ptr);
+ ptr+= 4;
+ desc->server_id= uint4korr(ptr);
+ ptr+= 4;
+ desc->page_size= uint2korr(ptr);
+ ptr+= 2;
+ desc->file_number= uint3korr(ptr);
+ ptr+=3;
+ desc->max_lsn= lsn_korr(ptr);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ @brief set the lsn to the files from_file - to_file if it is greater
+ then written in the file
+
+ @param from_file first file number (min)
+ @param to_file last file number (max)
+ @param lsn the lsn for writing
+ @param is_locked true if current thread locked the log handler
+
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+static my_bool translog_set_lsn_for_files(uint32 from_file, uint32 to_file,
+ LSN lsn, my_bool is_locked)
+{
+ uint32 file;
+ DBUG_ENTER("translog_set_lsn_for_files");
+ DBUG_PRINT("enter", ("From: %lu to: %lu lsn: (%lu,0x%lx) locked: %d",
+ (ulong) from_file, (ulong) to_file,
+ LSN_IN_PARTS(lsn),
+ is_locked));
+ DBUG_ASSERT(from_file <= to_file);
+ DBUG_ASSERT(from_file > 0); /* we have not file 0 */
+
+ /* Checks the current file (not finished yet file) */
+ if (!is_locked)
+ translog_lock();
+ if (to_file == (uint32) LSN_FILE_NO(log_descriptor.horizon))
+ {
+ if (likely(cmp_translog_addr(lsn, log_descriptor.max_lsn) > 0))
+ log_descriptor.max_lsn= lsn;
+ to_file--;
+ }
+ if (!is_locked)
+ translog_unlock();
+
+ /* Checks finished files if they are */
+ pthread_mutex_lock(&log_descriptor.file_header_lock);
+ for (file= from_file; file <= to_file; file++)
+ {
+ LOGHANDLER_FILE_INFO info;
+ File fd= open_logfile_by_number_no_cache(file);
+ if (fd < 0 ||
+ translog_read_file_header(&info, fd) ||
+ (cmp_translog_addr(lsn, info.max_lsn) > 0 &&
+ translog_max_lsn_to_header(fd, lsn)))
+ DBUG_RETURN(1);
+ }
+ pthread_mutex_unlock(&log_descriptor.file_header_lock);
+
+ DBUG_RETURN(0);
+}
+
+
+/* descriptor of file in unfinished_files */
+struct st_file_counter
+{
+ uint32 file; /* file number */
+ uint32 counter; /* counter for started writes */
+};
+
+
+/*
+ @brief mark file "in progress" (for multi-group records)
+
+ @param file log file number
+*/
+
+static void translog_mark_file_unfinished(uint32 file)
+{
+ int place, i;
+ struct st_file_counter fc, *fc_ptr;
+ fc.file= file; fc.counter= 1;
+
+ DBUG_ENTER("translog_mark_file_unfinished");
+ DBUG_PRINT("enter", ("file: %lu", (ulong) file));
+
+ pthread_mutex_lock(&log_descriptor.unfinished_files_lock);
+
+ if (log_descriptor.unfinished_files.elements == 0)
+ {
+ insert_dynamic(&log_descriptor.unfinished_files, (uchar*) &fc);
+ DBUG_PRINT("info", ("The first element inserted"));
+ goto end;
+ }
+
+ for (place= log_descriptor.unfinished_files.elements - 1;
+ place >= 0;
+ place--)
+ {
+ fc_ptr= dynamic_element(&log_descriptor.unfinished_files,
+ place, struct st_file_counter *);
+ if (fc_ptr->file <= file)
+ break;
+ }
+
+ if (place >= 0 && fc_ptr->file == file)
+ {
+ fc_ptr->counter++;
+ DBUG_PRINT("info", ("counter increased"));
+ goto end;
+ }
+
+ if (place == (int)log_descriptor.unfinished_files.elements)
+ {
+ insert_dynamic(&log_descriptor.unfinished_files, (uchar*) &fc);
+ DBUG_PRINT("info", ("The last element inserted"));
+ goto end;
+ }
+ /* shift and assign new element */
+ insert_dynamic(&log_descriptor.unfinished_files,
+ (uchar*)
+ dynamic_element(&log_descriptor.unfinished_files,
+ log_descriptor.unfinished_files.elements- 1,
+ struct st_file_counter *));
+ for(i= log_descriptor.unfinished_files.elements - 1; i > place; i--)
+ {
+ /* we do not use set_dynamic() to avoid unneeded checks */
+ memcpy(dynamic_element(&log_descriptor.unfinished_files,
+ i, struct st_file_counter *),
+ dynamic_element(&log_descriptor.unfinished_files,
+ i + 1, struct st_file_counter *),
+ sizeof(struct st_file_counter));
+ }
+ memcpy(dynamic_element(&log_descriptor.unfinished_files,
+ place + 1, struct st_file_counter *),
+ &fc, sizeof(struct st_file_counter));
+end:
+ pthread_mutex_unlock(&log_descriptor.unfinished_files_lock);
+ DBUG_VOID_RETURN;
+}
+
+
+
+/*
+ @brief remove file mark "in progress" (for multi-group records)
+
+ @param file log file number
+*/
+
+static void translog_mark_file_finished(uint32 file)
+{
+ int i;
+ struct st_file_counter *fc_ptr;
+
+ DBUG_ENTER("translog_mark_file_finished");
+ DBUG_PRINT("enter", ("file: %lu", (ulong) file));
+
+ pthread_mutex_lock(&log_descriptor.unfinished_files_lock);
+
+ DBUG_ASSERT(log_descriptor.unfinished_files.elements > 0);
+ for (i= 0;
+ i < (int) log_descriptor.unfinished_files.elements;
+ i++)
+ {
+ fc_ptr= dynamic_element(&log_descriptor.unfinished_files,
+ i, struct st_file_counter *);
+ if (fc_ptr->file == file)
+ {
+ break;
+ }
+ }
+ DBUG_ASSERT(i < (int) log_descriptor.unfinished_files.elements);
+
+ if (! --fc_ptr->counter)
+ delete_dynamic_element(&log_descriptor.unfinished_files, i);
+ pthread_mutex_unlock(&log_descriptor.unfinished_files_lock);
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ @brief get max LSN of the record which parts stored in this file
+
+ @param file file number
+
+ @return requested LSN or LSN_IMPOSSIBLE/LSN_ERROR
+ @retval LSN_IMPOSSIBLE File is still not finished
+ @retval LSN_ERROR Error opening file
+ @retval # LSN of the record which parts stored in this file
+*/
+
+LSN translog_get_file_max_lsn_stored(uint32 file)
+{
+ uint32 limit= FILENO_IMPOSSIBLE;
+ DBUG_ENTER("translog_get_file_max_lsn_stored");
+ DBUG_PRINT("enter", ("file: %lu", (ulong)file));
+ DBUG_ASSERT(translog_inited == 1);
+
+ pthread_mutex_lock(&log_descriptor.unfinished_files_lock);
+
+ /* find file with minimum file number "in progress" */
+ if (log_descriptor.unfinished_files.elements > 0)
+ {
+ struct st_file_counter *fc_ptr;
+ fc_ptr= dynamic_element(&log_descriptor.unfinished_files,
+ 0, struct st_file_counter *);
+ limit= fc_ptr->file; /* minimal file number "in progress" */
+ }
+ pthread_mutex_unlock(&log_descriptor.unfinished_files_lock);
+
+ /*
+ if there is no "in progress file" then unfinished file is in progress
+ for sure
+ */
+ if (limit == FILENO_IMPOSSIBLE)
+ {
+ TRANSLOG_ADDRESS horizon= translog_get_horizon();
+ limit= LSN_FILE_NO(horizon);
+ }
+
+ if (file >= limit)
+ {
+ DBUG_PRINT("info", ("The file in in progress"));
+ DBUG_RETURN(LSN_IMPOSSIBLE);
+ }
+
+ {
+ LOGHANDLER_FILE_INFO info;
+ File fd= open_logfile_by_number_no_cache(file);
+ if (fd < 0 ||
+ translog_read_file_header(&info, fd))
+ {
+ DBUG_PRINT("error", ("Can't read file header"));
+ DBUG_RETURN(LSN_ERROR);
+ }
+ DBUG_PRINT("error", ("Max lsn: (%lu,0x%lx)",
+ LSN_IN_PARTS(info.max_lsn)));
+ DBUG_RETURN(info.max_lsn);
+ }
+}
+
+/*
+ Initialize transaction log file buffer
+
+ SYNOPSIS
+ translog_buffer_init()
+ buffer The buffer to initialize
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_buffer_init(struct st_translog_buffer *buffer)
+{
+ DBUG_ENTER("translog_buffer_init");
+ buffer->last_lsn= LSN_IMPOSSIBLE;
+ /* This Buffer File */
+ buffer->file= -1;
+ buffer->overlay= 0;
+ /* IO cache for current log */
+ bzero(buffer->buffer, TRANSLOG_WRITE_BUFFER);
+ /* Buffer size */
+ buffer->size= 0;
+ /* cond of thread which is waiting for buffer filling */
+ buffer->waiting_filling_buffer.last_thread= 0;
+ /* Number of record which are in copy progress */
+ buffer->copy_to_buffer_in_progress= 0;
+ /* list of waiting buffer ready threads */
+ buffer->waiting_flush= 0;
+ /* lock for the buffer. Current buffer also lock the handler */
+ if (pthread_mutex_init(&buffer->mutex, MY_MUTEX_INIT_FAST))
+ DBUG_RETURN(1);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Close transaction log file by descriptor
+
+ SYNOPSIS
+ translog_close_log_file()
+ file file descriptor
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_close_log_file(File file)
+{
+ int rc;
+ PAGECACHE_FILE fl;
+ fl.file= file;
+ flush_pagecache_blocks(log_descriptor.pagecache, &fl, FLUSH_RELEASE);
+ /*
+ Sync file when we close it
+ TODO: sync only we have changed the log
+ */
+ rc= my_sync(file, MYF(MY_WME));
+ rc|= my_close(file, MYF(MY_WME));
+ return test(rc);
+}
+
+
+/*
+ Create and fill header of new file
+
+ SYNOPSIS
+ translog_create_new_file()
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_create_new_file()
+{
+ int i;
+ uint32 file_no= LSN_FILE_NO(log_descriptor.horizon);
+ DBUG_ENTER("translog_create_new_file");
+
+ /*
+ Writes max_lsn to the file header before finishing it (it is no need to
+ lock file header buffer because it is still unfinished file)
+ */
+ translog_max_lsn_to_header(log_descriptor.log_file_num[0],
+ log_descriptor.max_lsn);
+ log_descriptor.max_lsn= LSN_IMPOSSIBLE;
+
+ if (log_descriptor.log_file_num[OPENED_FILES_NUM - 1] != -1 &&
+ translog_close_log_file(log_descriptor.log_file_num[OPENED_FILES_NUM -
+ 1]))
+ DBUG_RETURN(1);
+ for (i= OPENED_FILES_NUM - 1; i > 0; i--)
+ log_descriptor.log_file_num[i]= log_descriptor.log_file_num[i - 1];
+
+ if ((log_descriptor.log_file_num[0]=
+ open_logfile_by_number_no_cache(file_no)) == -1 ||
+ translog_write_file_header())
+ DBUG_RETURN(1);
+
+ if (ma_control_file_write_and_force(LSN_IMPOSSIBLE, file_no,
+ CONTROL_FILE_UPDATE_ONLY_LOGNO))
+ DBUG_RETURN(1);
+
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Lock the loghandler buffer
+
+ SYNOPSIS
+ translog_buffer_lock()
+ buffer This buffer which should be locked
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+#ifndef DBUG_OFF
+static my_bool translog_buffer_lock(struct st_translog_buffer *buffer)
+{
+ int res;
+ DBUG_ENTER("translog_buffer_lock");
+ DBUG_PRINT("enter",
+ ("Lock buffer #%u: (0x%lx) mutex: 0x%lx",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ (ulong) &buffer->mutex));
+ res= (pthread_mutex_lock(&buffer->mutex) != 0);
+ DBUG_RETURN(res);
+}
+#else
+#define translog_buffer_lock(B) \
+ pthread_mutex_lock(&B->mutex)
+#endif
+
+
+/*
+ Unlock the loghandler buffer
+
+ SYNOPSIS
+ translog_buffer_unlock()
+ buffer This buffer which should be unlocked
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+#ifndef DBUG_OFF
+static my_bool translog_buffer_unlock(struct st_translog_buffer *buffer)
+{
+ int res;
+ DBUG_ENTER("translog_buffer_unlock");
+ DBUG_PRINT("enter", ("Unlock buffer... #%u (0x%lx) "
+ "mutex: 0x%lx",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ (ulong) &buffer->mutex));
+
+ res= (pthread_mutex_unlock(&buffer->mutex) != 0);
+ DBUG_PRINT("enter", ("Unlocked buffer... #%u: 0x%lx mutex: 0x%lx",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ (ulong) &buffer->mutex));
+ DBUG_RETURN(res);
+}
+#else
+#define translog_buffer_unlock(B) \
+ pthread_mutex_unlock(&B->mutex)
+#endif
+
+
+/*
+ Write a header on the page
+
+ SYNOPSIS
+ translog_new_page_header()
+ horizon Where to write the page
+ cursor Where to write the page
+
+ NOTE
+ - space for page header should be checked before
+*/
+
+static void translog_new_page_header(TRANSLOG_ADDRESS *horizon,
+ struct st_buffer_cursor *cursor)
+{
+ uchar *ptr;
+
+ DBUG_ENTER("translog_new_page_header");
+ DBUG_ASSERT(cursor->ptr);
+
+ cursor->protected= 0;
+
+ ptr= cursor->ptr;
+ /* Page number */
+ int3store(ptr, LSN_OFFSET(*horizon) / TRANSLOG_PAGE_SIZE);
+ ptr+= 3;
+ /* File number */
+ int3store(ptr, LSN_FILE_NO(*horizon));
+ ptr+= 3;
+ *(ptr++)= (uchar) log_descriptor.flags;
+ if (log_descriptor.flags & TRANSLOG_PAGE_CRC)
+ {
+#ifndef DBUG_OFF
+ DBUG_PRINT("info", ("write 0x11223344 CRC to (%lu,0x%lx)",
+ LSN_IN_PARTS(*horizon)));
+ /* This will be overwritten by real CRC; This is just for debugging */
+ int4store(ptr, 0x11223344);
+#endif
+ /* CRC will be put when page is finished */
+ ptr+= CRC_LENGTH;
+ }
+ if (log_descriptor.flags & TRANSLOG_SECTOR_PROTECTION)
+ {
+ time_t tm;
+ uint16 tmp_time= time(&tm);
+ int2store(ptr, tmp_time);
+ ptr+= (TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE) * 2;
+ }
+ {
+ uint len= (ptr - cursor->ptr);
+ (*horizon)+= len; /* it is increasing of offset part of the address */
+ cursor->current_page_fill= len;
+ if (!cursor->chaser)
+ cursor->buffer->size+= len;
+ }
+ cursor->ptr= ptr;
+ DBUG_PRINT("info", ("NewP buffer #%u: 0x%lx chaser: %d Size: %lu (%lu)",
+ (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer,
+ cursor->chaser, (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr - cursor->buffer->buffer)));
+ DBUG_EXECUTE("info", translog_check_cursor(cursor););
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Put sector protection on the page image
+
+ SYNOPSIS
+ translog_put_sector_protection()
+ page reference on the page content
+ cursor cursor of the buffer
+
+ NOTES
+ We put a sector protection on all following sectors on the page,
+ except the first sector that is protected by page header.
+*/
+
+static void translog_put_sector_protection(uchar *page,
+ struct st_buffer_cursor *cursor)
+{
+ uchar *table= page + log_descriptor.page_overhead -
+ (TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE) * 2;
+ uint16 value= uint2korr(table) + cursor->write_counter;
+ uint16 last_protected_sector= ((cursor->previous_offset - 1) /
+ DISK_DRIVE_SECTOR_SIZE);
+ uint16 start_sector= cursor->previous_offset / DISK_DRIVE_SECTOR_SIZE;
+ uint i, offset;
+ DBUG_ENTER("translog_put_sector_protection");
+
+ if (start_sector == 0)
+ start_sector= 1; /* First sector is protected */
+
+ DBUG_PRINT("enter", ("Write counter:%u value:%u offset:%u, "
+ "last protected:%u start sector:%u",
+ (uint) cursor->write_counter,
+ (uint) value,
+ (uint) cursor->previous_offset,
+ (uint) last_protected_sector, (uint) start_sector));
+ if (last_protected_sector == start_sector)
+ {
+ i= last_protected_sector * 2;
+ offset= last_protected_sector * DISK_DRIVE_SECTOR_SIZE;
+ /* restore data, because we modified sector which was protected */
+ if (offset < cursor->previous_offset)
+ page[offset]= table[i];
+ offset++;
+ if (offset < cursor->previous_offset)
+ page[offset]= table[i + 1];
+ }
+ for (i= start_sector * 2, offset= start_sector * DISK_DRIVE_SECTOR_SIZE;
+ i < (TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE) * 2;
+ (i+= 2), (offset+= DISK_DRIVE_SECTOR_SIZE))
+ {
+ DBUG_PRINT("info", ("sector:%u offset:%u data 0x%x%x",
+ i / 2, offset, (uint) page[offset],
+ (uint) page[offset + 1]));
+ table[i]= page[offset];
+ table[i + 1]= page[offset + 1];
+ int2store(page + offset, value);
+ DBUG_PRINT("info", ("sector:%u offset:%u data 0x%x%x",
+ i / 2, offset, (uint) page[offset],
+ (uint) page[offset + 1]));
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Calculate CRC32 of given area
+
+ SYNOPSIS
+ translog_crc()
+ area Pointer of the area beginning
+ length The Area length
+
+ RETURN
+ CRC32
+*/
+
+static uint32 translog_crc(uchar *area, uint length)
+{
+ DBUG_ENTER("translog_crc");
+ DBUG_RETURN(crc32(0L, (unsigned char*) area, length));
+}
+
+
+/*
+ Finish current page with zeros
+
+ SYNOPSIS
+ translog_finish_page()
+ horizon \ horizon & buffer pointers
+ cursor /
+*/
+
+static void translog_finish_page(TRANSLOG_ADDRESS *horizon,
+ struct st_buffer_cursor *cursor)
+{
+ uint16 left= TRANSLOG_PAGE_SIZE - cursor->current_page_fill;
+ uchar *page= cursor->ptr -cursor->current_page_fill;
+ DBUG_ENTER("translog_finish_page");
+ DBUG_PRINT("enter", ("Buffer: #%u 0x%lx "
+ "Buffer addr: (%lu,0x%lx) "
+ "Page addr: (%lu,0x%lx) "
+ "size:%lu (%lu) Pg:%u left:%u",
+ (uint) cursor->buffer_no, (ulong) cursor->buffer,
+ LSN_IN_PARTS(cursor->buffer->offset),
+ (ulong) LSN_FILE_NO(*horizon),
+ (ulong) (LSN_OFFSET(*horizon) -
+ cursor->current_page_fill),
+ (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr -cursor->buffer->buffer),
+ (uint) cursor->current_page_fill, (uint) left));
+ DBUG_ASSERT(LSN_FILE_NO(*horizon) == LSN_FILE_NO(cursor->buffer->offset));
+ DBUG_EXECUTE("info", translog_check_cursor(cursor););
+ if (cursor->protected)
+ {
+ DBUG_PRINT("info", ("Already protected and finished"));
+ DBUG_VOID_RETURN;
+ }
+ cursor->protected= 1;
+
+ DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE);
+ if (left != 0)
+ {
+ DBUG_PRINT("info", ("left: %u", (uint) left));
+ bzero(cursor->ptr, left);
+ cursor->ptr +=left;
+ (*horizon)+= left; /* offset increasing */
+ if (!cursor->chaser)
+ cursor->buffer->size+= left;
+ cursor->current_page_fill= 0;
+ DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx "
+ "chaser: %d Size: %lu (%lu)",
+ (uint) cursor->buffer->buffer_no,
+ (ulong) cursor->buffer, cursor->chaser,
+ (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr - cursor->buffer->buffer)));
+ DBUG_EXECUTE("info", translog_check_cursor(cursor););
+ }
+ if (page[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION)
+ {
+ translog_put_sector_protection(page, cursor);
+ DBUG_PRINT("info", ("drop write_counter"));
+ cursor->write_counter= 0;
+ cursor->previous_offset= 0;
+ }
+ if (page[TRANSLOG_PAGE_FLAGS] & TRANSLOG_PAGE_CRC)
+ {
+ uint32 crc= translog_crc(page + log_descriptor.page_overhead,
+ TRANSLOG_PAGE_SIZE -
+ log_descriptor.page_overhead);
+ DBUG_PRINT("info", ("CRC: %lx", (ulong) crc));
+ /* We have page number, file number and flag before crc */
+ int4store(page + 3 + 3 + 1, crc);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Wait until all thread finish filling this buffer
+
+ SYNOPSIS
+ translog_wait_for_writers()
+ buffer This buffer should be check
+
+ NOTE
+ This buffer should be locked
+*/
+
+static void translog_wait_for_writers(struct st_translog_buffer *buffer)
+{
+ struct st_my_thread_var *thread= my_thread_var;
+ DBUG_ENTER("translog_wait_for_writers");
+ DBUG_PRINT("enter", ("Buffer #%u 0x%lx copies in progress: %u",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ (int) buffer->copy_to_buffer_in_progress));
+
+ while (buffer->copy_to_buffer_in_progress)
+ {
+ DBUG_PRINT("info", ("wait for writers... "
+ "buffer: #%u 0x%lx "
+ "mutex: 0x%lx",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ (ulong) &buffer->mutex));
+ DBUG_ASSERT(buffer->file != -1);
+ wqueue_add_and_wait(&buffer->waiting_filling_buffer, thread,
+ &buffer->mutex);
+ DBUG_PRINT("info", ("wait for writers done "
+ "buffer: #%u 0x%lx "
+ "mutex: 0x%lx",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ (ulong) &buffer->mutex));
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+
+ Wait for buffer to become free
+
+ SYNOPSIS
+ translog_wait_for_buffer_free()
+ buffer The buffer we are waiting for
+
+ NOTE
+ - this buffer should be locked
+*/
+
+static void translog_wait_for_buffer_free(struct st_translog_buffer *buffer)
+{
+ struct st_my_thread_var *thread= my_thread_var;
+ DBUG_ENTER("translog_wait_for_buffer_free");
+ DBUG_PRINT("enter", ("Buffer: #%u 0x%lx copies in progress: %u "
+ "File: %d size: 0x%lu",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ (int) buffer->copy_to_buffer_in_progress,
+ buffer->file, (ulong) buffer->size));
+
+ translog_wait_for_writers(buffer);
+
+ while (buffer->file != -1)
+ {
+ DBUG_PRINT("info", ("wait for writers... "
+ "buffer: #%u 0x%lx "
+ "mutex: 0x%lx",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ (ulong) &buffer->mutex));
+ wqueue_add_and_wait(&buffer->waiting_filling_buffer, thread,
+ &buffer->mutex);
+ DBUG_PRINT("info", ("wait for writers done. "
+ "buffer: #%u 0x%lx "
+ "mutex: 0x%lx",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ (ulong) &buffer->mutex));
+ }
+ DBUG_ASSERT(buffer->copy_to_buffer_in_progress == 0);
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Initialize the cursor for a buffer
+
+ SYNOPSIS
+ translog_cursor_init()
+ buffer The buffer
+ cursor It's cursor
+ buffer_no Number of buffer
+*/
+
+static void translog_cursor_init(struct st_buffer_cursor *cursor,
+ struct st_translog_buffer *buffer,
+ uint8 buffer_no)
+{
+ DBUG_ENTER("translog_cursor_init");
+ cursor->ptr= buffer->buffer;
+ cursor->buffer= buffer;
+ cursor->buffer_no= buffer_no;
+ cursor->current_page_fill= 0;
+ cursor->chaser= (cursor != &log_descriptor.bc);
+ cursor->write_counter= 0;
+ cursor->previous_offset= 0;
+ cursor->protected= 0;
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Initialize buffer for current file
+
+ SYNOPSIS
+ translog_start_buffer()
+ buffer The buffer
+ cursor It's cursor
+ buffer_no Number of buffer
+*/
+
+static void translog_start_buffer(struct st_translog_buffer *buffer,
+ struct st_buffer_cursor *cursor,
+ uint buffer_no)
+{
+ DBUG_ENTER("translog_start_buffer");
+ DBUG_PRINT("enter",
+ ("Assign buffer: #%u (0x%lx) to file: %d offset: 0x%lx(%lu)",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ log_descriptor.log_file_num[0],
+ (ulong) LSN_OFFSET(log_descriptor.horizon),
+ (ulong) LSN_OFFSET(log_descriptor.horizon)));
+ DBUG_ASSERT(buffer_no == buffer->buffer_no);
+ buffer->last_lsn= LSN_IMPOSSIBLE;
+ buffer->offset= log_descriptor.horizon;
+ buffer->next_buffer_offset= LSN_IMPOSSIBLE;
+ buffer->file= log_descriptor.log_file_num[0];
+ buffer->overlay= 0;
+ buffer->size= 0;
+ translog_cursor_init(cursor, buffer, buffer_no);
+ DBUG_PRINT("info", ("init cursor #%u: 0x%lx chaser: %d Size: %lu (%lu)",
+ (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer,
+ cursor->chaser, (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr - cursor->buffer->buffer)));
+ DBUG_EXECUTE("info", translog_check_cursor(cursor););
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Switch to the next buffer in a chain
+
+ SYNOPSIS
+ translog_buffer_next()
+ horizon \ Pointers on current position in file and buffer
+ cursor /
+ next_file Also start new file
+
+ NOTE:
+ - loghandler should be locked
+ - after return new and old buffer still are locked
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_buffer_next(TRANSLOG_ADDRESS *horizon,
+ struct st_buffer_cursor *cursor,
+ my_bool new_file)
+{
+ uint old_buffer_no= cursor->buffer_no;
+ uint new_buffer_no= (old_buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+ struct st_translog_buffer *new_buffer= log_descriptor.buffers + new_buffer_no;
+ my_bool chasing= cursor->chaser;
+ DBUG_ENTER("translog_buffer_next");
+
+ DBUG_PRINT("info", ("horizon: (%lu,0x%lx) chasing: %d",
+ LSN_IN_PARTS(log_descriptor.horizon), chasing));
+
+ DBUG_ASSERT(cmp_translog_addr(log_descriptor.horizon, *horizon) >= 0);
+
+ translog_finish_page(horizon, cursor);
+
+ if (!chasing)
+ {
+ translog_buffer_lock(new_buffer);
+ translog_wait_for_buffer_free(new_buffer);
+ }
+#ifndef DBUG_OFF
+ else
+ DBUG_ASSERT(new_buffer->file != 0);
+#endif
+ if (new_file)
+ {
+
+ /* move the horizon to the next file and its header page */
+ (*horizon)+= LSN_ONE_FILE;
+ (*horizon)= LSN_REPLACE_OFFSET(*horizon, TRANSLOG_PAGE_SIZE);
+ if (!chasing && translog_create_new_file())
+ {
+ DBUG_RETURN(1);
+ }
+ }
+
+ /* prepare next page */
+ if (chasing)
+ translog_cursor_init(cursor, new_buffer, new_buffer_no);
+ else
+ translog_start_buffer(new_buffer, cursor, new_buffer_no);
+ log_descriptor.buffers[old_buffer_no].next_buffer_offset= new_buffer->offset;
+ translog_new_page_header(horizon, cursor);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Sets max LSN sent to file, and address from which data is only in the buffer
+
+ SYNOPSIS
+ translog_set_sent_to_file()
+ lsn LSN to assign
+ in_buffers to assign to in_buffers_only
+
+ TODO: use atomic operations if possible (64bit architectures?)
+*/
+
+static void translog_set_sent_to_file(LSN lsn, TRANSLOG_ADDRESS in_buffers)
+{
+ DBUG_ENTER("translog_set_sent_to_file");
+ pthread_mutex_lock(&log_descriptor.sent_to_file_lock);
+ DBUG_PRINT("enter", ("lsn: (%lu,0x%lx) in_buffers: (%lu,0x%lx) "
+ "in_buffers_only: (%lu,0x%lx)",
+ LSN_IN_PARTS(lsn),
+ LSN_IN_PARTS(in_buffers),
+ LSN_IN_PARTS(log_descriptor.in_buffers_only)));
+ DBUG_ASSERT(cmp_translog_addr(lsn, log_descriptor.sent_to_file) >= 0);
+ log_descriptor.sent_to_file= lsn;
+ /* LSN_IMPOSSIBLE == 0 => it will work for very first time */
+ if (cmp_translog_addr(in_buffers, log_descriptor.in_buffers_only) > 0)
+ {
+ log_descriptor.in_buffers_only= in_buffers;
+ DBUG_PRINT("info", ("set new in_buffers_only"));
+ }
+ pthread_mutex_unlock(&log_descriptor.sent_to_file_lock);
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Sets address from which data is only in the buffer
+
+ SYNOPSIS
+ translog_set_only_in_buffers()
+ lsn LSN to assign
+ in_buffers to assign to in_buffers_only
+*/
+
+static void translog_set_only_in_buffers(TRANSLOG_ADDRESS in_buffers)
+{
+ DBUG_ENTER("translog_set_only_in_buffers");
+ pthread_mutex_lock(&log_descriptor.sent_to_file_lock);
+ DBUG_PRINT("enter", ("in_buffers: (%lu,0x%lx) "
+ "in_buffers_only: (%lu,0x%lx)",
+ LSN_IN_PARTS(in_buffers),
+ LSN_IN_PARTS(log_descriptor.in_buffers_only)));
+ /* LSN_IMPOSSIBLE == 0 => it will work for very first time */
+ if (cmp_translog_addr(in_buffers, log_descriptor.in_buffers_only) > 0)
+ {
+ log_descriptor.in_buffers_only= in_buffers;
+ DBUG_PRINT("info", ("set new in_buffers_only"));
+ }
+ pthread_mutex_unlock(&log_descriptor.sent_to_file_lock);
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Gets address from which data is only in the buffer
+
+ SYNOPSIS
+ translog_only_in_buffers()
+
+ RETURN
+ address from which data is only in the buffer
+*/
+
+static TRANSLOG_ADDRESS translog_only_in_buffers()
+{
+ register TRANSLOG_ADDRESS addr;
+ DBUG_ENTER("translog_only_in_buffers");
+ pthread_mutex_lock(&log_descriptor.sent_to_file_lock);
+ addr= log_descriptor.in_buffers_only;
+ pthread_mutex_unlock(&log_descriptor.sent_to_file_lock);
+ DBUG_RETURN(addr);
+}
+
+
+/*
+ Get max LSN sent to file
+
+ SYNOPSIS
+ translog_get_sent_to_file()
+
+ RETURN
+ max LSN send to file
+*/
+
+static LSN translog_get_sent_to_file()
+{
+ register LSN lsn;
+ DBUG_ENTER("translog_get_sent_to_file");
+ pthread_mutex_lock(&log_descriptor.sent_to_file_lock);
+ lsn= log_descriptor.sent_to_file;
+ pthread_mutex_unlock(&log_descriptor.sent_to_file_lock);
+ DBUG_RETURN(lsn);
+}
+
+
+/*
+ Get first chunk address on the given page
+
+ SYNOPSIS
+ translog_get_first_chunk_offset()
+ page The page where to find first chunk
+
+ RETURN
+ first chunk offset
+*/
+
+static my_bool translog_get_first_chunk_offset(uchar *page)
+{
+ uint16 page_header= 7;
+ DBUG_ENTER("translog_get_first_chunk_offset");
+
+ if (page[TRANSLOG_PAGE_FLAGS] & TRANSLOG_PAGE_CRC)
+ page_header+= 4;
+ if (page[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION)
+ page_header+= (TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE) * 2;
+ DBUG_RETURN(page_header);
+}
+
+
+/*
+ Write coded length of record
+
+ SYNOPSIS
+ translog_write_variable_record_1group_code_len
+ dst Destination buffer pointer
+ length Length which should be coded
+ header_len Calculated total header length
+*/
+
+static void
+translog_write_variable_record_1group_code_len(uchar *dst,
+ translog_size_t length,
+ uint16 header_len)
+{
+ switch (header_len) {
+ case 6: /* (5 + 1) */
+ DBUG_ASSERT(length <= 250);
+ *dst= (uint8) length;
+ return;
+ case 8: /* (5 + 3) */
+ DBUG_ASSERT(length <= 0xFFFF);
+ *dst= 251;
+ int2store(dst + 1, length);
+ return;
+ case 9: /* (5 + 4) */
+ DBUG_ASSERT(length <= (ulong) 0xFFFFFF);
+ *dst= 252;
+ int3store(dst + 1, length);
+ return;
+ case 10: /* (5 + 5) */
+ *dst= 253;
+ int4store(dst + 1, length);
+ return;
+ default:
+ DBUG_ASSERT(0);
+ }
+ return;
+}
+
+
+/*
+ Decode record data length and advance given pointer to the next field
+
+ SYNOPSIS
+ translog_variable_record_1group_decode_len()
+ src The pointer to the pointer to the length beginning
+
+ RETURN
+ decoded length
+*/
+
+static translog_size_t translog_variable_record_1group_decode_len(uchar **src)
+{
+ uint8 first= (uint8) (**src);
+ switch (first) {
+ case 251:
+ (*src)+= 3;
+ return (uint2korr((*src) - 2));
+ case 252:
+ (*src)+= 4;
+ return (uint3korr((*src) - 3));
+ case 253:
+ (*src)+= 5;
+ return (uint4korr((*src) - 4));
+ case 254:
+ case 255:
+ DBUG_ASSERT(0); /* reserved for future use */
+ return (0);
+ default:
+ (*src)++;
+ return (first);
+ }
+}
+
+
+/*
+ Get total length of this chunk (not only body)
+
+ SYNOPSIS
+ translog_get_total_chunk_length()
+ page The page where chunk placed
+ offset Offset of the chunk on this place
+
+ RETURN
+ total length of the chunk
+*/
+
+static uint16 translog_get_total_chunk_length(uchar *page, uint16 offset)
+{
+ DBUG_ENTER("translog_get_total_chunk_length");
+ switch (page[offset] & TRANSLOG_CHUNK_TYPE) {
+ case TRANSLOG_CHUNK_LSN:
+ {
+ /* 0 chunk referred as LSN (head or tail) */
+ translog_size_t rec_len;
+ uchar *start= page + offset;
+ uchar *ptr= start + 1 + 2;
+ uint16 chunk_len, header_len, page_rest;
+ DBUG_PRINT("info", ("TRANSLOG_CHUNK_LSN"));
+ rec_len= translog_variable_record_1group_decode_len(&ptr);
+ chunk_len= uint2korr(ptr);
+ header_len= (ptr -start) + 2;
+ DBUG_PRINT("info", ("rec len: %lu chunk len: %u header len: %u",
+ (ulong) rec_len, (uint) chunk_len, (uint) header_len));
+ if (chunk_len)
+ {
+ DBUG_PRINT("info", ("chunk len: %u + %u = %u",
+ (uint) header_len, (uint) chunk_len,
+ (uint) (chunk_len + header_len)));
+ DBUG_RETURN(chunk_len + header_len);
+ }
+ page_rest= TRANSLOG_PAGE_SIZE - offset;
+ DBUG_PRINT("info", ("page_rest %u", (uint) page_rest));
+ if (rec_len + header_len < page_rest)
+ DBUG_RETURN(rec_len + header_len);
+ DBUG_RETURN(page_rest);
+ }
+ case TRANSLOG_CHUNK_FIXED:
+ {
+ uchar *ptr;
+ uint type= page[offset] & TRANSLOG_REC_TYPE;
+ uint length;
+ int i;
+ /* 1 (pseudo)fixed record (also LSN) */
+ DBUG_PRINT("info", ("TRANSLOG_CHUNK_FIXED"));
+ DBUG_ASSERT(log_record_type_descriptor[type].class ==
+ LOGRECTYPE_FIXEDLENGTH ||
+ log_record_type_descriptor[type].class ==
+ LOGRECTYPE_PSEUDOFIXEDLENGTH);
+ if (log_record_type_descriptor[type].class == LOGRECTYPE_FIXEDLENGTH)
+ {
+ DBUG_PRINT("info",
+ ("Fixed length: %u",
+ (uint) (log_record_type_descriptor[type].fixed_length + 3)));
+ DBUG_RETURN(log_record_type_descriptor[type].fixed_length + 3);
+ }
+
+ ptr= page + offset + 3; /* first compressed LSN */
+ length= log_record_type_descriptor[type].fixed_length + 3;
+ for (i= 0; i < log_record_type_descriptor[type].compressed_LSN; i++)
+ {
+ /* first 2 bits is length - 2 */
+ uint len= ((((uint8) (*ptr)) & TRANSLOG_CLSN_LEN_BITS) >> 6) + 2;
+ if (ptr[0] == 0 && ((uint8) ptr[1]) == 1)
+ len+= LSN_STORE_SIZE; /* case of full LSN storing */
+ ptr+= len;
+ /* subtract economized bytes */
+ length-= (LSN_STORE_SIZE - len);
+ }
+ DBUG_PRINT("info", ("Pseudo-fixed length: %u", length));
+ DBUG_RETURN(length);
+ }
+ case TRANSLOG_CHUNK_NOHDR:
+ /* 2 no header chunk (till page end) */
+ DBUG_PRINT("info", ("TRANSLOG_CHUNK_NOHDR length: %u",
+ (uint) (TRANSLOG_PAGE_SIZE - offset)));
+ DBUG_RETURN(TRANSLOG_PAGE_SIZE - offset);
+ case TRANSLOG_CHUNK_LNGTH: /* 3 chunk with chunk length */
+ DBUG_PRINT("info", ("TRANSLOG_CHUNK_LNGTH"));
+ DBUG_ASSERT(TRANSLOG_PAGE_SIZE - offset >= 3);
+ DBUG_PRINT("info", ("length: %u", uint2korr(page + offset + 1) + 3));
+ DBUG_RETURN(uint2korr(page + offset + 1) + 3);
+ default:
+ DBUG_ASSERT(0);
+ DBUG_RETURN(0);
+ }
+}
+
+
+/*
+ Flush given buffer
+
+ SYNOPSIS
+ translog_buffer_flush()
+ buffer This buffer should be flushed
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_buffer_flush(struct st_translog_buffer *buffer)
+{
+ uint32 i;
+ PAGECACHE_FILE file;
+ DBUG_ENTER("translog_buffer_flush");
+ DBUG_PRINT("enter",
+ ("Buffer: #%u 0x%lx: "
+ "file: %d offset: (%lu,0x%lx) size: %lu",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ buffer->file,
+ LSN_IN_PARTS(buffer->offset),
+ (ulong) buffer->size));
+
+ DBUG_ASSERT(buffer->file != -1);
+
+ translog_wait_for_writers(buffer);
+ if (buffer->overlay && buffer->overlay->file != -1)
+ {
+ struct st_translog_buffer *overlay= buffer->overlay;
+ translog_buffer_unlock(buffer);
+ translog_buffer_lock(overlay);
+ translog_wait_for_buffer_free(overlay);
+ translog_buffer_unlock(overlay);
+ translog_buffer_lock(buffer);
+ }
+
+ file.file= buffer->file;
+ for (i= 0; i < buffer->size; i+= TRANSLOG_PAGE_SIZE)
+ {
+ TRANSLOG_ADDRESS addr= (buffer->offset + i);
+ TRANSLOG_VALIDATOR_DATA data;
+ data.addr= &addr;
+ DBUG_ASSERT(log_descriptor.pagecache->block_size == TRANSLOG_PAGE_SIZE);
+ DBUG_ASSERT(i + TRANSLOG_PAGE_SIZE <= buffer->size);
+ if (pagecache_inject(log_descriptor.pagecache,
+ &file,
+ (LSN_OFFSET(buffer->offset) + i) / TRANSLOG_PAGE_SIZE,
+ 3,
+ buffer->buffer + i,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED, 0,
+ &translog_page_validator, (uchar*) &data))
+ {
+ UNRECOVERABLE_ERROR(("Can't write page (%lu,0x%lx) to pagecache",
+ (ulong) buffer->file,
+ (ulong) (LSN_OFFSET(buffer->offset)+ i)));
+ }
+ }
+ if (my_pwrite(buffer->file, (char*) buffer->buffer,
+ buffer->size, LSN_OFFSET(buffer->offset),
+ log_write_flags))
+ {
+ UNRECOVERABLE_ERROR(("Can't write buffer (%lu,0x%lx) size %lu "
+ "to the disk (%d)",
+ (ulong) buffer->file,
+ (ulong) LSN_OFFSET(buffer->offset),
+ (ulong) buffer->size, errno));
+ DBUG_RETURN(1);
+ }
+
+ if (LSN_OFFSET(buffer->last_lsn) != 0) /* if buffer->last_lsn is set */
+ translog_set_sent_to_file(buffer->last_lsn,
+ buffer->next_buffer_offset);
+ else
+ translog_set_only_in_buffers(buffer->next_buffer_offset);
+ /* Free buffer */
+ buffer->file= -1;
+ buffer->overlay= 0;
+ if (buffer->waiting_filling_buffer.last_thread)
+ {
+ wqueue_release_queue(&buffer->waiting_filling_buffer);
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Recover page with sector protection (wipe out failed chunks)
+
+ SYNOPSYS
+ translog_recover_page_up_to_sector()
+ page reference on the page
+ offset offset of failed sector
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_recover_page_up_to_sector(uchar *page, uint16 offset)
+{
+ uint16 chunk_offset= translog_get_first_chunk_offset(page), valid_chunk_end;
+ DBUG_ENTER("translog_recover_page_up_to_sector");
+ DBUG_PRINT("enter", ("offset: %u first chunk: %u",
+ (uint) offset, (uint) chunk_offset));
+
+ while (page[chunk_offset] != '\0' && chunk_offset < offset)
+ {
+ uint16 chunk_length;
+ if ((chunk_length=
+ translog_get_total_chunk_length(page, chunk_offset)) == 0)
+ {
+ UNRECOVERABLE_ERROR(("cant get chunk length (offset %u)",
+ (uint) chunk_offset));
+ DBUG_RETURN(1);
+ }
+ DBUG_PRINT("info", ("chunk: offset: %u length %u",
+ (uint) chunk_offset, (uint) chunk_length));
+ if (((ulong) chunk_offset) + ((ulong) chunk_length) > TRANSLOG_PAGE_SIZE)
+ {
+ UNRECOVERABLE_ERROR(("damaged chunk (offset %u) in trusted area",
+ (uint) chunk_offset));
+ DBUG_RETURN(1);
+ }
+ chunk_offset+= chunk_length;
+ }
+
+ valid_chunk_end= chunk_offset;
+ /* end of trusted area - sector parsing */
+ while (page[chunk_offset] != '\0')
+ {
+ uint16 chunk_length;
+ if ((chunk_length=
+ translog_get_total_chunk_length(page, chunk_offset)) == 0)
+ break;
+
+ DBUG_PRINT("info", ("chunk: offset: %u length %u",
+ (uint) chunk_offset, (uint) chunk_length));
+ if (((ulong) chunk_offset) + ((ulong) chunk_length) >
+ (uint) (offset + DISK_DRIVE_SECTOR_SIZE))
+ break;
+
+ chunk_offset+= chunk_length;
+ valid_chunk_end= chunk_offset;
+ }
+ DBUG_PRINT("info", ("valid chunk end offset: %u", (uint) valid_chunk_end));
+
+ bzero(page + valid_chunk_end, TRANSLOG_PAGE_SIZE - valid_chunk_end);
+
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Log page validator
+
+ SYNOPSIS
+ translog_page_validator()
+ page_addr The page to check
+ data data, need for validation (address in this case)
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+static my_bool translog_page_validator(uchar *page_addr, uchar* data_ptr)
+{
+ uint this_page_page_overhead;
+ uint flags;
+ uchar *page= (uchar*) page_addr, *page_pos;
+ TRANSLOG_VALIDATOR_DATA *data= (TRANSLOG_VALIDATOR_DATA *) data_ptr;
+ TRANSLOG_ADDRESS addr= *(data->addr);
+ DBUG_ENTER("translog_page_validator");
+
+ data->was_recovered= 0;
+
+ if (uint3korr(page) != LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE ||
+ uint3korr(page + 3) != LSN_FILE_NO(addr))
+ {
+ UNRECOVERABLE_ERROR(("Page (%lu,0x%lx): "
+ "page address written in the page is incorrect: "
+ "File %lu instead of %lu or page %lu instead of %lu",
+ LSN_IN_PARTS(addr),
+ (ulong) uint3korr(page + 3), (ulong) LSN_FILE_NO(addr),
+ (ulong) uint3korr(page),
+ (ulong) LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE));
+ DBUG_RETURN(1);
+ }
+ flags= (uint)(page[TRANSLOG_PAGE_FLAGS]);
+ this_page_page_overhead= page_overhead[flags];
+ if (flags & ~(TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION |
+ TRANSLOG_RECORD_CRC))
+ {
+ UNRECOVERABLE_ERROR(("Page (%lu,0x%lx): "
+ "Garbage in the page flags field detected : %x",
+ LSN_IN_PARTS(addr), (uint) flags));
+ DBUG_RETURN(1);
+ }
+ page_pos= page + (3 + 3 + 1);
+ if (flags & TRANSLOG_PAGE_CRC)
+ {
+ uint32 crc= translog_crc(page + this_page_page_overhead,
+ TRANSLOG_PAGE_SIZE -
+ this_page_page_overhead);
+ if (crc != uint4korr(page_pos))
+ {
+ UNRECOVERABLE_ERROR(("Page (%lu,0x%lx): "
+ "CRC mismatch: calculated: %lx on the page %lx",
+ LSN_IN_PARTS(addr),
+ (ulong) crc, (ulong) uint4korr(page_pos)));
+ DBUG_RETURN(1);
+ }
+ page_pos+= CRC_LENGTH; /* Skip crc */
+ }
+ if (flags & TRANSLOG_SECTOR_PROTECTION)
+ {
+ uint i, offset;
+ uchar *table= page_pos;
+ uint16 current= uint2korr(table);
+ for (i= 2, offset= DISK_DRIVE_SECTOR_SIZE;
+ i < (TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE) * 2;
+ i+= 2, offset+= DISK_DRIVE_SECTOR_SIZE)
+ {
+ /*
+ TODO: add chunk counting for "suspecting" sectors (difference is
+ more than 1-2)
+ */
+ uint16 test= uint2korr(page + offset);
+ DBUG_PRINT("info", ("sector: #%u offset: %u current: %lx "
+ "read: 0x%x stored: 0x%x%x",
+ i / 2, offset, (ulong) current,
+ (uint) uint2korr(page + offset), (uint) table[i],
+ (uint) table[i + 1]));
+ if (((test < current) &&
+ (LL(0xFFFF) - current + test > DISK_DRIVE_SECTOR_SIZE / 3)) ||
+ ((test >= current) &&
+ (test - current > DISK_DRIVE_SECTOR_SIZE / 3)))
+ {
+ if (translog_recover_page_up_to_sector(page, offset))
+ DBUG_RETURN(1);
+ data->was_recovered= 1;
+ DBUG_RETURN(0);
+ }
+
+ /* Return value on the page */
+ page[offset]= table[i];
+ page[offset + 1]= table[i + 1];
+ current= test;
+ DBUG_PRINT("info", ("sector: #%u offset: %u current: %lx "
+ "read: 0x%x stored: 0x%x%x",
+ i / 2, offset, (ulong) current,
+ (uint) uint2korr(page + offset), (uint) table[i],
+ (uint) table[i + 1]));
+ }
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Lock the loghandler
+
+ SYNOPSIS
+ translog_lock()
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+my_bool translog_lock()
+{
+ struct st_translog_buffer *current_buffer;
+ DBUG_ENTER("translog_lock");
+
+ /*
+ Locking the loghandler mean locking current buffer, but it can change
+ during locking, so we should check it
+ */
+ for (;;)
+ {
+ current_buffer= log_descriptor.bc.buffer;
+ if (translog_buffer_lock(current_buffer))
+ DBUG_RETURN(1);
+ if (log_descriptor.bc.buffer == current_buffer)
+ break;
+ translog_buffer_unlock(current_buffer);
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Unlock the loghandler
+
+ SYNOPSIS
+ translog_unlock()
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+my_bool translog_unlock()
+{
+ DBUG_ENTER("translog_unlock");
+ translog_buffer_unlock(log_descriptor.bc.buffer);
+
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Get log page by file number and offset of the beginning of the page
+
+ SYNOPSIS
+ translog_get_page()
+ data validator data, which contains the page address
+ buffer buffer for page placing
+ (might not be used in some cache implementations)
+
+ RETURN
+ NULL - Error
+ # pointer to the page cache which should be used to read this page
+*/
+
+static uchar *translog_get_page(TRANSLOG_VALIDATOR_DATA *data, uchar *buffer)
+{
+ TRANSLOG_ADDRESS addr= *(data->addr), in_buffers;
+ uint cache_index;
+ uint32 file_no= LSN_FILE_NO(addr);
+ DBUG_ENTER("translog_get_page");
+ DBUG_PRINT("enter", ("File: %lu Offset: %lu(0x%lx)",
+ (ulong) file_no,
+ (ulong) LSN_OFFSET(addr),
+ (ulong) LSN_OFFSET(addr)));
+
+ /* it is really page address */
+ DBUG_ASSERT(LSN_OFFSET(addr) % TRANSLOG_PAGE_SIZE == 0);
+
+ in_buffers= translog_only_in_buffers();
+ DBUG_PRINT("info", ("in_buffers: (%lu,0x%lx)",
+ LSN_IN_PARTS(in_buffers)));
+ if (in_buffers != LSN_IMPOSSIBLE &&
+ cmp_translog_addr(addr, in_buffers) >= 0)
+ {
+ translog_lock();
+ /* recheck with locked loghandler */
+ in_buffers= translog_only_in_buffers();
+ if (cmp_translog_addr(addr, in_buffers) >= 0)
+ {
+ uint16 buffer_no= log_descriptor.bc.buffer_no;
+ uint16 buffer_start= buffer_no;
+ struct st_translog_buffer *buffer_unlock= log_descriptor.bc.buffer;
+ struct st_translog_buffer *curr_buffer= log_descriptor.bc.buffer;
+ for (;;)
+ {
+ /*
+ if the page is in the buffer and it is the last version of the
+ page (in case of devision the page bu buffer flush
+ */
+ if (curr_buffer->file != -1 &&
+ cmp_translog_addr(addr, curr_buffer->offset) >= 0 &&
+ cmp_translog_addr(addr,
+ (curr_buffer->next_buffer_offset ?
+ curr_buffer->next_buffer_offset:
+ curr_buffer->offset + curr_buffer->size)) < 0)
+ {
+ int is_last_unfinished_page;
+ uint last_protected_sector= 0;
+ uchar *from, *table= NULL;
+ translog_wait_for_writers(curr_buffer);
+ DBUG_ASSERT(LSN_FILE_NO(addr) == LSN_FILE_NO(curr_buffer->offset));
+ from= curr_buffer->buffer + (addr - curr_buffer->offset);
+ memcpy(buffer, from, TRANSLOG_PAGE_SIZE);
+ is_last_unfinished_page= ((log_descriptor.bc.buffer ==
+ curr_buffer) &&
+ (log_descriptor.bc.ptr >= from) &&
+ (log_descriptor.bc.ptr <
+ from + TRANSLOG_PAGE_SIZE));
+ if (is_last_unfinished_page &&
+ (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION))
+ {
+ last_protected_sector= ((log_descriptor.bc.previous_offset - 1) /
+ DISK_DRIVE_SECTOR_SIZE);
+ table= buffer + log_descriptor.page_overhead -
+ (TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE) * 2;
+ }
+
+ DBUG_ASSERT(buffer_unlock == curr_buffer);
+ translog_buffer_unlock(buffer_unlock);
+ if (is_last_unfinished_page)
+ {
+ uint i;
+ /*
+ This is last unfinished page => we should not check CRC and
+ remove only that protection which already installed (no need
+ to check it)
+
+ We do not check the flag of sector protection, because if
+ (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION) is
+ not set then last_protected_sector will be 0 so following loop
+ will be never executed
+ */
+ DBUG_PRINT("info", ("This is last unfinished page, "
+ "last protected sector %u",
+ last_protected_sector));
+ for (i= 1; i <= last_protected_sector; i++)
+ {
+ uint index= i * 2;
+ uint offset= i * DISK_DRIVE_SECTOR_SIZE;
+ DBUG_PRINT("info", ("Sector %u: 0x%02x%02x <- 0x%02x%02x",
+ i, buffer[offset], buffer[offset + 1],
+ table[index], table[index + 1]));
+ buffer[offset]= table[index];
+ buffer[offset + 1]= table[index + 1];
+ }
+ }
+ else
+ {
+ /*
+ This IF should be true because we use in-memory data which
+ supposed to be correct.
+ */
+ if (translog_page_validator((uchar*) buffer, (uchar*) data))
+ buffer= NULL;
+ }
+ DBUG_RETURN(buffer);
+ }
+ buffer_no= (buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+ curr_buffer= log_descriptor.buffers + buffer_no;
+ translog_buffer_lock(curr_buffer);
+ translog_buffer_unlock(buffer_unlock);
+ buffer_unlock= curr_buffer;
+ /* we can't make full circle */
+ DBUG_ASSERT(buffer_start != buffer_no);
+ }
+ }
+ translog_unlock();
+ }
+ if ((cache_index= LSN_FILE_NO(log_descriptor.horizon) - file_no) <
+ OPENED_FILES_NUM)
+ {
+ PAGECACHE_FILE file;
+ /* file in the cache */
+ if (log_descriptor.log_file_num[cache_index] == -1)
+ {
+ if ((log_descriptor.log_file_num[cache_index]=
+ open_logfile_by_number_no_cache(file_no)) == -1)
+ DBUG_RETURN(NULL);
+ }
+ file.file= log_descriptor.log_file_num[cache_index];
+
+ buffer= (uchar*)
+ pagecache_valid_read(log_descriptor.pagecache, &file,
+ LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
+ 3, (char*) buffer,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, 0,
+ &translog_page_validator, (uchar*) data);
+ }
+ else
+ {
+ /*
+ TODO: WE KEEP THE LAST OPENED_FILES_NUM FILES IN THE LOG CACHE, NOT
+ THE LAST USED FILES. THIS WILL BE A NOTABLE PROBLEM IF WE ARE
+ FOLLOWING AN UNDO CHAIN THAT GOES OVER MANY OLD LOG FILES. WE WILL
+ PROBABLY NEED SPECIAL HANDLING OF THIS OR HAVE A FILO FOR THE LOG
+ FILES.
+ */
+
+ File file= open_logfile_by_number_no_cache(file_no);
+ if (file == -1)
+ DBUG_RETURN(NULL);
+ if (my_pread(file, (char*) buffer, TRANSLOG_PAGE_SIZE,
+ LSN_OFFSET(addr), MYF(MY_FNABP | MY_WME)))
+ buffer= NULL;
+ else if (translog_page_validator((uchar*) buffer, (uchar*) data))
+ buffer= NULL;
+ my_close(file, MYF(MY_WME));
+ }
+ DBUG_RETURN(buffer);
+}
+
+
+/*
+ Finds last page of the given log file
+
+ SYNOPSIS
+ translog_get_last_page_addr()
+ addr address structure to fill with data, which contain
+ file number of the log file
+ last_page_ok assigned 1 if last page was OK
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_get_last_page_addr(TRANSLOG_ADDRESS *addr,
+ my_bool *last_page_ok)
+{
+ MY_STAT stat_buff, *stat;
+ char path[FN_REFLEN];
+ uint32 rec_offset;
+ uint32 file_no= LSN_FILE_NO(*addr);
+ DBUG_ENTER("translog_get_last_page_addr");
+
+ if (!(stat= my_stat(translog_filename_by_fileno(file_no, path),
+ &stat_buff, MYF(MY_WME))))
+ DBUG_RETURN(1);
+ DBUG_PRINT("info", ("File size: %lu", (ulong) stat->st_size));
+ if (stat->st_size > TRANSLOG_PAGE_SIZE)
+ {
+ rec_offset= (((stat->st_size / TRANSLOG_PAGE_SIZE) - 1) *
+ TRANSLOG_PAGE_SIZE);
+ *last_page_ok= (stat->st_size == rec_offset + TRANSLOG_PAGE_SIZE);
+ }
+ else
+ {
+ *last_page_ok= 0;
+ rec_offset= 0;
+ }
+ *addr= MAKE_LSN(file_no, rec_offset);
+ DBUG_PRINT("info", ("Last page: 0x%lx ok: %d", (ulong) rec_offset,
+ *last_page_ok));
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Get number bytes for record length storing
+
+ SYNOPSIS
+ translog_variable_record_length_bytes()
+ length Record length wich will be codded
+
+ RETURN
+ 1,3,4,5 - number of bytes to store given length
+*/
+
+static uint translog_variable_record_length_bytes(translog_size_t length)
+{
+ if (length < 250)
+ return 1;
+ if (length < 0xFFFF)
+ return 3;
+ if (length < (ulong) 0xFFFFFF)
+ return 4;
+ return 5;
+}
+
+
+/*
+ Get header of this chunk
+
+ SYNOPSIS
+ translog_get_chunk_header_length()
+ page The page where chunk placed
+ offset Offset of the chunk on this place
+
+ RETURN
+ # total length of the chunk
+ 0 Error
+*/
+
+static uint16 translog_get_chunk_header_length(uchar *page, uint16 offset)
+{
+ DBUG_ENTER("translog_get_chunk_header_length");
+ page+= offset;
+ switch (*page & TRANSLOG_CHUNK_TYPE) {
+ case TRANSLOG_CHUNK_LSN:
+ {
+ /* 0 chunk referred as LSN (head or tail) */
+ translog_size_t rec_len;
+ uchar *start= page;
+ uchar *ptr= start + 1 + 2;
+ uint16 chunk_len, header_len;
+ DBUG_PRINT("info", ("TRANSLOG_CHUNK_LSN"));
+ rec_len= translog_variable_record_1group_decode_len(&ptr);
+ chunk_len= uint2korr(ptr);
+ header_len= (ptr - start) +2;
+ DBUG_PRINT("info", ("rec len: %lu chunk len: %u header len: %u",
+ (ulong) rec_len, (uint) chunk_len, (uint) header_len));
+ if (chunk_len)
+ {
+ /* TODO: fine header end */
+ DBUG_ASSERT(0);
+ DBUG_RETURN(0); /* Keep compiler happy */
+ }
+ DBUG_RETURN(header_len);
+ }
+ case TRANSLOG_CHUNK_FIXED:
+ {
+ /* 1 (pseudo)fixed record (also LSN) */
+ DBUG_PRINT("info", ("TRANSLOG_CHUNK_FIXED = 3"));
+ DBUG_RETURN(3);
+ }
+ case TRANSLOG_CHUNK_NOHDR:
+ /* 2 no header chunk (till page end) */
+ DBUG_PRINT("info", ("TRANSLOG_CHUNK_NOHDR = 1"));
+ DBUG_RETURN(1);
+ break;
+ case TRANSLOG_CHUNK_LNGTH:
+ /* 3 chunk with chunk length */
+ DBUG_PRINT("info", ("TRANSLOG_CHUNK_LNGTH = 3"));
+ DBUG_RETURN(3);
+ break;
+ default:
+ DBUG_ASSERT(0);
+ DBUG_RETURN(0); /* Keep compiler happy */
+ }
+}
+
+
+/*
+ Initialize transaction log
+
+ SYNOPSIS
+ translog_init()
+ directory Directory where log files are put
+ log_file_max_size max size of one log size (for new logs creation)
+ server_version version of MySQL server (MYSQL_VERSION_ID)
+ server_id server ID (replication & Co)
+ pagecache Page cache for the log reads
+ flags flags (TRANSLOG_PAGE_CRC, TRANSLOG_SECTOR_PROTECTION
+ TRANSLOG_RECORD_CRC)
+
+ TODO
+ Free used resources in case of error.
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+my_bool translog_init(const char *directory,
+ uint32 log_file_max_size,
+ uint32 server_version,
+ uint32 server_id, PAGECACHE *pagecache, uint flags)
+{
+ int i;
+ int old_log_was_recovered= 0, logs_found= 0;
+ uint old_flags= flags;
+ TRANSLOG_ADDRESS sure_page, last_page, last_valid_page;
+ my_bool version_changed= 0;
+ DBUG_ENTER("translog_init");
+ DBUG_ASSERT(translog_inited == 0);
+
+ loghandler_init(); /* Safe to do many times */
+
+ if (pthread_mutex_init(&log_descriptor.sent_to_file_lock,
+ MY_MUTEX_INIT_FAST) ||
+ pthread_mutex_init(&log_descriptor.file_header_lock,
+ MY_MUTEX_INIT_FAST) ||
+ pthread_mutex_init(&log_descriptor.unfinished_files_lock,
+ MY_MUTEX_INIT_FAST) ||
+ pthread_mutex_init(&log_descriptor.purger_lock,
+ MY_MUTEX_INIT_FAST) ||
+ pthread_mutex_init(&log_descriptor.log_flush_lock,
+ MY_MUTEX_INIT_FAST) ||
+ init_dynamic_array(&log_descriptor.unfinished_files,
+ sizeof(struct st_file_counter),
+ 10, 10 CALLER_INFO))
+ DBUG_RETURN(1);
+ log_descriptor.min_file_number= 0;
+ log_descriptor.last_lsn_checked= LSN_IMPOSSIBLE;
+
+ /* Directory to store files */
+ unpack_dirname(log_descriptor.directory, directory);
+
+ if ((log_descriptor.directory_fd= my_open(log_descriptor.directory,
+ O_RDONLY, MYF(MY_WME))) < 0)
+ {
+ UNRECOVERABLE_ERROR(("Error %d during opening directory '%s'",
+ errno, log_descriptor.directory));
+ DBUG_RETURN(1);
+ }
+
+ log_descriptor.in_buffers_only= LSN_IMPOSSIBLE;
+ /* max size of one log size (for new logs creation) */
+ log_descriptor.log_file_max_size=
+ log_file_max_size - (log_file_max_size % TRANSLOG_PAGE_SIZE);
+ /* server version */
+ log_descriptor.server_version= server_version;
+ /* server ID */
+ log_descriptor.server_id= server_id;
+ /* Page cache for the log reads */
+ log_descriptor.pagecache= pagecache;
+ /* Flags */
+ DBUG_ASSERT((flags &
+ ~(TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION |
+ TRANSLOG_RECORD_CRC)) == 0);
+ log_descriptor.flags= flags;
+ for (i= 0; i < TRANSLOG_FLAGS_NUM; i++)
+ {
+ page_overhead[i]= 7;
+ if (i & TRANSLOG_PAGE_CRC)
+ page_overhead[i]+= CRC_LENGTH;
+ if (i & TRANSLOG_SECTOR_PROTECTION)
+ page_overhead[i]+= (TRANSLOG_PAGE_SIZE /
+ DISK_DRIVE_SECTOR_SIZE) * 2;
+ }
+ log_descriptor.page_overhead= page_overhead[flags];
+ log_descriptor.page_capacity_chunk_2=
+ TRANSLOG_PAGE_SIZE - log_descriptor.page_overhead - 1;
+ DBUG_ASSERT(TRANSLOG_WRITE_BUFFER % TRANSLOG_PAGE_SIZE == 0);
+ log_descriptor.buffer_capacity_chunk_2=
+ (TRANSLOG_WRITE_BUFFER / TRANSLOG_PAGE_SIZE) *
+ log_descriptor.page_capacity_chunk_2;
+ log_descriptor.half_buffer_capacity_chunk_2=
+ log_descriptor.buffer_capacity_chunk_2 / 2;
+ DBUG_PRINT("info",
+ ("Overhead: %u pc2: %u bc2: %u, bc2/2: %u",
+ log_descriptor.page_overhead,
+ log_descriptor.page_capacity_chunk_2,
+ log_descriptor.buffer_capacity_chunk_2,
+ log_descriptor.half_buffer_capacity_chunk_2));
+
+ /* *** Current state of the log handler *** */
+
+ /* Init log handler file handlers cache */
+ for (i= 0; i < OPENED_FILES_NUM; i++)
+ log_descriptor.log_file_num[i]= -1;
+
+ /* just to init it somehow */
+ translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0);
+
+ /* Buffers for log writing */
+ for (i= 0; i < TRANSLOG_BUFFERS_NO; i++)
+ {
+ if (translog_buffer_init(log_descriptor.buffers + i))
+ DBUG_RETURN(1);
+#ifndef DBUG_OFF
+ log_descriptor.buffers[i].buffer_no= (uint8) i;
+#endif
+ DBUG_PRINT("info", ("translog_buffer buffer #%u: 0x%lx",
+ i, (ulong) log_descriptor.buffers + i));
+ }
+
+ logs_found= (last_logno != FILENO_IMPOSSIBLE);
+
+ if (logs_found)
+ {
+ my_bool pageok;
+ /*
+ TODO: scan directory for maria_log.XXXXXXXX files and find
+ highest XXXXXXXX & set logs_found
+ TODO: check that last checkpoint within present log addresses space
+
+ find the log end
+ */
+ if (LSN_FILE_NO(last_checkpoint_lsn) == FILENO_IMPOSSIBLE)
+ {
+ DBUG_ASSERT(LSN_OFFSET(last_checkpoint_lsn) == 0);
+ /* there was no checkpoints we will read from the beginning */
+ sure_page= (LSN_ONE_FILE | TRANSLOG_PAGE_SIZE);
+ }
+ else
+ {
+ sure_page= last_checkpoint_lsn;
+ DBUG_ASSERT(LSN_OFFSET(sure_page) % TRANSLOG_PAGE_SIZE != 0);
+ sure_page-= LSN_OFFSET(sure_page) % TRANSLOG_PAGE_SIZE;
+ }
+ log_descriptor.horizon= last_page= MAKE_LSN(last_logno,0);
+ if (translog_get_last_page_addr(&last_page, &pageok))
+ DBUG_RETURN(1);
+ if (LSN_OFFSET(last_page) == 0)
+ {
+ if (LSN_FILE_NO(last_page) == 1)
+ {
+ logs_found= 0; /* file #1 has no pages */
+ }
+ else
+ {
+ last_page-= LSN_ONE_FILE;
+ if (translog_get_last_page_addr(&last_page, &pageok))
+ DBUG_RETURN(1);
+ }
+ }
+ }
+ if (logs_found)
+ {
+ TRANSLOG_ADDRESS current_page= sure_page;
+ my_bool pageok;
+
+ DBUG_ASSERT(sure_page <= last_page);
+
+ /* TODO: check page size */
+
+ last_valid_page= LSN_IMPOSSIBLE;
+ /* scan and validate pages */
+ do
+ {
+ TRANSLOG_ADDRESS current_file_last_page;
+ current_file_last_page= current_page;
+ if (translog_get_last_page_addr(&current_file_last_page, &pageok))
+ DBUG_RETURN(1);
+ if (!pageok)
+ {
+ DBUG_PRINT("error", ("File %lu have no complete last page",
+ (ulong) LSN_FILE_NO(current_file_last_page)));
+ old_log_was_recovered= 1;
+ /* This file is not written till the end so it should be last */
+ last_page= current_file_last_page;
+ /* TODO: issue warning */
+ }
+ do
+ {
+ TRANSLOG_VALIDATOR_DATA data;
+ uchar buffer[TRANSLOG_PAGE_SIZE], *page;
+ data.addr= &current_page;
+ if ((page= translog_get_page(&data, buffer)) == NULL)
+ DBUG_RETURN(1);
+ if (data.was_recovered)
+ {
+ DBUG_PRINT("error", ("file no: %lu (%d) "
+ "rec_offset: 0x%lx (%lu) (%d)",
+ (ulong) LSN_FILE_NO(current_page),
+ (uint3korr(page + 3) !=
+ LSN_FILE_NO(current_page)),
+ (ulong) LSN_OFFSET(current_page),
+ (ulong) (LSN_OFFSET(current_page) /
+ TRANSLOG_PAGE_SIZE),
+ (uint3korr(page) !=
+ LSN_OFFSET(current_page) /
+ TRANSLOG_PAGE_SIZE)));
+ old_log_was_recovered= 1;
+ break;
+ }
+ old_flags= page[TRANSLOG_PAGE_FLAGS];
+ last_valid_page= current_page;
+ current_page+= TRANSLOG_PAGE_SIZE; /* increase offset */
+ } while (current_page <= current_file_last_page);
+ current_page+= LSN_ONE_FILE;
+ current_page= LSN_REPLACE_OFFSET(current_page, TRANSLOG_PAGE_SIZE);
+ } while (LSN_FILE_NO(current_page) <= LSN_FILE_NO(last_page) &&
+ !old_log_was_recovered);
+ if (last_valid_page == LSN_IMPOSSIBLE)
+ {
+ /* Panic!!! Even page which should be valid is invalid */
+ /* TODO: issue error */
+ DBUG_RETURN(1);
+ }
+ DBUG_PRINT("info", ("Last valid page is in file: %lu "
+ "offset: %lu (0x%lx) "
+ "Logs found: %d was recovered: %d "
+ "flags match: %d",
+ (ulong) LSN_FILE_NO(last_valid_page),
+ (ulong) LSN_OFFSET(last_valid_page),
+ (ulong) LSN_OFFSET(last_valid_page),
+ logs_found, old_log_was_recovered,
+ (old_flags == flags)));
+
+ /* TODO: check server ID */
+ if (logs_found && !old_log_was_recovered && old_flags == flags)
+ {
+ TRANSLOG_VALIDATOR_DATA data;
+ uchar buffer[TRANSLOG_PAGE_SIZE], *page;
+ uint16 chunk_offset;
+ data.addr= &last_valid_page;
+ /* continue old log */
+ DBUG_ASSERT(LSN_FILE_NO(last_valid_page)==
+ LSN_FILE_NO(log_descriptor.horizon));
+ if ((page= translog_get_page(&data, buffer)) == NULL ||
+ (chunk_offset= translog_get_first_chunk_offset(page)) == 0)
+ DBUG_RETURN(1);
+
+ /* Puts filled part of old page in the buffer */
+ log_descriptor.horizon= last_valid_page;
+ translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0);
+ /*
+ Free space if filled with 0 and first uchar of
+ real chunk can't be 0
+ */
+ while (chunk_offset < TRANSLOG_PAGE_SIZE && page[chunk_offset] != '\0')
+ {
+ uint16 chunk_length;
+ if ((chunk_length=
+ translog_get_total_chunk_length(page, chunk_offset)) == 0)
+ DBUG_RETURN(1);
+ DBUG_PRINT("info", ("chunk: offset: %u length: %u",
+ (uint) chunk_offset, (uint) chunk_length));
+ chunk_offset+= chunk_length;
+
+ /* chunk can't cross the page border */
+ DBUG_ASSERT(chunk_offset <= TRANSLOG_PAGE_SIZE);
+ }
+ memcpy(log_descriptor.buffers->buffer, page, chunk_offset);
+ log_descriptor.bc.buffer->size+= chunk_offset;
+ log_descriptor.bc.ptr+= chunk_offset;
+ log_descriptor.bc.current_page_fill= chunk_offset;
+ log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon,
+ (chunk_offset +
+ LSN_OFFSET(last_valid_page)));
+ DBUG_PRINT("info", ("Move Page #%u: 0x%lx chaser: %d Size: %lu (%lu)",
+ (uint) log_descriptor.bc.buffer_no,
+ (ulong) log_descriptor.bc.buffer,
+ log_descriptor.bc.chaser,
+ (ulong) log_descriptor.bc.buffer->size,
+ (ulong) (log_descriptor.bc.ptr - log_descriptor.bc.
+ buffer->buffer)));
+ DBUG_EXECUTE("info", translog_check_cursor(&log_descriptor.bc););
+ }
+ if (!old_log_was_recovered && old_flags == flags)
+ {
+ LOGHANDLER_FILE_INFO info;
+ if (translog_read_file_header(&info, log_descriptor.log_file_num[0]))
+ DBUG_RETURN(1);
+ version_changed= (info.maria_version != TRANSLOG_VERSION_ID);
+ }
+ }
+ DBUG_PRINT("info", ("Logs found: %d was recovered: %d",
+ logs_found, old_log_was_recovered));
+ if (!logs_found)
+ {
+ /* Start new log system from scratch */
+ /* Used space */
+ log_descriptor.horizon= MAKE_LSN(1, TRANSLOG_PAGE_SIZE); /* header page */
+ /* Current logs file number in page cache */
+ if ((log_descriptor.log_file_num[0]=
+ open_logfile_by_number_no_cache(1)) == -1 ||
+ translog_write_file_header())
+ DBUG_RETURN(1);
+ if (ma_control_file_write_and_force(LSN_IMPOSSIBLE, 1,
+ CONTROL_FILE_UPDATE_ONLY_LOGNO))
+ DBUG_RETURN(1);
+ /* assign buffer 0 */
+ translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0);
+ translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc);
+ }
+ else if (old_log_was_recovered || old_flags != flags || version_changed)
+ {
+ /* leave the damaged file untouched */
+ log_descriptor.horizon+= LSN_ONE_FILE;
+ /* header page */
+ log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon,
+ TRANSLOG_PAGE_SIZE);
+ if (translog_create_new_file())
+ DBUG_RETURN(1);
+ /*
+ Buffer system left untouched after recovery => we should init it
+ (starting from buffer 0)
+ */
+ translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0);
+ translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc);
+ }
+
+ /* all LSNs that are on disk are flushed */
+ log_descriptor.sent_to_file=
+ log_descriptor.flushed= log_descriptor.horizon;
+ log_descriptor.in_buffers_only= log_descriptor.bc.buffer->offset;
+ log_descriptor.max_lsn= LSN_IMPOSSIBLE; /* set to 0 */
+ /*
+ horizon is (potentially) address of the next LSN we need decrease
+ it to signal that all LSNs before it are flushed
+ */
+ log_descriptor.flushed--; /* offset decreased */
+ log_descriptor.sent_to_file--; /* offset decreased */
+ /*
+ Log records will refer to a MARIA_SHARE by a unique 2-byte id; set up
+ structures for generating 2-byte ids:
+ */
+ my_atomic_rwlock_init(&LOCK_id_to_share);
+ id_to_share= (MARIA_SHARE **) my_malloc(SHARE_ID_MAX * sizeof(MARIA_SHARE*),
+ MYF(MY_WME | MY_ZEROFILL));
+ if (unlikely(!id_to_share))
+ DBUG_RETURN(1);
+ id_to_share--; /* min id is 1 */
+ translog_inited= 1;
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Free transaction log file buffer
+
+ SYNOPSIS
+ translog_buffer_destroy()
+ buffer_no The buffer to free
+
+ NOTE
+ This buffer should be locked
+*/
+
+static void translog_buffer_destroy(struct st_translog_buffer *buffer)
+{
+ DBUG_ENTER("translog_buffer_destroy");
+ DBUG_PRINT("enter",
+ ("Buffer #%u: 0x%lx file: %d offset: (%lu,0x%lx) size: %lu",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ buffer->file,
+ LSN_IN_PARTS(buffer->offset),
+ (ulong) buffer->size));
+ DBUG_ASSERT(buffer->waiting_filling_buffer.last_thread == 0);
+ if (buffer->file != -1)
+ {
+ /*
+ We ignore errors here, because we can't do something about it
+ (it is shutting down)
+ */
+ translog_buffer_flush(buffer);
+ }
+ DBUG_PRINT("info", ("Destroy mutex: 0x%lx", (ulong) &buffer->mutex));
+ pthread_mutex_destroy(&buffer->mutex);
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Free log handler resources
+
+ SYNOPSIS
+ translog_destroy()
+*/
+
+void translog_destroy()
+{
+ uint i;
+ DBUG_ENTER("translog_destroy");
+
+ if (translog_inited)
+ {
+ if (log_descriptor.bc.buffer->file != -1)
+ translog_finish_page(&log_descriptor.horizon, &log_descriptor.bc);
+
+ for (i= 0; i < TRANSLOG_BUFFERS_NO; i++)
+ {
+ struct st_translog_buffer *buffer= log_descriptor.buffers + i;
+ translog_buffer_destroy(buffer);
+ }
+
+ /* close files */
+ for (i= 0; i < OPENED_FILES_NUM; i++)
+ {
+ if (log_descriptor.log_file_num[i] != -1)
+ translog_close_log_file(log_descriptor.log_file_num[i]);
+ }
+ pthread_mutex_destroy(&log_descriptor.sent_to_file_lock);
+ pthread_mutex_destroy(&log_descriptor.file_header_lock);
+ pthread_mutex_destroy(&log_descriptor.unfinished_files_lock);
+ pthread_mutex_destroy(&log_descriptor.purger_lock);
+ pthread_mutex_destroy(&log_descriptor.log_flush_lock);
+ delete_dynamic(&log_descriptor.unfinished_files);
+
+ my_close(log_descriptor.directory_fd, MYF(MY_WME));
+ my_atomic_rwlock_destroy(&LOCK_id_to_share);
+ my_free((uchar*)(id_to_share + 1), MYF(MY_ALLOW_ZERO_PTR));
+ translog_inited= 0;
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+
+
+#define translog_buffer_lock_assert_owner(B) \
+ safe_mutex_assert_owner(&B->mutex);
+void translog_lock_assert_owner()
+{
+ translog_buffer_lock_assert_owner(log_descriptor.bc.buffer);
+}
+
+
+/*
+ Start new page
+
+ SYNOPSIS
+ translog_page_next()
+ horizon \ Position in file and buffer where we are
+ cursor /
+ prev_buffer Buffer which should be flushed will be assigned
+ here if it is need. This is always set.
+
+ NOTE
+ handler should be locked
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_page_next(TRANSLOG_ADDRESS *horizon,
+ struct st_buffer_cursor *cursor,
+ struct st_translog_buffer **prev_buffer)
+{
+ struct st_translog_buffer *buffer= cursor->buffer;
+ DBUG_ENTER("translog_page_next");
+
+ if ((cursor->ptr +TRANSLOG_PAGE_SIZE >
+ cursor->buffer->buffer + TRANSLOG_WRITE_BUFFER) ||
+ (LSN_OFFSET(*horizon) >
+ log_descriptor.log_file_max_size - TRANSLOG_PAGE_SIZE))
+ {
+ DBUG_PRINT("info", ("Switch to next buffer Buffer Size: %lu (%lu) => %d "
+ "File size: %lu max: %lu => %d",
+ (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr - cursor->buffer->buffer),
+ (cursor->ptr + TRANSLOG_PAGE_SIZE >
+ cursor->buffer->buffer + TRANSLOG_WRITE_BUFFER),
+ (ulong) LSN_OFFSET(*horizon),
+ (ulong) log_descriptor.log_file_max_size,
+ (LSN_OFFSET(*horizon) >
+ (log_descriptor.log_file_max_size -
+ TRANSLOG_PAGE_SIZE))));
+ if (translog_buffer_next(horizon, cursor,
+ LSN_OFFSET(*horizon) >
+ (log_descriptor.log_file_max_size -
+ TRANSLOG_PAGE_SIZE)))
+ DBUG_RETURN(1);
+ *prev_buffer= buffer;
+ DBUG_PRINT("info", ("Buffer #%u (0x%lu): have to be flushed",
+ (uint) buffer->buffer_no, (ulong) buffer));
+ }
+ else
+ {
+ DBUG_PRINT("info", ("Use the same buffer #%u (0x%lu): "
+ "Buffer Size: %lu (%lu)",
+ (uint) buffer->buffer_no,
+ (ulong) buffer,
+ (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr - cursor->buffer->buffer)));
+ translog_finish_page(horizon, cursor);
+ translog_new_page_header(horizon, cursor);
+ *prev_buffer= NULL;
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Write data of given length to the current page
+
+ SYNOPSIS
+ translog_write_data_on_page()
+ horizon \ Pointers on file and buffer
+ cursor /
+ length IN length of the chunk
+ buffer buffer with data
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_write_data_on_page(TRANSLOG_ADDRESS *horizon,
+ struct st_buffer_cursor *cursor,
+ translog_size_t length,
+ uchar *buffer)
+{
+ DBUG_ENTER("translog_write_data_on_page");
+ DBUG_PRINT("enter", ("Chunk length: %lu Page size %u",
+ (ulong) length, (uint) cursor->current_page_fill));
+ DBUG_ASSERT(length > 0);
+ DBUG_ASSERT(length + cursor->current_page_fill <= TRANSLOG_PAGE_SIZE);
+ DBUG_ASSERT(length + cursor->ptr <=cursor->buffer->buffer +
+ TRANSLOG_WRITE_BUFFER);
+
+ memcpy(cursor->ptr, buffer, length);
+ cursor->ptr+= length;
+ (*horizon)+= length; /* adds offset */
+ cursor->current_page_fill+= length;
+ if (!cursor->chaser)
+ cursor->buffer->size+= length;
+ DBUG_PRINT("info", ("Write data buffer #%u: 0x%lx "
+ "chaser: %d Size: %lu (%lu)",
+ (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer,
+ cursor->chaser, (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr - cursor->buffer->buffer)));
+ DBUG_EXECUTE("info", translog_check_cursor(cursor););
+
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Write data from parts of given length to the current page
+
+ SYNOPSIS
+ translog_write_parts_on_page()
+ horizon \ Pointers on file and buffer
+ cursor /
+ length IN length of the chunk
+ parts IN/OUT chunk source
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_write_parts_on_page(TRANSLOG_ADDRESS *horizon,
+ struct st_buffer_cursor *cursor,
+ translog_size_t length,
+ struct st_translog_parts *parts)
+{
+ translog_size_t left= length;
+ uint cur= (uint) parts->current;
+ DBUG_ENTER("translog_write_parts_on_page");
+ DBUG_PRINT("enter", ("Chunk length: %lu parts: %u of %u. Page size: %u "
+ "Buffer size: %lu (%lu)",
+ (ulong) length,
+ (uint) (cur + 1), (uint) parts->elements,
+ (uint) cursor->current_page_fill,
+ (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr - cursor->buffer->buffer)));
+ DBUG_ASSERT(length > 0);
+ DBUG_ASSERT(length + cursor->current_page_fill <= TRANSLOG_PAGE_SIZE);
+ DBUG_ASSERT(length + cursor->ptr <=cursor->buffer->buffer +
+ TRANSLOG_WRITE_BUFFER);
+
+ do
+ {
+ translog_size_t len;
+ LEX_STRING *part;
+ uchar *buff;
+
+ DBUG_ASSERT(cur < parts->elements);
+ part= parts->parts + cur;
+ buff= (uchar*) part->str;
+ DBUG_PRINT("info", ("Part: %u Length: %lu left: %lu buff: 0x%lx",
+ (uint) (cur + 1), (ulong) part->length, (ulong) left,
+ (ulong) buff));
+
+ if (part->length > left)
+ {
+ /* we should write less then the current part */
+ len= left;
+ part->length-= len;
+ part->str+= len;
+ DBUG_PRINT("info", ("Set new part: %u Length: %lu",
+ (uint) (cur + 1), (ulong) part->length));
+ }
+ else
+ {
+ len= part->length;
+ cur++;
+ DBUG_PRINT("info", ("moved to next part (len: %lu)", (ulong) len));
+ }
+ DBUG_PRINT("info", ("copy: 0x%lx <- 0x%lx %u",
+ (ulong) cursor->ptr, (ulong)buff, (uint)len));
+ if (likely(len))
+ {
+ memcpy(cursor->ptr, buff, len);
+ left-= len;
+ cursor->ptr+= len;
+ }
+ } while (left);
+
+ DBUG_PRINT("info", ("Horizon: (%lu,0x%lx) Length %lu(0x%lx)",
+ LSN_IN_PARTS(*horizon),
+ (ulong) length, (ulong) length));
+ parts->current= cur;
+ (*horizon)+= length; /* offset increasing */
+ cursor->current_page_fill+= length;
+ if (!cursor->chaser)
+ cursor->buffer->size+= length;
+ DBUG_PRINT("info", ("Write parts buffer #%u: 0x%lx "
+ "chaser: %d Size: %lu (%lu) "
+ "Horizon: (%lu,0x%lx) buff offset: 0x%lx",
+ (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer,
+ cursor->chaser, (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr - cursor->buffer->buffer),
+ LSN_IN_PARTS(*horizon),
+ (ulong) (LSN_OFFSET(cursor->buffer->offset) +
+ cursor->buffer->size)));
+ DBUG_EXECUTE("info", translog_check_cursor(cursor););
+
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Put 1 group chunk type 0 header into parts array
+
+ SYNOPSIS
+ translog_write_variable_record_1group_header()
+ parts Descriptor of record source parts
+ type The log record type
+ short_trid Short transaction ID or 0 if it has no sense
+ header_length Calculated header length of chunk type 0
+ chunk0_header Buffer for the chunk header writing
+*/
+
+static void
+translog_write_variable_record_1group_header(struct st_translog_parts *parts,
+ enum translog_record_type type,
+ SHORT_TRANSACTION_ID short_trid,
+ uint16 header_length,
+ uchar *chunk0_header)
+{
+ LEX_STRING *part;
+ DBUG_ASSERT(parts->current != 0); /* first part is left for header */
+ part= parts->parts + (--parts->current);
+ parts->total_record_length+= (part->length= header_length);
+ part->str= (char*)chunk0_header;
+ /* puts chunk type */
+ *chunk0_header= (uchar) (type | TRANSLOG_CHUNK_LSN);
+ int2store(chunk0_header + 1, short_trid);
+ /* puts record length */
+ translog_write_variable_record_1group_code_len(chunk0_header + 3,
+ parts->record_length,
+ header_length);
+ /* puts 0 as chunk length which indicate 1 group record */
+ int2store(chunk0_header + header_length - 2, 0);
+}
+
+
+/*
+ Increase number of writers for this buffer
+
+ SYNOPSIS
+ translog_buffer_increase_writers()
+ buffer target buffer
+*/
+
+static inline void
+translog_buffer_increase_writers(struct st_translog_buffer *buffer)
+{
+ DBUG_ENTER("translog_buffer_increase_writers");
+ buffer->copy_to_buffer_in_progress++;
+ DBUG_PRINT("info", ("copy_to_buffer_in_progress. Buffer #%u 0x%lx: %d",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ buffer->copy_to_buffer_in_progress));
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Decrease number of writers for this buffer
+
+ SYNOPSIS
+ translog_buffer_decrease_writers()
+ buffer target buffer
+*/
+
+
+static void translog_buffer_decrease_writers(struct st_translog_buffer *buffer)
+{
+ DBUG_ENTER("translog_buffer_decrease_writers");
+ buffer->copy_to_buffer_in_progress--;
+ DBUG_PRINT("info", ("copy_to_buffer_in_progress. Buffer #%u 0x%lx: %d",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ buffer->copy_to_buffer_in_progress));
+ if (buffer->copy_to_buffer_in_progress == 0 &&
+ buffer->waiting_filling_buffer.last_thread != NULL)
+ wqueue_release_queue(&buffer->waiting_filling_buffer);
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Put chunk 2 from new page beginning
+
+ SYNOPSIS
+ translog_write_variable_record_chunk2_page()
+ parts Descriptor of record source parts
+ horizon \ Pointers on file position and buffer
+ cursor /
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool
+translog_write_variable_record_chunk2_page(struct st_translog_parts *parts,
+ TRANSLOG_ADDRESS *horizon,
+ struct st_buffer_cursor *cursor)
+{
+ struct st_translog_buffer *buffer_to_flush;
+ int rc;
+ uchar chunk2_header[1];
+ DBUG_ENTER("translog_write_variable_record_chunk2_page");
+ chunk2_header[0]= TRANSLOG_CHUNK_NOHDR;
+
+ LINT_INIT(buffer_to_flush);
+ rc= translog_page_next(horizon, cursor, &buffer_to_flush);
+ if (buffer_to_flush != NULL)
+ {
+ rc|= translog_buffer_lock(buffer_to_flush);
+ translog_buffer_decrease_writers(buffer_to_flush);
+ if (!rc)
+ rc= translog_buffer_flush(buffer_to_flush);
+ rc|= translog_buffer_unlock(buffer_to_flush);
+ }
+ if (rc)
+ DBUG_RETURN(1);
+
+ /* Puts chunk type */
+ translog_write_data_on_page(horizon, cursor, 1, chunk2_header);
+ /* Puts chunk body */
+ translog_write_parts_on_page(horizon, cursor,
+ log_descriptor.page_capacity_chunk_2, parts);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Put chunk 3 of requested length in the buffer from new page beginning
+
+ SYNOPSIS
+ translog_write_variable_record_chunk3_page()
+ parts Descriptor of record source parts
+ length Length of this chunk
+ horizon \ Pointers on file position and buffer
+ cursor /
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool
+translog_write_variable_record_chunk3_page(struct st_translog_parts *parts,
+ uint16 length,
+ TRANSLOG_ADDRESS *horizon,
+ struct st_buffer_cursor *cursor)
+{
+ struct st_translog_buffer *buffer_to_flush;
+ LEX_STRING *part;
+ int rc;
+ uchar chunk3_header[1 + 2];
+ DBUG_ENTER("translog_write_variable_record_chunk3_page");
+
+ LINT_INIT(buffer_to_flush);
+ rc= translog_page_next(horizon, cursor, &buffer_to_flush);
+ if (buffer_to_flush != NULL)
+ {
+ rc|= translog_buffer_lock(buffer_to_flush);
+ translog_buffer_decrease_writers(buffer_to_flush);
+ if (!rc)
+ rc= translog_buffer_flush(buffer_to_flush);
+ rc|= translog_buffer_unlock(buffer_to_flush);
+ }
+ if (rc)
+ DBUG_RETURN(1);
+ if (length == 0)
+ {
+ /* It was call to write page header only (no data for chunk 3) */
+ DBUG_PRINT("info", ("It is a call to make page header only"));
+ DBUG_RETURN(0);
+ }
+
+ DBUG_ASSERT(parts->current != 0); /* first part is left for header */
+ part= parts->parts + (--parts->current);
+ parts->total_record_length+= (part->length= 1 + 2);
+ part->str= (char*)chunk3_header;
+ /* Puts chunk type */
+ *chunk3_header= (uchar) (TRANSLOG_CHUNK_LNGTH);
+ /* Puts chunk length */
+ int2store(chunk3_header + 1, length);
+
+ translog_write_parts_on_page(horizon, cursor, length + 1 + 2, parts);
+ DBUG_RETURN(0);
+}
+
+/*
+ Move log pointer (horizon) on given number pages starting from next page,
+ and given offset on the last page
+
+ SYNOPSIS
+ translog_advance_pointer()
+ pages Number of full pages starting from the next one
+ last_page_data Plus this data on the last page
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_advance_pointer(uint pages, uint16 last_page_data)
+{
+ translog_size_t last_page_offset= (log_descriptor.page_overhead +
+ last_page_data);
+ translog_size_t offset= (TRANSLOG_PAGE_SIZE -
+ log_descriptor.bc.current_page_fill +
+ pages * TRANSLOG_PAGE_SIZE + last_page_offset);
+ translog_size_t buffer_end_offset, file_end_offset, min_offset;
+ DBUG_ENTER("translog_advance_pointer");
+ DBUG_PRINT("enter", ("Pointer: (%lu, 0x%lx) + %u + %u pages + %u + %u",
+ LSN_IN_PARTS(log_descriptor.horizon),
+ (uint) (TRANSLOG_PAGE_SIZE -
+ log_descriptor.bc.current_page_fill),
+ pages, (uint) log_descriptor.page_overhead,
+ (uint) last_page_data));
+
+ for (;;)
+ {
+ uint8 new_buffer_no;
+ struct st_translog_buffer *new_buffer;
+ struct st_translog_buffer *old_buffer;
+ buffer_end_offset= TRANSLOG_WRITE_BUFFER - log_descriptor.bc.buffer->size;
+ file_end_offset= (log_descriptor.log_file_max_size -
+ LSN_OFFSET(log_descriptor.horizon));
+ DBUG_PRINT("info", ("offset: %lu buffer_end_offs: %lu, "
+ "file_end_offs: %lu",
+ (ulong) offset, (ulong) buffer_end_offset,
+ (ulong) file_end_offset));
+ DBUG_PRINT("info", ("Buff #%u %u (0x%lx) offset 0x%lx + size 0x%lx = "
+ "0x%lx (0x%lx)",
+ (uint) log_descriptor.bc.buffer->buffer_no,
+ (uint) log_descriptor.bc.buffer_no,
+ (ulong) log_descriptor.bc.buffer,
+ (ulong) LSN_OFFSET(log_descriptor.bc.buffer->offset),
+ (ulong) log_descriptor.bc.buffer->size,
+ (ulong) (LSN_OFFSET(log_descriptor.bc.buffer->offset) +
+ log_descriptor.bc.buffer->size),
+ (ulong) LSN_OFFSET(log_descriptor.horizon)));
+ DBUG_ASSERT(LSN_OFFSET(log_descriptor.bc.buffer->offset) +
+ log_descriptor.bc.buffer->size ==
+ LSN_OFFSET(log_descriptor.horizon));
+
+ if (offset <= buffer_end_offset && offset <= file_end_offset)
+ break;
+ old_buffer= log_descriptor.bc.buffer;
+ new_buffer_no= (log_descriptor.bc.buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+ new_buffer= log_descriptor.buffers + new_buffer_no;
+
+ translog_buffer_lock(new_buffer);
+ translog_wait_for_buffer_free(new_buffer);
+
+ min_offset= min(buffer_end_offset, file_end_offset);
+ /* TODO: check is it ptr or size enough */
+ log_descriptor.bc.buffer->size+= min_offset;
+ log_descriptor.bc.ptr+= min_offset;
+ DBUG_PRINT("info", ("NewP buffer #%u: 0x%lx chaser: %d Size: %lu (%lu)",
+ (uint) log_descriptor.bc.buffer->buffer_no,
+ (ulong) log_descriptor.bc.buffer,
+ log_descriptor.bc.chaser,
+ (ulong) log_descriptor.bc.buffer->size,
+ (ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
+ buffer->buffer)));
+ DBUG_ASSERT((ulong) (log_descriptor.bc.ptr -
+ log_descriptor.bc.buffer->buffer) ==
+ log_descriptor.bc.buffer->size);
+ DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no ==
+ log_descriptor.bc.buffer_no);
+ translog_buffer_increase_writers(log_descriptor.bc.buffer);
+
+ if (file_end_offset <= buffer_end_offset)
+ {
+ log_descriptor.horizon+= LSN_ONE_FILE;
+ log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon,
+ TRANSLOG_PAGE_SIZE);
+ DBUG_PRINT("info", ("New file: %lu",
+ (ulong) LSN_FILE_NO(log_descriptor.horizon)));
+ if (translog_create_new_file())
+ {
+ DBUG_RETURN(1);
+ }
+ }
+ else
+ {
+ DBUG_PRINT("info", ("The same file"));
+ log_descriptor.horizon+= min_offset; /* offset increasing */
+ }
+ translog_start_buffer(new_buffer, &log_descriptor.bc, new_buffer_no);
+ old_buffer->next_buffer_offset= new_buffer->offset;
+ if (translog_buffer_unlock(old_buffer))
+ DBUG_RETURN(1);
+ offset-= min_offset;
+ }
+ log_descriptor.bc.ptr+= offset;
+ log_descriptor.bc.buffer->size+= offset;
+ translog_buffer_increase_writers(log_descriptor.bc.buffer);
+ log_descriptor.horizon+= offset; /* offset increasing */
+ log_descriptor.bc.current_page_fill= last_page_offset;
+ DBUG_PRINT("info", ("drop write_counter"));
+ log_descriptor.bc.write_counter= 0;
+ log_descriptor.bc.previous_offset= 0;
+ DBUG_PRINT("info", ("NewP buffer #%u: 0x%lx chaser: %d Size: %lu (%lu) "
+ "offset: %u last page: %u",
+ (uint) log_descriptor.bc.buffer->buffer_no,
+ (ulong) log_descriptor.bc.buffer,
+ log_descriptor.bc.chaser,
+ (ulong) log_descriptor.bc.buffer->size,
+ (ulong) (log_descriptor.bc.ptr -
+ log_descriptor.bc.buffer->
+ buffer), (uint) offset,
+ (uint) last_page_offset));
+ DBUG_PRINT("info",
+ ("pointer moved to: (%lu, 0x%lx)",
+ LSN_IN_PARTS(log_descriptor.horizon)));
+ DBUG_EXECUTE("info", translog_check_cursor(&log_descriptor.bc););
+ log_descriptor.bc.protected= 0;
+ DBUG_RETURN(0);
+}
+
+
+
+/*
+ Get page rest
+
+ SYNOPSIS
+ translog_get_current_page_rest()
+
+ NOTE loghandler should be locked
+
+ RETURN
+ number of bytes left on the current page
+*/
+
+#define translog_get_current_page_rest() \
+ (TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill)
+
+/*
+ Get buffer rest in full pages
+
+ SYNOPSIS
+ translog_get_current_buffer_rest()
+
+ NOTE loghandler should be locked
+
+ RETURN
+ number of full pages left on the current buffer
+*/
+
+#define translog_get_current_buffer_rest() \
+ ((log_descriptor.bc.buffer->buffer + TRANSLOG_WRITE_BUFFER - \
+ log_descriptor.bc.ptr) / \
+ TRANSLOG_PAGE_SIZE)
+
+/*
+ Calculate possible group size without first (current) page
+
+ SYNOPSIS
+ translog_get_current_group_size()
+
+ NOTE loghandler should be locked
+
+ RETURN
+ group size without first (current) page
+*/
+
+static translog_size_t translog_get_current_group_size()
+{
+ /* buffer rest in full pages */
+ translog_size_t buffer_rest= translog_get_current_buffer_rest();
+ DBUG_ENTER("translog_get_current_group_size");
+ DBUG_PRINT("info", ("buffer_rest in pages: %u", buffer_rest));
+
+ buffer_rest*= log_descriptor.page_capacity_chunk_2;
+ /* in case of only half of buffer free we can write this and next buffer */
+ if (buffer_rest < log_descriptor.half_buffer_capacity_chunk_2)
+ {
+ DBUG_PRINT("info", ("buffer_rest: %lu -> add %lu",
+ (ulong) buffer_rest,
+ (ulong) log_descriptor.buffer_capacity_chunk_2));
+ buffer_rest+= log_descriptor.buffer_capacity_chunk_2;
+ }
+
+ DBUG_PRINT("info", ("buffer_rest: %lu", (ulong) buffer_rest));
+
+ DBUG_RETURN(buffer_rest);
+}
+
+
+/*
+ Write variable record in 1 group
+
+ SYNOPSIS
+ translog_write_variable_record_1group()
+ lsn LSN of the record will be written here
+ type the log record type
+ short_trid Short transaction ID or 0 if it has no sense
+ parts Descriptor of record source parts
+ buffer_to_flush Buffer which have to be flushed if it is not 0
+ header_length Calculated header length of chunk type 0
+ trn Transaction structure pointer for hooks by
+ record log type, for short_id
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool
+translog_write_variable_record_1group(LSN *lsn,
+ enum translog_record_type type,
+ MARIA_HA *tbl_info,
+ SHORT_TRANSACTION_ID short_trid,
+ struct st_translog_parts *parts,
+ struct st_translog_buffer
+ *buffer_to_flush, uint16 header_length,
+ TRN *trn)
+{
+ TRANSLOG_ADDRESS horizon;
+ struct st_buffer_cursor cursor;
+ int rc= 0;
+ uint i;
+ translog_size_t record_rest, full_pages, first_page;
+ uint additional_chunk3_page= 0;
+ uchar chunk0_header[1 + 2 + 5 + 2];
+ DBUG_ENTER("translog_write_variable_record_1group");
+
+ *lsn= horizon= log_descriptor.horizon;
+ if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn),
+ *lsn, TRUE) ||
+ (log_record_type_descriptor[type].inwrite_hook &&
+ (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info,
+ lsn, parts)))
+ {
+ translog_unlock();
+ DBUG_RETURN(1);
+ }
+ cursor= log_descriptor.bc;
+ cursor.chaser= 1;
+
+ /* Advance pointer To be able unlock the loghandler */
+ first_page= translog_get_current_page_rest();
+ record_rest= parts->record_length - (first_page - header_length);
+ full_pages= record_rest / log_descriptor.page_capacity_chunk_2;
+ record_rest= (record_rest % log_descriptor.page_capacity_chunk_2);
+
+ if (record_rest + 1 == log_descriptor.page_capacity_chunk_2)
+ {
+ DBUG_PRINT("info", ("2 chunks type 3 is needed"));
+ /* We will write 2 chunks type 3 at the end of this group */
+ additional_chunk3_page= 1;
+ record_rest= 1;
+ }
+
+ DBUG_PRINT("info", ("first_page: %u (%u) full_pages: %u (%lu) "
+ "additional: %u (%u) rest %u = %u",
+ first_page, first_page - header_length,
+ full_pages,
+ (ulong) full_pages *
+ log_descriptor.page_capacity_chunk_2,
+ additional_chunk3_page,
+ additional_chunk3_page *
+ (log_descriptor.page_capacity_chunk_2 - 1),
+ record_rest, parts->record_length));
+ /* record_rest + 3 is chunk type 3 overhead + record_rest */
+ rc|= translog_advance_pointer(full_pages + additional_chunk3_page,
+ (record_rest ? record_rest + 3 : 0));
+ log_descriptor.bc.buffer->last_lsn= *lsn;
+
+ rc|= translog_unlock();
+
+ /*
+ Check if we switched buffer and need process it (current buffer is
+ unlocked already => we will not delay other threads
+ */
+ if (buffer_to_flush != NULL)
+ {
+ if (!rc)
+ rc= translog_buffer_flush(buffer_to_flush);
+ rc|= translog_buffer_unlock(buffer_to_flush);
+ }
+ if (rc)
+ DBUG_RETURN(1);
+
+ translog_write_variable_record_1group_header(parts, type, short_trid,
+ header_length, chunk0_header);
+
+ /* fill the pages */
+ translog_write_parts_on_page(&horizon, &cursor, first_page, parts);
+
+
+ DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.horizon),
+ LSN_IN_PARTS(horizon)));
+
+ for (i= 0; i < full_pages; i++)
+ {
+ if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor))
+ DBUG_RETURN(1);
+
+ DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.horizon),
+ LSN_IN_PARTS(horizon)));
+ }
+
+ if (additional_chunk3_page)
+ {
+ if (translog_write_variable_record_chunk3_page(parts,
+ log_descriptor.
+ page_capacity_chunk_2 - 2,
+ &horizon, &cursor))
+ DBUG_RETURN(1);
+ DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.horizon),
+ LSN_IN_PARTS(horizon)));
+ DBUG_ASSERT(cursor.current_page_fill == TRANSLOG_PAGE_SIZE);
+ }
+
+ if (translog_write_variable_record_chunk3_page(parts,
+ record_rest,
+ &horizon, &cursor))
+ DBUG_RETURN(1);
+ DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx)",
+ (ulong) LSN_FILE_NO(log_descriptor.horizon),
+ (ulong) LSN_OFFSET(log_descriptor.horizon),
+ (ulong) LSN_FILE_NO(horizon),
+ (ulong) LSN_OFFSET(horizon)));
+
+ if (!(rc= translog_buffer_lock(cursor.buffer)))
+ {
+ /*
+ Check if we wrote something on 1:st not full page and need to reconstruct
+ CRC and sector protection
+ */
+ translog_buffer_decrease_writers(cursor.buffer);
+ }
+ rc|= translog_buffer_unlock(cursor.buffer);
+ DBUG_RETURN(rc);
+}
+
+
+/*
+ Write variable record in 1 chunk
+
+ SYNOPSIS
+ translog_write_variable_record_1chunk()
+ lsn LSN of the record will be written here
+ type the log record type
+ short_trid Short transaction ID or 0 if it has no sense
+ parts Descriptor of record source parts
+ buffer_to_flush Buffer which have to be flushed if it is not 0
+ header_length Calculated header length of chunk type 0
+ trn Transaction structure pointer for hooks by
+ record log type, for short_id
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool
+translog_write_variable_record_1chunk(LSN *lsn,
+ enum translog_record_type type,
+ MARIA_HA *tbl_info,
+ SHORT_TRANSACTION_ID short_trid,
+ struct st_translog_parts *parts,
+ struct st_translog_buffer
+ *buffer_to_flush, uint16 header_length,
+ TRN *trn)
+{
+ int rc;
+ uchar chunk0_header[1 + 2 + 5 + 2];
+ DBUG_ENTER("translog_write_variable_record_1chunk");
+
+ translog_write_variable_record_1group_header(parts, type, short_trid,
+ header_length, chunk0_header);
+
+ *lsn= log_descriptor.horizon;
+ if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn),
+ *lsn, TRUE) ||
+ (log_record_type_descriptor[type].inwrite_hook &&
+ (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info,
+ lsn, parts)))
+ {
+ translog_unlock();
+ DBUG_RETURN(1);
+ }
+
+ rc= translog_write_parts_on_page(&log_descriptor.horizon,
+ &log_descriptor.bc,
+ parts->total_record_length, parts);
+ log_descriptor.bc.buffer->last_lsn= *lsn;
+ rc|= translog_unlock();
+
+ /*
+ check if we switched buffer and need process it (current buffer is
+ unlocked already => we will not delay other threads
+ */
+ if (buffer_to_flush != NULL)
+ {
+ if (!rc)
+ rc= translog_buffer_flush(buffer_to_flush);
+ rc|= translog_buffer_unlock(buffer_to_flush);
+ }
+
+ DBUG_RETURN(rc);
+}
+
+
+/*
+ Calculate and write LSN difference (compressed LSN)
+
+ SYNOPSIS
+ translog_put_LSN_diff()
+ base_lsn LSN from which we calculate difference
+ lsn LSN for codding
+ dst Result will be written to dst[-pack_length] .. dst[-1]
+
+ NOTE:
+ To store an LSN in a compact way we will use the following compression:
+
+ If a log record has LSN1, and it contains the lSN2 as a back reference,
+ Instead of LSN2 we write LSN1-LSN2, encoded as:
+
+ two bits the number N (see below)
+ 14 bits
+ N bytes
+
+ That is, LSN is encoded in 2..5 bytes, and the number of bytes minus 2
+ is stored in the first two bits.
+
+ RETURN
+ # pointer on coded LSN
+ NULL Error
+*/
+
+static uchar *translog_put_LSN_diff(LSN base_lsn, LSN lsn, uchar *dst)
+{
+ DBUG_ENTER("translog_put_LSN_diff");
+ DBUG_PRINT("enter", ("Base: (0x%lu,0x%lx) val: (0x%lu,0x%lx) dst: 0x%lx",
+ LSN_IN_PARTS(base_lsn), LSN_IN_PARTS(lsn),
+ (ulong) dst));
+ if (LSN_FILE_NO(base_lsn) == LSN_FILE_NO(lsn))
+ {
+ uint32 diff;
+ DBUG_ASSERT(base_lsn > lsn);
+ diff= base_lsn - lsn;
+ DBUG_PRINT("info", ("File is the same. Diff: 0x%lx", (ulong) diff));
+ if (diff <= 0x3FFF)
+ {
+ dst-= 2;
+ /*
+ Note we store this high uchar first to ensure that first uchar has
+ 0 in the 3 upper bits.
+ */
+ dst[0]= diff >> 8;
+ dst[1]= (diff & 0xFF);
+ }
+ else if (diff <= 0x3FFFFF)
+ {
+ dst-= 3;
+ dst[0]= 0x40 | (diff >> 16);
+ int2store(dst + 1, diff & 0xFFFF);
+ }
+ else if (diff <= 0x3FFFFFFF)
+ {
+ dst-= 4;
+ dst[0]= 0x80 | (diff >> 24);
+ int3store(dst + 1, diff & 0xFFFFFF);
+ }
+ else
+ {
+ dst-= 5;
+ dst[0]= 0xC0;
+ int4store(dst + 1, diff);
+ }
+ }
+ else
+ {
+ uint32 diff;
+ uint32 offset_diff;
+ ulonglong base_offset= LSN_OFFSET(base_lsn);
+ DBUG_ASSERT(base_lsn > lsn);
+ diff= LSN_FILE_NO(base_lsn) - LSN_FILE_NO(lsn);
+ DBUG_PRINT("info", ("File is different. Diff: 0x%lx", (ulong) diff));
+
+ if (base_offset < LSN_OFFSET(lsn))
+ {
+ /* take 1 from file offset */
+ diff--;
+ base_offset+= LL(0x100000000);
+ }
+ offset_diff= base_offset - LSN_OFFSET(lsn);
+ if (diff > 0x3f)
+ {
+ /*
+ It is full LSN after special 1 diff (which is impossible
+ in real life)
+ */
+ dst-= 2 + LSN_STORE_SIZE;
+ dst[0]= 0;
+ dst[1]= 1;
+ lsn_store(dst + 2, lsn);
+ }
+ else
+ {
+ dst-= 5;
+ *dst= (0xC0 | diff);
+ int4store(dst + 1, offset_diff);
+ }
+ }
+ DBUG_PRINT("info", ("new dst: 0x%lx", (ulong) dst));
+ DBUG_RETURN(dst);
+}
+
+
+/*
+ Get LSN from LSN-difference (compressed LSN)
+
+ SYNOPSIS
+ translog_get_LSN_from_diff()
+ base_lsn LSN from which we calculate difference
+ src pointer to coded lsn
+ dst pointer to buffer where to write 7byte LSN
+
+ NOTE:
+ To store an LSN in a compact way we will use the following compression:
+
+ If a log record has LSN1, and it contains the lSN2 as a back reference,
+ Instead of LSN2 we write LSN1-LSN2, encoded as:
+
+ two bits the number N (see below)
+ 14 bits
+ N bytes
+
+ That is, LSN is encoded in 2..5 bytes, and the number of bytes minus 2
+ is stored in the first two bits.
+
+ RETURN
+ pointer to buffer after decoded LSN
+*/
+
+static uchar *translog_get_LSN_from_diff(LSN base_lsn, uchar *src, uchar *dst)
+{
+ LSN lsn;
+ uint32 diff;
+ uint32 first_byte;
+ uint32 file_no, rec_offset;
+ uint8 code;
+ DBUG_ENTER("translog_get_LSN_from_diff");
+ DBUG_PRINT("enter", ("Base: (0x%lx,0x%lx) src: 0x%lx dst 0x%lx",
+ LSN_IN_PARTS(base_lsn), (ulong) src, (ulong) dst));
+ first_byte= *((uint8*) src);
+ code= first_byte >> 6; /* Length is in 2 most significant bits */
+ first_byte&= 0x3F;
+ src++; /* Skip length + encode */
+ file_no= LSN_FILE_NO(base_lsn); /* Assume relative */
+ DBUG_PRINT("info", ("code: %u first byte: %lu",
+ (uint) code, (ulong) first_byte));
+ switch (code) {
+ case 0:
+ if (first_byte == 0 && *((uint8*)src) == 1)
+ {
+ /*
+ It is full LSN after special 1 diff (which is impossible
+ in real life)
+ */
+ memcpy(dst, src + 1, LSN_STORE_SIZE);
+ DBUG_PRINT("info", ("Special case of full LSN, new src: 0x%lx",
+ (ulong) (src + 1 + LSN_STORE_SIZE)));
+ DBUG_RETURN(src + 1 + LSN_STORE_SIZE);
+ }
+ rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 8) + *((uint8*)src));
+ break;
+ case 1:
+ diff= uint2korr(src);
+ rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 16) + diff);
+ break;
+ case 2:
+ diff= uint3korr(src);
+ rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 24) + diff);
+ break;
+ case 3:
+ {
+ ulonglong base_offset= LSN_OFFSET(base_lsn);
+ diff= uint4korr(src);
+ if (diff > LSN_OFFSET(base_lsn))
+ {
+ /* take 1 from file offset */
+ first_byte++;
+ base_offset+= LL(0x100000000);
+ }
+ file_no= LSN_FILE_NO(base_lsn) - first_byte;
+ rec_offset= base_offset - diff;
+ break;
+ }
+ default:
+ DBUG_ASSERT(0);
+ DBUG_RETURN(NULL);
+ }
+ lsn= MAKE_LSN(file_no, rec_offset);
+ src+= code + 1;
+ lsn_store(dst, lsn);
+ DBUG_PRINT("info", ("new src: 0x%lx", (ulong) src));
+ DBUG_RETURN(src);
+}
+
+
+/*
+ Encode relative LSNs listed in the parameters
+
+ SYNOPSIS
+ translog_relative_LSN_encode()
+ parts Parts list with encoded LSN(s)
+ base_lsn LSN which is base for encoding
+ lsns number of LSN(s) to encode
+ compressed_LSNs buffer which can be used for storing compressed LSN(s)
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_relative_LSN_encode(struct st_translog_parts *parts,
+ LSN base_lsn,
+ uint lsns, uchar *compressed_LSNs)
+{
+ LEX_STRING *part;
+ uint lsns_len= lsns * LSN_STORE_SIZE;
+ char buffer_src[MAX_NUMBER_OF_LSNS_PER_RECORD * LSN_STORE_SIZE];
+ char *buffer= buffer_src;
+
+ DBUG_ENTER("translog_relative_LSN_encode");
+
+ DBUG_ASSERT(parts->current != 0);
+ part= parts->parts + parts->current;
+
+ /* collect all LSN(s) in one chunk if it (they) is (are) divided */
+ if (part->length < lsns_len)
+ {
+ uint copied= part->length;
+ LEX_STRING *next_part;
+ DBUG_PRINT("info", ("Using buffer: 0x%lx", (ulong) compressed_LSNs));
+ memcpy(buffer, (uchar*)part->str, part->length);
+ next_part= parts->parts + parts->current + 1;
+ do
+ {
+ DBUG_ASSERT(next_part < parts->parts + parts->elements);
+ if ((next_part->length + copied) < lsns_len)
+ {
+ memcpy(buffer + copied, (uchar*)next_part->str,
+ next_part->length);
+ copied+= next_part->length;
+ next_part->length= 0; next_part->str= 0;
+ /* delete_dynamic_element(&parts->parts, parts->current + 1); */
+ next_part++;
+ parts->current++;
+ part= parts->parts + parts->current;
+ }
+ else
+ {
+ uint len= lsns_len - copied;
+ memcpy(buffer + copied, (uchar*)next_part->str, len);
+ copied= lsns_len;
+ next_part->str+= len;
+ next_part->length-= len;
+ }
+ } while (copied < lsns_len);
+ }
+ else
+ {
+ buffer= part->str;
+ part->str+= lsns_len;
+ part->length-= lsns_len;
+ parts->current--;
+ part= parts->parts + parts->current;
+ }
+
+ {
+ /* Compress */
+ LSN ref;
+ int economy;
+ uchar *src_ptr;
+ uchar *dst_ptr= compressed_LSNs + (MAX_NUMBER_OF_LSNS_PER_RECORD *
+ COMPRESSED_LSN_MAX_STORE_SIZE);
+ for (src_ptr= buffer + lsns_len - LSN_STORE_SIZE;
+ src_ptr >= (uchar*) buffer;
+ src_ptr-= LSN_STORE_SIZE)
+ {
+ ref= lsn_korr(src_ptr);
+ if ((dst_ptr= translog_put_LSN_diff(base_lsn, ref, dst_ptr)) == NULL)
+ DBUG_RETURN(1);
+ }
+ part->length= (uint)((compressed_LSNs +
+ (MAX_NUMBER_OF_LSNS_PER_RECORD *
+ COMPRESSED_LSN_MAX_STORE_SIZE)) -
+ dst_ptr);
+ parts->record_length-= (economy= lsns_len - part->length);
+ DBUG_PRINT("info", ("new length of LSNs: %lu economy: %d",
+ (ulong)part->length, economy));
+ parts->total_record_length-= economy;
+ part->str= (char*)dst_ptr;
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Write multi-group variable-size record
+
+ SYNOPSIS
+ translog_write_variable_record_mgroup()
+ lsn LSN of the record will be written here
+ type the log record type
+ short_trid Short transaction ID or 0 if it has no sense
+ parts Descriptor of record source parts
+ buffer_to_flush Buffer which have to be flushed if it is not 0
+ header_length Header length calculated for 1 group
+ buffer_rest Beginning from which we plan to write in full pages
+ trn Transaction structure pointer for hooks by
+ record log type, for short_id
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool
+translog_write_variable_record_mgroup(LSN *lsn,
+ enum translog_record_type type,
+ MARIA_HA *tbl_info,
+ SHORT_TRANSACTION_ID short_trid,
+ struct st_translog_parts *parts,
+ struct st_translog_buffer
+ *buffer_to_flush,
+ uint16 header_length,
+ translog_size_t buffer_rest,
+ TRN *trn)
+{
+ TRANSLOG_ADDRESS horizon;
+ struct st_buffer_cursor cursor;
+ int rc= 0;
+ uint i, chunk2_page, full_pages;
+ uint curr_group= 0;
+ translog_size_t record_rest, first_page, chunk3_pages, chunk0_pages= 1;
+ translog_size_t done= 0;
+ struct st_translog_group_descriptor group;
+ DYNAMIC_ARRAY groups;
+ uint16 chunk3_size;
+ uint16 page_capacity= log_descriptor.page_capacity_chunk_2 + 1;
+ uint16 last_page_capacity;
+ my_bool new_page_before_chunk0= 1, first_chunk0= 1;
+ uchar chunk0_header[1 + 2 + 5 + 2 + 2], group_desc[7 + 1];
+ uchar chunk2_header[1];
+ uint header_fixed_part= header_length + 2;
+ uint groups_per_page= (page_capacity - header_fixed_part) / (7 + 1);
+ uint file_of_the_first_group;
+ DBUG_ENTER("translog_write_variable_record_mgroup");
+
+ chunk2_header[0]= TRANSLOG_CHUNK_NOHDR;
+
+ if (init_dynamic_array(&groups, sizeof(struct st_translog_group_descriptor),
+ 10, 10 CALLER_INFO))
+ {
+ translog_unlock();
+ UNRECOVERABLE_ERROR(("init array failed"));
+ DBUG_RETURN(1);
+ }
+
+ first_page= translog_get_current_page_rest();
+ record_rest= parts->record_length - (first_page - 1);
+ DBUG_PRINT("info", ("Record Rest: %lu", (ulong) record_rest));
+
+ if (record_rest < buffer_rest)
+ {
+ DBUG_PRINT("info", ("too many free space because changing header"));
+ buffer_rest-= log_descriptor.page_capacity_chunk_2;
+ DBUG_ASSERT(record_rest >= buffer_rest);
+ }
+
+ file_of_the_first_group= LSN_FILE_NO(log_descriptor.horizon);
+ translog_mark_file_unfinished(file_of_the_first_group);
+ do
+ {
+ group.addr= horizon= log_descriptor.horizon;
+ cursor= log_descriptor.bc;
+ cursor.chaser= 1;
+ if ((full_pages= buffer_rest / log_descriptor.page_capacity_chunk_2) > 255)
+ {
+ /* sizeof(uint8) == 256 is max number of chunk in multi-chunks group */
+ full_pages= 255;
+ buffer_rest= full_pages * log_descriptor.page_capacity_chunk_2;
+ }
+ /*
+ group chunks =
+ full pages + first page (which actually can be full, too).
+ But here we assign number of chunks - 1
+ */
+ group.num= full_pages;
+ if (insert_dynamic(&groups, (uchar*) &group))
+ {
+ UNRECOVERABLE_ERROR(("insert into array failed"));
+ goto err_unlock;
+ }
+
+ DBUG_PRINT("info", ("chunk: #%u first_page: %u (%u) "
+ "full_pages: %lu (%lu) "
+ "Left %lu",
+ groups.elements,
+ first_page, first_page - 1,
+ (ulong) full_pages,
+ (ulong) (full_pages *
+ log_descriptor.page_capacity_chunk_2),
+ (ulong)(parts->record_length - (first_page - 1 +
+ buffer_rest) -
+ done)));
+ rc|= translog_advance_pointer(full_pages, 0);
+
+ rc|= translog_unlock();
+
+ if (buffer_to_flush != NULL)
+ {
+ rc|= translog_buffer_lock(buffer_to_flush);
+ translog_buffer_decrease_writers(buffer_to_flush);
+ if (!rc)
+ rc= translog_buffer_flush(buffer_to_flush);
+ rc|= translog_buffer_unlock(buffer_to_flush);
+ buffer_to_flush= NULL;
+ }
+ if (rc)
+ {
+ UNRECOVERABLE_ERROR(("flush of unlock buffer failed"));
+ goto err;
+ }
+
+ translog_write_data_on_page(&horizon, &cursor, 1, chunk2_header);
+ translog_write_parts_on_page(&horizon, &cursor, first_page - 1, parts);
+ DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx) "
+ "Left %lu",
+ LSN_IN_PARTS(log_descriptor.horizon),
+ LSN_IN_PARTS(horizon),
+ (ulong) (parts->record_length - (first_page - 1) -
+ done)));
+
+ for (i= 0; i < full_pages; i++)
+ {
+ if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor))
+ goto err;
+
+ DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) "
+ "local: (%lu,0x%lx) "
+ "Left: %lu",
+ LSN_IN_PARTS(log_descriptor.horizon),
+ LSN_IN_PARTS(horizon),
+ (ulong) (parts->record_length - (first_page - 1) -
+ i * log_descriptor.page_capacity_chunk_2 -
+ done)));
+ }
+
+ done+= (first_page - 1 + buffer_rest);
+
+ /* TODO: make separate function for following */
+ rc= translog_page_next(&horizon, &cursor, &buffer_to_flush);
+ if (buffer_to_flush != NULL)
+ {
+ rc|= translog_buffer_lock(buffer_to_flush);
+ translog_buffer_decrease_writers(buffer_to_flush);
+ if (!rc)
+ rc= translog_buffer_flush(buffer_to_flush);
+ rc|= translog_buffer_unlock(buffer_to_flush);
+ buffer_to_flush= NULL;
+ }
+ if (rc)
+ {
+ UNRECOVERABLE_ERROR(("flush of unlock buffer failed"));
+ goto err;
+ }
+ rc= translog_buffer_lock(cursor.buffer);
+ if (!rc)
+ translog_buffer_decrease_writers(cursor.buffer);
+ rc|= translog_buffer_unlock(cursor.buffer);
+ if (rc)
+ goto err;
+
+ translog_lock();
+
+ first_page= translog_get_current_page_rest();
+ buffer_rest= translog_get_current_group_size();
+ } while (first_page + buffer_rest < (uint) (parts->record_length - done));
+
+ group.addr= horizon= log_descriptor.horizon;
+ cursor= log_descriptor.bc;
+ cursor.chaser= 1;
+ group.num= 0; /* 0 because it does not matter */
+ if (insert_dynamic(&groups, (uchar*) &group))
+ {
+ UNRECOVERABLE_ERROR(("insert into array failed"));
+ goto err_unlock;
+ }
+ record_rest= parts->record_length - done;
+ DBUG_PRINT("info", ("Record rest: %lu", (ulong) record_rest));
+ if (first_page <= record_rest + 1)
+ {
+ chunk2_page= 1;
+ record_rest-= (first_page - 1);
+ full_pages= record_rest / log_descriptor.page_capacity_chunk_2;
+ record_rest= (record_rest % log_descriptor.page_capacity_chunk_2);
+ last_page_capacity= page_capacity;
+ }
+ else
+ {
+ chunk2_page= full_pages= 0;
+ last_page_capacity= first_page;
+ }
+ chunk3_size= 0;
+ chunk3_pages= 0;
+ if (last_page_capacity > record_rest + 1 && record_rest != 0)
+ {
+ if (last_page_capacity >
+ record_rest + header_fixed_part + groups.elements * (7 + 1))
+ {
+ /* 1 record of type 0 */
+ chunk3_pages= 0;
+ }
+ else
+ {
+ chunk3_pages= 1;
+ if (record_rest + 2 == last_page_capacity)
+ {
+ chunk3_size= record_rest - 1;
+ record_rest= 1;
+ }
+ else
+ {
+ chunk3_size= record_rest;
+ record_rest= 0;
+ }
+ }
+ }
+ /*
+ A first non-full page will hold type 0 chunk only if it fit in it with
+ all its headers
+ */
+ while (page_capacity <
+ record_rest + header_fixed_part +
+ (groups.elements - groups_per_page * (chunk0_pages - 1)) * (7 + 1))
+ chunk0_pages++;
+ DBUG_PRINT("info", ("chunk0_pages: %u groups %u groups per full page: %u "
+ "Group on last page: %u",
+ chunk0_pages, groups.elements,
+ groups_per_page,
+ (groups.elements -
+ ((page_capacity - header_fixed_part) / (7 + 1)) *
+ (chunk0_pages - 1))));
+ DBUG_PRINT("info", ("first_page: %u chunk2: %u full_pages: %u (%lu) "
+ "chunk3: %u (%u) rest: %u",
+ first_page,
+ chunk2_page, full_pages,
+ (ulong) full_pages *
+ log_descriptor.page_capacity_chunk_2,
+ chunk3_pages, (uint) chunk3_size, (uint) record_rest));
+ rc= translog_advance_pointer(full_pages + chunk3_pages +
+ (chunk0_pages - 1),
+ record_rest + header_fixed_part +
+ (groups.elements -
+ ((page_capacity -
+ header_fixed_part) / (7 + 1)) *
+ (chunk0_pages - 1)) * (7 + 1));
+ rc|= translog_unlock();
+ if (rc)
+ goto err;
+
+ if (chunk2_page)
+ {
+ DBUG_PRINT("info", ("chunk 2 to finish first page"));
+ translog_write_data_on_page(&horizon, &cursor, 1, chunk2_header);
+ translog_write_parts_on_page(&horizon, &cursor, first_page - 1, parts);
+ DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx) "
+ "Left: %lu",
+ LSN_IN_PARTS(log_descriptor.horizon),
+ LSN_IN_PARTS(horizon),
+ (ulong) (parts->record_length - (first_page - 1) -
+ done)));
+ }
+ else if (chunk3_pages)
+ {
+ DBUG_PRINT("info", ("chunk 3"));
+ DBUG_ASSERT(full_pages == 0);
+ uchar chunk3_header[3];
+ chunk3_pages= 0;
+ chunk3_header[0]= TRANSLOG_CHUNK_LNGTH;
+ int2store(chunk3_header + 1, chunk3_size);
+ translog_write_data_on_page(&horizon, &cursor, 3, chunk3_header);
+ translog_write_parts_on_page(&horizon, &cursor, chunk3_size, parts);
+ DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx) "
+ "Left: %lu",
+ LSN_IN_PARTS(log_descriptor.horizon),
+ LSN_IN_PARTS(horizon),
+ (ulong) (parts->record_length - chunk3_size - done)));
+ }
+ else
+ {
+ DBUG_PRINT("info", ("no new_page_before_chunk0"));
+ new_page_before_chunk0= 0;
+ }
+
+ for (i= 0; i < full_pages; i++)
+ {
+ DBUG_ASSERT(chunk2_page != 0);
+ if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor))
+ goto err;
+
+ DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx) "
+ "Left: %lu",
+ LSN_IN_PARTS(log_descriptor.horizon),
+ LSN_IN_PARTS(horizon),
+ (ulong) (parts->record_length - (first_page - 1) -
+ i * log_descriptor.page_capacity_chunk_2 -
+ done)));
+ }
+
+ if (chunk3_pages &&
+ translog_write_variable_record_chunk3_page(parts,
+ chunk3_size,
+ &horizon, &cursor))
+ goto err;
+ DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.horizon),
+ LSN_IN_PARTS(horizon)));
+
+ *chunk0_header= (uchar) (type |TRANSLOG_CHUNK_LSN);
+ int2store(chunk0_header + 1, short_trid);
+ translog_write_variable_record_1group_code_len(chunk0_header + 3,
+ parts->record_length,
+ header_length);
+ do
+ {
+ int limit;
+ if (new_page_before_chunk0)
+ {
+ rc= translog_page_next(&horizon, &cursor, &buffer_to_flush);
+ if (buffer_to_flush != NULL)
+ {
+ rc|= translog_buffer_lock(buffer_to_flush);
+ translog_buffer_decrease_writers(buffer_to_flush);
+ if (!rc)
+ rc= translog_buffer_flush(buffer_to_flush);
+ rc|= translog_buffer_unlock(buffer_to_flush);
+ buffer_to_flush= NULL;
+ }
+ if (rc)
+ {
+ UNRECOVERABLE_ERROR(("flush of unlock buffer failed"));
+ goto err;
+ }
+ }
+ new_page_before_chunk0= 1;
+
+ if (first_chunk0)
+ {
+ first_chunk0= 0;
+ *lsn= horizon;
+ if (log_record_type_descriptor[type].inwrite_hook &&
+ (*log_record_type_descriptor[type].inwrite_hook) (type, trn,
+ tbl_info,
+ lsn, parts))
+ goto err;
+ }
+
+ /*
+ A first non-full page will hold type 0 chunk only if it fit in it with
+ all its headers => the fist page is full or number of groups less then
+ possible number of full page.
+ */
+ limit= (groups_per_page < groups.elements - curr_group ?
+ groups_per_page : groups.elements - curr_group);
+ DBUG_PRINT("info", ("Groups: %u curr: %u limit: %u",
+ (uint) groups.elements, (uint) curr_group,
+ (uint) limit));
+
+ if (chunk0_pages == 1)
+ {
+ DBUG_PRINT("info", ("chunk_len: 2 + %u * (7+1) + %u = %u",
+ (uint) limit, (uint) record_rest,
+ (uint) (2 + limit * (7 + 1) + record_rest)));
+ int2store(chunk0_header + header_length - 2,
+ 2 + limit * (7 + 1) + record_rest);
+ }
+ else
+ {
+ DBUG_PRINT("info", ("chunk_len: 2 + %u * (7+1) = %u",
+ (uint) limit, (uint) (2 + limit * (7 + 1))));
+ int2store(chunk0_header + header_length - 2, 2 + limit * (7 + 1));
+ }
+ int2store(chunk0_header + header_length, groups.elements - curr_group);
+ translog_write_data_on_page(&horizon, &cursor, header_fixed_part,
+ chunk0_header);
+ for (i= curr_group; i < limit + curr_group; i++)
+ {
+ struct st_translog_group_descriptor *grp_ptr;
+ grp_ptr= dynamic_element(&groups, i,
+ struct st_translog_group_descriptor *);
+ lsn_store(group_desc, grp_ptr->addr);
+ group_desc[7]= grp_ptr->num;
+ translog_write_data_on_page(&horizon, &cursor, (7 + 1), group_desc);
+ }
+
+ if (chunk0_pages == 1 && record_rest != 0)
+ translog_write_parts_on_page(&horizon, &cursor, record_rest, parts);
+
+ chunk0_pages--;
+ curr_group+= limit;
+
+ } while (chunk0_pages != 0);
+ rc= translog_buffer_lock(cursor.buffer);
+ if (cmp_translog_addr(cursor.buffer->last_lsn, *lsn) < 0)
+ cursor.buffer->last_lsn= *lsn;
+ translog_buffer_decrease_writers(cursor.buffer);
+ rc|= translog_buffer_unlock(cursor.buffer);
+
+ if (translog_set_lsn_for_files(file_of_the_first_group, LSN_FILE_NO(*lsn),
+ *lsn, FALSE))
+ goto err;
+ translog_mark_file_finished(file_of_the_first_group);
+
+
+ delete_dynamic(&groups);
+ DBUG_RETURN(rc);
+
+err_unlock:
+ translog_unlock();
+err:
+ delete_dynamic(&groups);
+ DBUG_RETURN(1);
+}
+
+
+/*
+ Write the variable length log record
+
+ SYNOPSIS
+ translog_write_variable_record()
+ lsn LSN of the record will be written here
+ type the log record type
+ short_trid Short transaction ID or 0 if it has no sense
+ parts Descriptor of record source parts
+ trn Transaction structure pointer for hooks by
+ record log type, for short_id
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_write_variable_record(LSN *lsn,
+ enum translog_record_type type,
+ MARIA_HA *tbl_info,
+ SHORT_TRANSACTION_ID short_trid,
+ struct st_translog_parts *parts,
+ TRN *trn)
+{
+ struct st_translog_buffer *buffer_to_flush= NULL;
+ uint header_length1= 1 + 2 + 2 +
+ translog_variable_record_length_bytes(parts->record_length);
+ ulong buffer_rest;
+ uint page_rest;
+ /* Max number of such LSNs per record is 2 */
+ uchar compressed_LSNs[MAX_NUMBER_OF_LSNS_PER_RECORD *
+ COMPRESSED_LSN_MAX_STORE_SIZE];
+ my_bool res;
+ DBUG_ENTER("translog_write_variable_record");
+
+ translog_lock();
+ DBUG_PRINT("info", ("horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.horizon)));
+ page_rest= TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill;
+ DBUG_PRINT("info", ("header length: %u page_rest: %u",
+ header_length1, page_rest));
+
+ /*
+ header and part which we should read have to fit in one chunk
+ TODO: allow to divide readable header
+ */
+ if (page_rest <
+ (header_length1 + log_record_type_descriptor[type].read_header_len))
+ {
+ DBUG_PRINT("info",
+ ("Next page, size: %u header: %u + %u",
+ log_descriptor.bc.current_page_fill,
+ header_length1,
+ log_record_type_descriptor[type].read_header_len));
+ translog_page_next(&log_descriptor.horizon, &log_descriptor.bc,
+ &buffer_to_flush);
+ /* Chunk 2 header is 1 byte, so full page capacity will be one uchar more */
+ page_rest= log_descriptor.page_capacity_chunk_2 + 1;
+ DBUG_PRINT("info", ("page_rest: %u", page_rest));
+ }
+
+ /*
+ To minimize compressed size we will compress always relative to
+ very first chunk address (log_descriptor.horizon for now)
+ */
+ if (log_record_type_descriptor[type].compressed_LSN > 0)
+ {
+ if (translog_relative_LSN_encode(parts, log_descriptor.horizon,
+ log_record_type_descriptor[type].
+ compressed_LSN, compressed_LSNs))
+ {
+ translog_unlock();
+ if (buffer_to_flush != NULL)
+ {
+ /*
+ It is just try to finish log in nice way in case of error, so we
+ do not check result of the following functions, because we are
+ going return error state in any case
+ */
+ translog_buffer_flush(buffer_to_flush);
+ translog_buffer_unlock(buffer_to_flush);
+ }
+ DBUG_RETURN(1);
+ }
+ /* recalculate header length after compression */
+ header_length1= 1 + 2 + 2 +
+ translog_variable_record_length_bytes(parts->record_length);
+ DBUG_PRINT("info", ("after compressing LSN(s) header length: %u "
+ "record length: %lu",
+ header_length1, (ulong)parts->record_length));
+ }
+
+ /* TODO: check space on current page for header + few bytes */
+ if (page_rest >= parts->record_length + header_length1)
+ {
+ /* following function makes translog_unlock(); */
+ res= translog_write_variable_record_1chunk(lsn, type, tbl_info,
+ short_trid,
+ parts, buffer_to_flush,
+ header_length1, trn);
+ DBUG_RETURN(res);
+ }
+
+ buffer_rest= translog_get_current_group_size();
+
+ if (buffer_rest >= parts->record_length + header_length1 - page_rest)
+ {
+ /* following function makes translog_unlock(); */
+ res= translog_write_variable_record_1group(lsn, type, tbl_info,
+ short_trid,
+ parts, buffer_to_flush,
+ header_length1, trn);
+ DBUG_RETURN(res);
+ }
+ /* following function makes translog_unlock(); */
+ res= translog_write_variable_record_mgroup(lsn, type, tbl_info,
+ short_trid,
+ parts, buffer_to_flush,
+ header_length1,
+ buffer_rest, trn);
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Write the fixed and pseudo-fixed log record
+
+ SYNOPSIS
+ translog_write_fixed_record()
+ lsn LSN of the record will be written here
+ type the log record type
+ short_trid Short transaction ID or 0 if it has no sense
+ parts Descriptor of record source parts
+ trn Transaction structure pointer for hooks by
+ record log type, for short_id
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_write_fixed_record(LSN *lsn,
+ enum translog_record_type type,
+ MARIA_HA *tbl_info,
+ SHORT_TRANSACTION_ID short_trid,
+ struct st_translog_parts *parts,
+ TRN *trn)
+{
+ struct st_translog_buffer *buffer_to_flush= NULL;
+ uchar chunk1_header[1 + 2];
+ /* Max number of such LSNs per record is 2 */
+ uchar compressed_LSNs[MAX_NUMBER_OF_LSNS_PER_RECORD *
+ COMPRESSED_LSN_MAX_STORE_SIZE];
+ LEX_STRING *part;
+ int rc;
+ DBUG_ENTER("translog_write_fixed_record");
+ DBUG_ASSERT((log_record_type_descriptor[type].class ==
+ LOGRECTYPE_FIXEDLENGTH &&
+ parts->record_length ==
+ log_record_type_descriptor[type].fixed_length) ||
+ (log_record_type_descriptor[type].class ==
+ LOGRECTYPE_PSEUDOFIXEDLENGTH &&
+ parts->record_length ==
+ log_record_type_descriptor[type].fixed_length));
+
+ translog_lock();
+ DBUG_PRINT("info", ("horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.horizon)));
+
+ DBUG_ASSERT(log_descriptor.bc.current_page_fill <= TRANSLOG_PAGE_SIZE);
+ DBUG_PRINT("info",
+ ("Page size: %u record: %u next cond: %d",
+ log_descriptor.bc.current_page_fill,
+ (parts->record_length +
+ log_record_type_descriptor[type].compressed_LSN * 2 + 3),
+ ((((uint) log_descriptor.bc.current_page_fill) +
+ (parts->record_length +
+ log_record_type_descriptor[type].compressed_LSN * 2 + 3)) >
+ TRANSLOG_PAGE_SIZE)));
+ /*
+ check that there is enough place on current page.
+ NOTE: compressing may increase page LSN size on two bytes for every LSN
+ */
+ if ((((uint) log_descriptor.bc.current_page_fill) +
+ (parts->record_length +
+ log_record_type_descriptor[type].compressed_LSN * 2 + 3)) >
+ TRANSLOG_PAGE_SIZE)
+ {
+ DBUG_PRINT("info", ("Next page"));
+ translog_page_next(&log_descriptor.horizon, &log_descriptor.bc,
+ &buffer_to_flush);
+ }
+
+ *lsn= log_descriptor.horizon;
+ if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn),
+ *lsn, TRUE) ||
+ (log_record_type_descriptor[type].inwrite_hook &&
+ (*log_record_type_descriptor[type].inwrite_hook) (type, trn, tbl_info,
+ lsn, parts)))
+ {
+ rc= 1;
+ goto err;
+ }
+
+ /* compress LSNs */
+ if (log_record_type_descriptor[type].class == LOGRECTYPE_PSEUDOFIXEDLENGTH)
+ {
+ DBUG_ASSERT(log_record_type_descriptor[type].compressed_LSN > 0);
+ if (translog_relative_LSN_encode(parts, *lsn,
+ log_record_type_descriptor[type].
+ compressed_LSN, compressed_LSNs))
+ {
+ rc= 1;
+ goto err;
+ }
+ }
+
+ /*
+ Write the whole record at once (we know that there is enough place on
+ the destination page)
+ */
+ DBUG_ASSERT(parts->current != 0); /* first part is left for header */
+ part= parts->parts + (--parts->current);
+ parts->total_record_length+= (part->length= 1 + 2);
+ part->str= (char*)chunk1_header;
+ *chunk1_header= (uchar) (type | TRANSLOG_CHUNK_FIXED);
+ int2store(chunk1_header + 1, short_trid);
+
+ rc= translog_write_parts_on_page(&log_descriptor.horizon,
+ &log_descriptor.bc,
+ parts->total_record_length, parts);
+
+ log_descriptor.bc.buffer->last_lsn= *lsn;
+
+err:
+ rc|= translog_unlock();
+
+ /*
+ check if we switched buffer and need process it (current buffer is
+ unlocked already => we will not delay other threads
+ */
+ if (buffer_to_flush != NULL)
+ {
+ if (!rc)
+ rc= translog_buffer_flush(buffer_to_flush);
+ rc|= translog_buffer_unlock(buffer_to_flush);
+ }
+
+ DBUG_RETURN(rc);
+}
+
+
+/**
+ @brief Writes the log record
+
+ If share has no 2-byte-id yet, gives an id to the share and logs
+ LOGREC_FILE_ID. If transaction has not logged LOGREC_LONG_TRANSACTION_ID
+ yet, logs it.
+
+ @param lsn LSN of the record will be written here
+ @param type the log record type
+ @param trn Transaction structure pointer for hooks by
+ record log type, for short_id
+ @param tbl_info MARIA_HA of table or NULL
+ @param rec_len record length or 0 (count it)
+ @param part_no number of parts or 0 (count it)
+ @param parts_data zero ended (in case of number of parts is 0)
+ array of LEX_STRINGs (parts), first
+ TRANSLOG_INTERNAL_PARTS positions in the log
+ should be unused (need for loghandler)
+ @param store_share_id if tbl_info!=NULL then share's id will
+ automatically be stored in the two first bytes
+ pointed (so pointer is assumed to be !=NULL)
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool translog_write_record(LSN *lsn,
+ enum translog_record_type type,
+ TRN *trn, MARIA_HA *tbl_info,
+ translog_size_t rec_len,
+ uint part_no,
+ LEX_STRING *parts_data,
+ uchar *store_share_id)
+{
+ struct st_translog_parts parts;
+ LEX_STRING *part;
+ int rc;
+ uint short_trid= trn->short_id;
+ DBUG_ENTER("translog_write_record");
+ DBUG_PRINT("enter", ("type: %u ShortTrID: %u rec_len: %lu",
+ (uint) type, (uint) short_trid, (ulong) rec_len));
+ DBUG_ASSERT(translog_inited == 1);
+
+ if (tbl_info)
+ {
+ MARIA_SHARE *share= tbl_info->s;
+ if (!share->now_transactional)
+ {
+ DBUG_PRINT("info", ("It is not transactional table"));
+ DBUG_RETURN(0);
+ }
+ if (unlikely(share->id == 0))
+ {
+ /*
+ First log write for this MARIA_SHARE; give it a short id.
+ When the lock manager is enabled and needs a short id, it should be
+ assigned in the lock manager (because row locks will be taken before
+ log records are written; for example SELECT FOR UPDATE takes locks but
+ writes no log record.
+ */
+ if (unlikely(translog_assign_id_to_share(tbl_info, trn)))
+ DBUG_RETURN(1);
+ }
+ fileid_store(store_share_id, share->id);
+ }
+ if (unlikely(!(trn->first_undo_lsn & TRANSACTION_LOGGED_LONG_ID)))
+ {
+ LSN dummy_lsn;
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+ uchar log_data[6];
+ int6store(log_data, trn->trid);
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+ trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; /* no recursion */
+ if (unlikely(translog_write_record(&dummy_lsn, LOGREC_LONG_TRANSACTION_ID,
+ trn, NULL, sizeof(log_data),
+ sizeof(log_array)/sizeof(log_array[0]),
+ log_array, NULL)))
+ DBUG_RETURN(1);
+ }
+
+ parts.parts= parts_data;
+
+ /* count parts if they are not counted by upper level */
+ if (part_no == 0)
+ {
+ for (part_no= TRANSLOG_INTERNAL_PARTS;
+ parts_data[part_no].length != 0;
+ part_no++);
+ }
+ parts.elements= part_no;
+ parts.current= TRANSLOG_INTERNAL_PARTS;
+
+ /* clear TRANSLOG_INTERNAL_PARTS */
+ DBUG_ASSERT(TRANSLOG_INTERNAL_PARTS != 0);
+ parts_data[0].str= 0;
+ parts_data[0].length= 0;
+
+ /* count length of the record */
+ if (rec_len == 0)
+ {
+ for(part= parts_data + TRANSLOG_INTERNAL_PARTS;\
+ part < parts_data + part_no;
+ part++)
+ {
+ rec_len+= part->length;
+ }
+ }
+ parts.record_length= rec_len;
+
+#ifndef DBUG_OFF
+ {
+ uint i;
+ uint len= 0;
+#ifdef HAVE_purify
+ ha_checksum checksum= 0;
+#endif
+ for (i= TRANSLOG_INTERNAL_PARTS; i < part_no; i++)
+ {
+#ifdef HAVE_purify
+ /* Find unitialized bytes early */
+ checksum+= my_checksum(checksum, parts_data[i].str,
+ parts_data[i].length);
+#endif
+ len+= parts_data[i].length;
+ }
+ DBUG_ASSERT(len == rec_len);
+ }
+#endif
+ /*
+ Start total_record_length from record_length then overhead will
+ be add
+ */
+ parts.total_record_length= parts.record_length;
+ DBUG_PRINT("info", ("record length: %lu", (ulong) parts.record_length));
+
+ /* process this parts */
+ if (!(rc= (log_record_type_descriptor[type].prewrite_hook &&
+ (*log_record_type_descriptor[type].prewrite_hook) (type, trn,
+ tbl_info,
+ &parts))))
+ {
+ switch (log_record_type_descriptor[type].class) {
+ case LOGRECTYPE_VARIABLE_LENGTH:
+ rc= translog_write_variable_record(lsn, type, tbl_info,
+ short_trid, &parts, trn);
+ break;
+ case LOGRECTYPE_PSEUDOFIXEDLENGTH:
+ case LOGRECTYPE_FIXEDLENGTH:
+ rc= translog_write_fixed_record(lsn, type, tbl_info,
+ short_trid, &parts, trn);
+ break;
+ case LOGRECTYPE_NOT_ALLOWED:
+ default:
+ DBUG_ASSERT(0);
+ rc= 1;
+ }
+ }
+
+ DBUG_PRINT("info", ("LSN: (%lu,0x%lx)", LSN_IN_PARTS(*lsn)));
+ DBUG_RETURN(rc);
+}
+
+
+/*
+ Decode compressed (relative) LSN(s)
+
+ SYNOPSIS
+ translog_relative_lsn_decode()
+ base_lsn LSN for encoding
+ src Decode LSN(s) from here
+ dst Put decoded LSNs here
+ lsns number of LSN(s)
+
+ RETURN
+ position in sources after decoded LSN(s)
+*/
+
+static uchar *translog_relative_LSN_decode(LSN base_lsn,
+ uchar *src, uchar *dst, uint lsns)
+{
+ uint i;
+ for (i= 0; i < lsns; i++, dst+= LSN_STORE_SIZE)
+ {
+ src= translog_get_LSN_from_diff(base_lsn, src, dst);
+ }
+ return src;
+}
+
+/**
+ @brief Get header of fixed/pseudo length record and call hook for
+ it processing
+
+ @param page Pointer to the buffer with page where LSN chunk is
+ placed
+ @param page_offset Offset of the first chunk in the page
+ @param buff Buffer to be filled with header data
+
+ @return Length of header or operation status
+ @retval # number of bytes in TRANSLOG_HEADER_BUFFER::header where
+ stored decoded part of the header
+*/
+
+static int translog_fixed_length_header(uchar *page,
+ translog_size_t page_offset,
+ TRANSLOG_HEADER_BUFFER *buff)
+{
+ struct st_log_record_type_descriptor *desc=
+ log_record_type_descriptor + buff->type;
+ uchar *src= page + page_offset + 3;
+ uchar *dst= buff->header;
+ uchar *start= src;
+ uint lsns= desc->compressed_LSN;
+ uint length= desc->fixed_length;
+
+ DBUG_ENTER("translog_fixed_length_header");
+
+ buff->record_length= length;
+
+ if (desc->class == LOGRECTYPE_PSEUDOFIXEDLENGTH)
+ {
+ DBUG_ASSERT(lsns > 0);
+ src= translog_relative_LSN_decode(buff->lsn, src, dst, lsns);
+ lsns*= LSN_STORE_SIZE;
+ dst+= lsns;
+ length-= lsns;
+ buff->compressed_LSN_economy= (lsns - (src - start));
+ }
+ else
+ buff->compressed_LSN_economy= 0;
+
+ memcpy(dst, src, length);
+ buff->non_header_data_start_offset= page_offset +
+ ((src + length) - (page + page_offset));
+ buff->non_header_data_len= 0;
+ DBUG_RETURN(buff->record_length);
+}
+
+
+/*
+ Free resources used by TRANSLOG_HEADER_BUFFER
+
+ SYNOPSIS
+ translog_free_record_header();
+*/
+
+void translog_free_record_header(TRANSLOG_HEADER_BUFFER *buff)
+{
+ DBUG_ENTER("translog_free_record_header");
+ DBUG_ASSERT(translog_inited == 1);
+ if (buff->groups_no != 0)
+ {
+ my_free((uchar*) buff->groups, MYF(0));
+ buff->groups_no= 0;
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief Returns the current horizon at the end of the current log
+
+ @return Horizon
+*/
+
+TRANSLOG_ADDRESS translog_get_horizon()
+{
+ TRANSLOG_ADDRESS res;
+ DBUG_ASSERT(translog_inited == 1);
+ translog_lock();
+ res= log_descriptor.horizon;
+ translog_unlock();
+ return res;
+}
+
+
+/**
+ @brief Returns the current horizon at the end of the current log, caller is
+ assumed to already hold the lock
+
+ @return Horizon
+*/
+
+TRANSLOG_ADDRESS translog_get_horizon_no_lock()
+{
+ DBUG_ASSERT(translog_inited == 1);
+ translog_lock_assert_owner();
+ return log_descriptor.horizon;
+}
+
+
+/*
+ Set last page in the scanner data structure
+
+ SYNOPSIS
+ translog_scanner_set_last_page()
+ scanner Information about current chunk during scanning
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_scanner_set_last_page(TRANSLOG_SCANNER_DATA
+ *scanner)
+{
+ my_bool page_ok;
+ scanner->last_file_page= scanner->page_addr;
+ return (translog_get_last_page_addr(&scanner->last_file_page, &page_ok));
+}
+
+
+/*
+ Initialize reader scanner
+
+ SYNOPSIS
+ translog_init_scanner()
+ lsn LSN with which it have to be inited
+ fixed_horizon true if it is OK do not read records which was written
+ after scanning beginning
+ scanner scanner which have to be inited
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+my_bool translog_init_scanner(LSN lsn,
+ my_bool fixed_horizon,
+ struct st_translog_scanner_data *scanner)
+{
+ TRANSLOG_VALIDATOR_DATA data;
+ DBUG_ENTER("translog_init_scanner");
+ DBUG_PRINT("enter", ("LSN: (0x%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+ DBUG_ASSERT(LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE != 0);
+ DBUG_ASSERT(translog_inited == 1);
+
+ data.addr= &scanner->page_addr;
+ data.was_recovered= 0;
+
+ scanner->page_offset= LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE;
+
+ scanner->fixed_horizon= fixed_horizon;
+
+ scanner->horizon= translog_get_horizon();
+ DBUG_PRINT("info", ("horizon: (0x%lu,0x%lx)",
+ LSN_IN_PARTS(scanner->horizon)));
+
+ /* lsn < horizon */
+ DBUG_ASSERT(lsn < scanner->horizon);
+
+ scanner->page_addr= lsn;
+ scanner->page_addr-= scanner->page_offset; /*decrease offset */
+
+ if (translog_scanner_set_last_page(scanner))
+ DBUG_RETURN(1);
+
+ if ((scanner->page= translog_get_page(&data, scanner->buffer)) == NULL)
+ DBUG_RETURN(1);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Checks End of the Log
+
+ SYNOPSIS
+ translog_scanner_eol()
+ scanner Information about current chunk during scanning
+
+ RETURN
+ 1 End of the Log
+ 0 OK
+*/
+
+static my_bool translog_scanner_eol(TRANSLOG_SCANNER_DATA *scanner)
+{
+ DBUG_ENTER("translog_scanner_eol");
+ DBUG_PRINT("enter",
+ ("Horizon: (%lu, 0x%lx) Current: (%lu, 0x%lx+0x%x=0x%lx)",
+ LSN_IN_PARTS(scanner->horizon),
+ LSN_IN_PARTS(scanner->page_addr),
+ (uint) scanner->page_offset,
+ (ulong) (LSN_OFFSET(scanner->page_addr) + scanner->page_offset)));
+ if (scanner->horizon > (scanner->page_addr +
+ scanner->page_offset))
+ {
+ DBUG_PRINT("info", ("Horizon is not reached"));
+ DBUG_RETURN(0);
+ }
+ if (scanner->fixed_horizon)
+ {
+ DBUG_PRINT("info", ("Horizon is fixed and reached"));
+ DBUG_RETURN(1);
+ }
+ scanner->horizon= translog_get_horizon();
+ DBUG_PRINT("info",
+ ("Horizon is re-read, EOL: %d",
+ scanner->horizon <= (scanner->page_addr +
+ scanner->page_offset)));
+ DBUG_RETURN(scanner->horizon <= (scanner->page_addr +
+ scanner->page_offset));
+}
+
+
+/*
+ Cheks End of the Page
+
+ SYNOPSIS
+ translog_scanner_eop()
+ scanner Information about current chunk during scanning
+
+ RETURN
+ 1 End of the Page
+ 0 OK
+*/
+
+static my_bool translog_scanner_eop(TRANSLOG_SCANNER_DATA *scanner)
+{
+ DBUG_ENTER("translog_scanner_eop");
+ DBUG_RETURN(scanner->page_offset >= TRANSLOG_PAGE_SIZE ||
+ scanner->page[scanner->page_offset] == 0);
+}
+
+
+/*
+ Checks End of the File (I.e. we are scanning last page, which do not
+ mean end of this page)
+
+ SYNOPSIS
+ translog_scanner_eof()
+ scanner Information about current chunk during scanning
+
+ RETURN
+ 1 End of the File
+ 0 OK
+*/
+
+static my_bool translog_scanner_eof(TRANSLOG_SCANNER_DATA *scanner)
+{
+ DBUG_ENTER("translog_scanner_eof");
+ DBUG_ASSERT(LSN_FILE_NO(scanner->page_addr) ==
+ LSN_FILE_NO(scanner->last_file_page));
+ DBUG_PRINT("enter", ("curr Page: 0x%lx last page: 0x%lx "
+ "normal EOF: %d",
+ (ulong) LSN_OFFSET(scanner->page_addr),
+ (ulong) LSN_OFFSET(scanner->last_file_page),
+ LSN_OFFSET(scanner->page_addr) ==
+ LSN_OFFSET(scanner->last_file_page)));
+ /*
+ TODO: detect damaged file EOF,
+ TODO: issue warning if damaged file EOF detected
+ */
+ DBUG_RETURN(scanner->page_addr ==
+ scanner->last_file_page);
+}
+
+
+/*
+ Move scanner to the next chunk
+
+ SYNOPSIS
+ translog_get_next_chunk()
+ scanner Information about current chunk during scanning
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool
+translog_get_next_chunk(TRANSLOG_SCANNER_DATA *scanner)
+{
+ uint16 len;
+ TRANSLOG_VALIDATOR_DATA data;
+ DBUG_ENTER("translog_get_next_chunk");
+
+ if ((len= translog_get_total_chunk_length(scanner->page,
+ scanner->page_offset)) == 0)
+ DBUG_RETURN(1);
+ scanner->page_offset+= len;
+
+ if (translog_scanner_eol(scanner))
+ {
+ scanner->page= &end_of_log;
+ scanner->page_offset= 0;
+ DBUG_RETURN(0);
+ }
+ if (translog_scanner_eop(scanner))
+ {
+ if (translog_scanner_eof(scanner))
+ {
+ DBUG_PRINT("info", ("horizon: (%lu,0x%lx) pageaddr: (%lu,0x%lx)",
+ LSN_IN_PARTS(scanner->horizon),
+ LSN_IN_PARTS(scanner->page_addr)));
+ /* if it is log end it have to be caught before */
+ DBUG_ASSERT(LSN_FILE_NO(scanner->horizon) >
+ LSN_FILE_NO(scanner->page_addr));
+ scanner->page_addr+= LSN_ONE_FILE;
+ scanner->page_addr= LSN_REPLACE_OFFSET(scanner->page_addr,
+ TRANSLOG_PAGE_SIZE);
+ if (translog_scanner_set_last_page(scanner))
+ DBUG_RETURN(1);
+ }
+ else
+ {
+ scanner->page_addr+= TRANSLOG_PAGE_SIZE; /* offset increased */
+ }
+
+ data.addr= &scanner->page_addr;
+ data.was_recovered= 0;
+ if ((scanner->page= translog_get_page(&data, scanner->buffer)) == NULL)
+ DBUG_RETURN(1);
+
+ scanner->page_offset= translog_get_first_chunk_offset(scanner->page);
+ if (translog_scanner_eol(scanner))
+ {
+ scanner->page= &end_of_log;
+ scanner->page_offset= 0;
+ DBUG_RETURN(0);
+ }
+ DBUG_ASSERT(scanner->page[scanner->page_offset]);
+ }
+ DBUG_RETURN(0);
+}
+
+
+/**
+ @brief Get header of variable length record and call hook for it processing
+
+ @param page Pointer to the buffer with page where LSN chunk is
+ placed
+ @param page_offset Offset of the first chunk in the page
+ @param buff Buffer to be filled with header data
+ @param scanner If present should be moved to the header page if
+ it differ from LSN page
+
+ @return Length of header or operation status
+ @retval RECHEADER_READ_ERROR error
+ @retval # number of bytes in
+ TRANSLOG_HEADER_BUFFER::header where
+ stored decoded part of the header
+*/
+
+static int
+translog_variable_length_header(uchar *page, translog_size_t page_offset,
+ TRANSLOG_HEADER_BUFFER *buff,
+ TRANSLOG_SCANNER_DATA *scanner)
+{
+ struct st_log_record_type_descriptor *desc= (log_record_type_descriptor +
+ buff->type);
+ uchar *src= page + page_offset + 1 + 2;
+ uchar *dst= buff->header;
+ LSN base_lsn;
+ uint lsns= desc->compressed_LSN;
+ uint16 chunk_len;
+ uint16 length= desc->read_header_len;
+ uint16 buffer_length= length;
+ uint16 body_len;
+ TRANSLOG_SCANNER_DATA internal_scanner;
+ DBUG_ENTER("translog_variable_length_header");
+
+ buff->record_length= translog_variable_record_1group_decode_len(&src);
+ chunk_len= uint2korr(src);
+ DBUG_PRINT("info", ("rec len: %lu chunk len: %u length: %u bufflen: %u",
+ (ulong) buff->record_length, (uint) chunk_len,
+ (uint) length, (uint) buffer_length));
+ if (chunk_len == 0)
+ {
+ uint16 page_rest;
+ DBUG_PRINT("info", ("1 group"));
+ src+= 2;
+ page_rest= TRANSLOG_PAGE_SIZE - (src - page);
+
+ base_lsn= buff->lsn;
+ body_len= min(page_rest, buff->record_length);
+ }
+ else
+ {
+ uint grp_no, curr;
+ uint header_to_skip;
+ uint16 page_rest;
+
+ DBUG_PRINT("info", ("multi-group"));
+ grp_no= buff->groups_no= uint2korr(src + 2);
+ if (!(buff->groups=
+ (TRANSLOG_GROUP*) my_malloc(sizeof(TRANSLOG_GROUP) * grp_no,
+ MYF(0))))
+ DBUG_RETURN(RECHEADER_READ_ERROR);
+ DBUG_PRINT("info", ("Groups: %u", (uint) grp_no));
+ src+= (2 + 2);
+ page_rest= TRANSLOG_PAGE_SIZE - (src - page);
+ curr= 0;
+ header_to_skip= src - (page + page_offset);
+ buff->chunk0_pages= 0;
+
+ for (;;)
+ {
+ uint i, read= grp_no;
+
+ buff->chunk0_pages++;
+ if (page_rest < grp_no * (7 + 1))
+ read= page_rest / (7 + 1);
+ DBUG_PRINT("info", ("Read chunk0 page#%u read: %u left: %u "
+ "start from: %u",
+ buff->chunk0_pages, read, grp_no, curr));
+ for (i= 0; i < read; i++, curr++)
+ {
+ DBUG_ASSERT(curr < buff->groups_no);
+ buff->groups[curr].addr= lsn_korr(src + i * (7 + 1));
+ buff->groups[curr].num= src[i * (7 + 1) + 7];
+ DBUG_PRINT("info", ("group #%u (%lu,0x%lx) chunks: %u",
+ curr,
+ LSN_IN_PARTS(buff->groups[curr].addr),
+ (uint) buff->groups[curr].num));
+ }
+ grp_no-= read;
+ if (grp_no == 0)
+ {
+ if (scanner)
+ {
+ buff->chunk0_data_addr= scanner->page_addr;
+ buff->chunk0_data_addr+= (page_offset + header_to_skip +
+ read * (7 + 1)); /* offset increased */
+ }
+ else
+ {
+ buff->chunk0_data_addr= buff->lsn;
+ /* offset increased */
+ buff->chunk0_data_addr+= (header_to_skip + read * (7 + 1));
+ }
+ buff->chunk0_data_len= chunk_len - 2 - read * (7 + 1);
+ DBUG_PRINT("info", ("Data address: (%lu,0x%lx) len: %u",
+ LSN_IN_PARTS(buff->chunk0_data_addr),
+ buff->chunk0_data_len));
+ break;
+ }
+ if (scanner == NULL)
+ {
+ DBUG_PRINT("info", ("use internal scanner for header reading"));
+ scanner= &internal_scanner;
+ if (translog_init_scanner(buff->lsn, 1, scanner))
+ DBUG_RETURN(RECHEADER_READ_ERROR);
+ }
+ if (translog_get_next_chunk(scanner))
+ DBUG_RETURN(RECHEADER_READ_ERROR);
+ page= scanner->page;
+ page_offset= scanner->page_offset;
+ src= page + page_offset + header_to_skip;
+ chunk_len= uint2korr(src - 2 - 2);
+ DBUG_PRINT("info", ("Chunk len: %u", (uint) chunk_len));
+ page_rest= TRANSLOG_PAGE_SIZE - (src - page);
+ }
+
+ if (scanner == NULL)
+ {
+ DBUG_PRINT("info", ("use internal scanner"));
+ scanner= &internal_scanner;
+ }
+
+ base_lsn= buff->groups[0].addr;
+ translog_init_scanner(base_lsn, 1, scanner);
+ /* first group chunk is always chunk type 2 */
+ page= scanner->page;
+ page_offset= scanner->page_offset;
+ src= page + page_offset + 1;
+ page_rest= TRANSLOG_PAGE_SIZE - (src - page);
+ body_len= page_rest;
+ }
+ if (lsns)
+ {
+ uchar *start= src;
+ src= translog_relative_LSN_decode(base_lsn, src, dst, lsns);
+ lsns*= LSN_STORE_SIZE;
+ dst+= lsns;
+ length-= lsns;
+ buff->record_length+= (buff->compressed_LSN_economy=
+ (lsns - (src - start)));
+ DBUG_PRINT("info", ("lsns: %u length: %u economy: %d new length: %lu",
+ lsns / LSN_STORE_SIZE, (uint) length,
+ (int) buff->compressed_LSN_economy,
+ (ulong) buff->record_length));
+ body_len-= (src - start);
+ }
+ else
+ buff->compressed_LSN_economy= 0;
+
+ DBUG_ASSERT(body_len >= length);
+ body_len-= length;
+ memcpy(dst, src, length);
+ buff->non_header_data_start_offset= src + length - page;
+ buff->non_header_data_len= body_len;
+ DBUG_PRINT("info", ("non_header_data_start_offset: %u len: %u buffer: %u",
+ buff->non_header_data_start_offset,
+ buff->non_header_data_len, buffer_length));
+ DBUG_RETURN(buffer_length);
+}
+
+
+/**
+ @brief Read record header from the given buffer
+
+ @param page page content buffer
+ @param page_offset offset of the chunk in the page
+ @param buff destination buffer
+ @param scanner If this is set the scanner will be moved to the
+ record header page (differ from LSN page in case of
+ multi-group records)
+
+ @return Length of header or operation status
+ @retval RECHEADER_READ_ERROR error
+ @retval # number of bytes in
+ TRANSLOG_HEADER_BUFFER::header where
+ stored decoded part of the header
+*/
+
+int translog_read_record_header_from_buffer(uchar *page,
+ uint16 page_offset,
+ TRANSLOG_HEADER_BUFFER *buff,
+ TRANSLOG_SCANNER_DATA *scanner)
+{
+ translog_size_t res;
+ DBUG_ENTER("translog_read_record_header_from_buffer");
+ DBUG_ASSERT((page[page_offset] & TRANSLOG_CHUNK_TYPE) ==
+ TRANSLOG_CHUNK_LSN ||
+ (page[page_offset] & TRANSLOG_CHUNK_TYPE) ==
+ TRANSLOG_CHUNK_FIXED);
+ DBUG_ASSERT(translog_inited == 1);
+ buff->type= (page[page_offset] & TRANSLOG_REC_TYPE);
+ buff->short_trid= uint2korr(page + page_offset + 1);
+ DBUG_PRINT("info", ("Type %u, Short TrID %u, LSN (%lu,0x%lx)",
+ (uint) buff->type, (uint)buff->short_trid,
+ LSN_IN_PARTS(buff->lsn)));
+ /* Read required bytes from the header and call hook */
+ switch (log_record_type_descriptor[buff->type].class) {
+ case LOGRECTYPE_VARIABLE_LENGTH:
+ res= translog_variable_length_header(page, page_offset, buff,
+ scanner);
+ break;
+ case LOGRECTYPE_PSEUDOFIXEDLENGTH:
+ case LOGRECTYPE_FIXEDLENGTH:
+ res= translog_fixed_length_header(page, page_offset, buff);
+ break;
+ default:
+ DBUG_ASSERT(0); /* we read some junk (got no LSN) */
+ res= RECHEADER_READ_ERROR;
+ }
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Read record header and some fixed part of a record (the part depend
+ on record type).
+
+ @param lsn log record serial number (address of the record)
+ @param buff log record header buffer
+
+ @note Some type of record can be read completely by this call
+ @note "Decoded" header stored in TRANSLOG_HEADER_BUFFER::header (relative
+ LSN can be translated to absolute one), some fields can be added (like
+ actual header length in the record if the header has variable length)
+
+ @return Length of header or operation status
+ @retval RECHEADER_READ_ERROR error
+ @retval # number of bytes in
+ TRANSLOG_HEADER_BUFFER::header where
+ stored decoded part of the header
+*/
+
+int translog_read_record_header(LSN lsn, TRANSLOG_HEADER_BUFFER *buff)
+{
+ uchar buffer[TRANSLOG_PAGE_SIZE], *page;
+ translog_size_t res, page_offset= LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE;
+ TRANSLOG_ADDRESS addr;
+ TRANSLOG_VALIDATOR_DATA data;
+ DBUG_ENTER("translog_read_record_header");
+ DBUG_PRINT("enter", ("LSN: (0x%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+ DBUG_ASSERT(LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE != 0);
+ DBUG_ASSERT(translog_inited == 1);
+
+ buff->lsn= lsn;
+ buff->groups_no= 0;
+ data.addr= &addr;
+ data.was_recovered= 0;
+ addr= lsn;
+ addr-= page_offset; /* offset decreasing */
+ res= (!(page= translog_get_page(&data, buffer))) ? RECHEADER_READ_ERROR :
+ translog_read_record_header_from_buffer(page, page_offset, buff, 0);
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Read record header and some fixed part of a record (the part depend
+ on record type).
+
+ @param scan scanner position to read
+ @param buff log record header buffer
+ @param move_scanner request to move scanner to the header position
+
+ @note Some type of record can be read completely by this call
+ @note "Decoded" header stored in TRANSLOG_HEADER_BUFFER::header (relative
+ LSN can be translated to absolute one), some fields can be added (like
+ actual header length in the record if the header has variable length)
+
+ @return Length of header or operation status
+ @retval RECHEADER_READ_ERROR error
+ @retval # number of bytes in
+ TRANSLOG_HEADER_BUFFER::header where stored
+ decoded part of the header
+*/
+
+int translog_read_record_header_scan(TRANSLOG_SCANNER_DATA *scanner,
+ TRANSLOG_HEADER_BUFFER *buff,
+ my_bool move_scanner)
+{
+ translog_size_t res;
+ DBUG_ENTER("translog_read_record_header_scan");
+ DBUG_PRINT("enter", ("Scanner: Cur: (%lu,0x%lx) Hrz: (%lu,0x%lx) "
+ "Lst: (%lu,0x%lx) Offset: %u(%x) fixed %d",
+ LSN_IN_PARTS(scanner->page_addr),
+ LSN_IN_PARTS(scanner->horizon),
+ LSN_IN_PARTS(scanner->last_file_page),
+ (uint) scanner->page_offset,
+ (uint) scanner->page_offset, scanner->fixed_horizon));
+ DBUG_ASSERT(translog_inited == 1);
+ buff->groups_no= 0;
+ buff->lsn= scanner->page_addr;
+ buff->lsn+= scanner->page_offset; /* offset increasing */
+ res= translog_read_record_header_from_buffer(scanner->page,
+ scanner->page_offset,
+ buff,
+ (move_scanner ?
+ scanner : 0));
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Read record header and some fixed part of the next record (the part
+ depend on record type).
+
+ @param scanner data for scanning if lsn is NULL scanner data
+ will be used for continue scanning.
+ The scanner can be NULL.
+
+ @param buff log record header buffer
+
+ @return Length of header or operation status
+ @retval RECHEADER_READ_ERROR error
+ @retval RECHEADER_READ_EOF EOF
+ @retval # number of bytes in
+ TRANSLOG_HEADER_BUFFER::header where
+ stored decoded part of the header
+*/
+
+int translog_read_next_record_header(TRANSLOG_SCANNER_DATA *scanner,
+ TRANSLOG_HEADER_BUFFER *buff)
+{
+ uint8 chunk_type;
+ translog_size_t res;
+ buff->groups_no= 0; /* to be sure that we will free it right */
+
+ DBUG_ENTER("translog_read_next_record_header");
+ DBUG_PRINT("enter", ("scanner: 0x%lx", (ulong) scanner));
+ DBUG_PRINT("info", ("Scanner: Cur: (%lu,0x%lx) Hrz: (%lu,0x%lx) "
+ "Lst: (%lu,0x%lx) Offset: %u(%x) fixed: %d",
+ LSN_IN_PARTS(scanner->page_addr),
+ LSN_IN_PARTS(scanner->horizon),
+ LSN_IN_PARTS(scanner->last_file_page),
+ (uint) scanner->page_offset,
+ (uint) scanner->page_offset, scanner->fixed_horizon));
+ DBUG_ASSERT(translog_inited == 1);
+
+ do
+ {
+ if (translog_get_next_chunk(scanner))
+ DBUG_RETURN(RECHEADER_READ_ERROR);
+ chunk_type= scanner->page[scanner->page_offset] & TRANSLOG_CHUNK_TYPE;
+ DBUG_PRINT("info", ("type: %x byte: %x", (uint) chunk_type,
+ (uint) scanner->page[scanner->page_offset]));
+ } while (chunk_type != TRANSLOG_CHUNK_LSN && chunk_type !=
+ TRANSLOG_CHUNK_FIXED && scanner->page[scanner->page_offset] != 0);
+
+ if (scanner->page[scanner->page_offset] == 0)
+ {
+ /* Last record was read */
+ buff->lsn= LSN_IMPOSSIBLE;
+ /* Return 'end of log' marker */
+ res= RECHEADER_READ_EOF;
+ }
+ else
+ res= translog_read_record_header_scan(scanner, buff, 0);
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Moves record data reader to the next chunk and fill the data reader
+ information about that chunk.
+
+ SYNOPSIS
+ translog_record_read_next_chunk()
+ data data cursor
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_record_read_next_chunk(struct st_translog_reader_data
+ *data)
+{
+ translog_size_t new_current_offset= data->current_offset + data->chunk_size;
+ uint16 chunk_header_len, chunk_len;
+ uint8 type;
+ DBUG_ENTER("translog_record_read_next_chunk");
+
+ if (data->eor)
+ {
+ DBUG_PRINT("info", ("end of the record flag set"));
+ DBUG_RETURN(1);
+ }
+
+ if (data->header.groups_no &&
+ data->header.groups_no - 1 != data->current_group &&
+ data->header.groups[data->current_group].num == data->current_chunk)
+ {
+ /* Goto next group */
+ data->current_group++;
+ data->current_chunk= 0;
+ DBUG_PRINT("info", ("skip to group: #%u", data->current_group));
+ translog_init_scanner(data->header.groups[data->current_group].addr,
+ 1, &data->scanner);
+ }
+ else
+ {
+ data->current_chunk++;
+ if (translog_get_next_chunk(&data->scanner))
+ DBUG_RETURN(1);
+ }
+ type= data->scanner.page[data->scanner.page_offset] & TRANSLOG_CHUNK_TYPE;
+
+ if (type == TRANSLOG_CHUNK_LSN && data->header.groups_no)
+ {
+ DBUG_PRINT("info",
+ ("Last chunk: data len: %u offset: %u group: %u of %u",
+ data->header.chunk0_data_len, data->scanner.page_offset,
+ data->current_group, data->header.groups_no - 1));
+ DBUG_ASSERT(data->header.groups_no - 1 == data->current_group);
+ DBUG_ASSERT(data->header.lsn ==
+ data->scanner.page_addr + data->scanner.page_offset);
+ translog_init_scanner(data->header.chunk0_data_addr, 1, &data->scanner);
+ data->chunk_size= data->header.chunk0_data_len;
+ data->body_offset= data->scanner.page_offset;
+ data->current_offset= new_current_offset;
+ data->eor= 1;
+ DBUG_RETURN(0);
+ }
+
+ if (type == TRANSLOG_CHUNK_LSN || type == TRANSLOG_CHUNK_FIXED)
+ {
+ data->eor= 1;
+ DBUG_RETURN(1); /* End of record */
+ }
+
+ chunk_header_len=
+ translog_get_chunk_header_length(data->scanner.page,
+ data->scanner.page_offset);
+ chunk_len= translog_get_total_chunk_length(data->scanner.page,
+ data->scanner.page_offset);
+ data->chunk_size= chunk_len - chunk_header_len;
+ data->body_offset= data->scanner.page_offset + chunk_header_len;
+ data->current_offset= new_current_offset;
+ DBUG_PRINT("info", ("grp: %u chunk: %u body_offset: %u chunk_size: %u "
+ "current_offset: %lu",
+ (uint) data->current_group,
+ (uint) data->current_chunk,
+ (uint) data->body_offset,
+ (uint) data->chunk_size, (ulong) data->current_offset));
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Initialize record reader data from LSN
+
+ SYNOPSIS
+ translog_init_reader_data()
+ lsn reference to LSN we should start from
+ data reader data to initialize
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_init_reader_data(LSN lsn,
+ struct st_translog_reader_data *data)
+{
+ int read_header;
+ DBUG_ENTER("translog_init_reader_data");
+ if (translog_init_scanner(lsn, 1, &data->scanner) ||
+ ((read_header=
+ translog_read_record_header_scan(&data->scanner, &data->header, 1))
+ == RECHEADER_READ_ERROR))
+ DBUG_RETURN(1);
+ data->read_header= read_header;
+ data->body_offset= data->header.non_header_data_start_offset;
+ data->chunk_size= data->header.non_header_data_len;
+ data->current_offset= data->read_header;
+ data->current_group= 0;
+ data->current_chunk= 0;
+ data->eor= 0;
+ DBUG_PRINT("info", ("read_header: %u "
+ "body_offset: %u chunk_size: %u current_offset: %lu",
+ (uint) data->read_header,
+ (uint) data->body_offset,
+ (uint) data->chunk_size, (ulong) data->current_offset));
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Read a part of the record.
+
+ SYNOPSIS
+ translog_read_record_header()
+ lsn log record serial number (address of the record)
+ offset From the beginning of the record beginning (read§
+ by translog_read_record_header).
+ length Length of record part which have to be read.
+ buffer Buffer where to read the record part (have to be at
+ least 'length' bytes length)
+
+ RETURN
+ length of data actually read
+*/
+
+translog_size_t translog_read_record(LSN lsn,
+ translog_size_t offset,
+ translog_size_t length,
+ uchar *buffer,
+ struct st_translog_reader_data *data)
+{
+ translog_size_t requested_length= length;
+ translog_size_t end= offset + length;
+ struct st_translog_reader_data internal_data;
+ DBUG_ENTER("translog_read_record");
+ DBUG_ASSERT(translog_inited == 1);
+
+ if (data == NULL)
+ {
+ DBUG_ASSERT(lsn != LSN_IMPOSSIBLE);
+ data= &internal_data;
+ }
+ if (lsn ||
+ (offset < data->current_offset &&
+ !(offset < data->read_header && offset + length < data->read_header)))
+ {
+ if (translog_init_reader_data(lsn, data))
+ DBUG_RETURN(0);
+ }
+ DBUG_PRINT("info", ("Offset: %lu length: %lu "
+ "Scanner: Cur: (%lu,0x%lx) Hrz: (%lu,0x%lx) "
+ "Lst: (%lu,0x%lx) Offset: %u(%x) fixed: %d",
+ (ulong) offset, (ulong) length,
+ LSN_IN_PARTS(data->scanner.page_addr),
+ LSN_IN_PARTS(data->scanner.horizon),
+ LSN_IN_PARTS(data->scanner.last_file_page),
+ (uint) data->scanner.page_offset,
+ (uint) data->scanner.page_offset,
+ data->scanner.fixed_horizon));
+ if (offset < data->read_header)
+ {
+ uint16 len= min(data->read_header, end) - offset;
+ DBUG_PRINT("info",
+ ("enter header offset: %lu length: %lu",
+ (ulong) offset, (ulong) length));
+ memcpy(buffer, data->header.header + offset, len);
+ length-= len;
+ if (length == 0)
+ DBUG_RETURN(requested_length);
+ offset+= len;
+ buffer+= len;
+ DBUG_PRINT("info",
+ ("len: %u offset: %lu curr: %lu length: %lu",
+ len, (ulong) offset, (ulong) data->current_offset,
+ (ulong) length));
+ }
+ /* TODO: find first page which we should read by offset */
+
+ /* read the record chunk by chunk */
+ for(;;)
+ {
+ uint page_end= data->current_offset + data->chunk_size;
+ DBUG_PRINT("info",
+ ("enter body offset: %lu curr: %lu "
+ "length: %lu page_end: %lu",
+ (ulong) offset, (ulong) data->current_offset, (ulong) length,
+ (ulong) page_end));
+ if (offset < page_end)
+ {
+ uint len= page_end - offset;
+ DBUG_ASSERT(offset >= data->current_offset);
+ memcpy(buffer,
+ data->scanner.page + data->body_offset +
+ (offset - data->current_offset), len);
+ length-= len;
+ if (length == 0)
+ DBUG_RETURN(requested_length);
+ offset+= len;
+ buffer+= len;
+ DBUG_PRINT("info",
+ ("len: %u offset: %lu curr: %lu length: %lu",
+ len, (ulong) offset, (ulong) data->current_offset,
+ (ulong) length));
+ }
+ if (translog_record_read_next_chunk(data))
+ DBUG_RETURN(requested_length - length);
+ }
+}
+
+
+/*
+ Force skipping to the next buffer
+
+ SYNOPSIS
+ translog_force_current_buffer_to_finish()
+*/
+
+static void translog_force_current_buffer_to_finish()
+{
+ TRANSLOG_ADDRESS new_buff_beginning;
+ uint16 old_buffer_no= log_descriptor.bc.buffer_no;
+ uint16 new_buffer_no= (old_buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+ struct st_translog_buffer *new_buffer= (log_descriptor.buffers +
+ new_buffer_no);
+ struct st_translog_buffer *old_buffer= log_descriptor.bc.buffer;
+ uchar *data= log_descriptor.bc.ptr - log_descriptor.bc.current_page_fill;
+ uint16 left= TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill;
+ uint16 current_page_fill, write_counter, previous_offset;
+ DBUG_ENTER("translog_force_current_buffer_to_finish");
+ DBUG_PRINT("enter", ("Buffer #%u 0x%lx "
+ "Buffer addr: (%lu,0x%lx) "
+ "Page addr: (%lu,0x%lx) "
+ "size: %lu (%lu) Pg: %u left: %u",
+ (uint) log_descriptor.bc.buffer_no,
+ (ulong) log_descriptor.bc.buffer,
+ LSN_IN_PARTS(log_descriptor.bc.buffer->offset),
+ (ulong) LSN_FILE_NO(log_descriptor.horizon),
+ (ulong) (LSN_OFFSET(log_descriptor.horizon) -
+ log_descriptor.bc.current_page_fill),
+ (ulong) log_descriptor.bc.buffer->size,
+ (ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
+ buffer->buffer),
+ (uint) log_descriptor.bc.current_page_fill,
+ (uint) left));
+
+ LINT_INIT(current_page_fill);
+ new_buff_beginning= log_descriptor.bc.buffer->offset;
+ new_buff_beginning+= log_descriptor.bc.buffer->size; /* increase offset */
+
+ DBUG_ASSERT(log_descriptor.bc.ptr !=NULL);
+ DBUG_ASSERT(LSN_FILE_NO(log_descriptor.horizon) ==
+ LSN_FILE_NO(log_descriptor.bc.buffer->offset));
+ DBUG_EXECUTE("info", translog_check_cursor(&log_descriptor.bc););
+ DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE);
+ if (left != 0)
+ {
+ /*
+ TODO: if 'left' is so small that can't hold any other record
+ then do not move the page
+ */
+ DBUG_PRINT("info", ("left: %u", (uint) left));
+
+ /* decrease offset */
+ new_buff_beginning-= log_descriptor.bc.current_page_fill;
+ current_page_fill= log_descriptor.bc.current_page_fill;
+
+ bzero(log_descriptor.bc.ptr, left);
+ log_descriptor.bc.buffer->size+= left;
+ DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx "
+ "Size: %lu",
+ (uint) log_descriptor.bc.buffer->buffer_no,
+ (ulong) log_descriptor.bc.buffer,
+ (ulong) log_descriptor.bc.buffer->size));
+ DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no ==
+ log_descriptor.bc.buffer_no);
+ }
+ else
+ {
+ log_descriptor.bc.current_page_fill= 0;
+ }
+
+ translog_buffer_lock(new_buffer);
+ translog_wait_for_buffer_free(new_buffer);
+
+ write_counter= log_descriptor.bc.write_counter;
+ previous_offset= log_descriptor.bc.previous_offset;
+ translog_start_buffer(new_buffer, &log_descriptor.bc, new_buffer_no);
+ log_descriptor.bc.buffer->offset= new_buff_beginning;
+ log_descriptor.bc.write_counter= write_counter;
+ log_descriptor.bc.previous_offset= previous_offset;
+
+ if (data[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION)
+ {
+ translog_put_sector_protection(data, &log_descriptor.bc);
+ if (left)
+ {
+ log_descriptor.bc.write_counter++;
+ log_descriptor.bc.previous_offset= current_page_fill;
+ }
+ else
+ {
+ DBUG_PRINT("info", ("drop write_counter"));
+ log_descriptor.bc.write_counter= 0;
+ log_descriptor.bc.previous_offset= 0;
+ }
+ }
+
+ if (data[TRANSLOG_PAGE_FLAGS] & TRANSLOG_PAGE_CRC)
+ {
+ uint32 crc= translog_crc(data + log_descriptor.page_overhead,
+ TRANSLOG_PAGE_SIZE -
+ log_descriptor.page_overhead);
+ DBUG_PRINT("info", ("CRC: 0x%lx", (ulong) crc));
+ int4store(data + 3 + 3 + 1, crc);
+ }
+
+ if (left)
+ {
+ /*
+ TODO: do not copy begining of the page if we have no CRC or sector
+ checks on
+ */
+ memcpy(new_buffer->buffer, data, current_page_fill);
+ log_descriptor.bc.ptr+= current_page_fill;
+ log_descriptor.bc.buffer->size= log_descriptor.bc.current_page_fill=
+ current_page_fill;
+ new_buffer->overlay= old_buffer;
+ }
+ else
+ translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc);
+ old_buffer->next_buffer_offset= new_buffer->offset;
+
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief Flush the log up to given LSN (included)
+
+ @param lsn log record serial number up to which (inclusive)
+ the log has to be flushed
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+
+ @todo LOG: when a log write fails, we should not write to this log anymore
+ (if we add more log records to this log they will be unreadable: we will hit
+ the broken log record): all translog_flush() should be made to fail (because
+ translog_flush() is when a a transaction wants something durable and we
+ cannot make anything durable as log is corrupted). For that, a "my_bool
+ st_translog_descriptor::write_error" could be set to 1 when a
+ translog_write_record() or translog_flush() fails, and translog_flush()
+ would test this var (and translog_write_record() could also test this var if
+ it wants, though it's not absolutely needed).
+ Then, either shut Maria down immediately, or switch to a new log (but if we
+ get write error after write error, that would create too many logs).
+ A popular open-source transactional engine intentionally crashes as soon as
+ a log flush fails (we however don't want to crash the entire mysqld, but
+ stopping all engine's operations immediately would make sense).
+ Same applies to translog_write_record().
+
+ @todo: remove serialization and make group commit.
+*/
+
+my_bool translog_flush(LSN lsn)
+{
+ LSN old_flushed, sent_to_file;
+ int rc= 0;
+ uint i;
+ my_bool full_circle= 0;
+ DBUG_ENTER("translog_flush");
+ DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+ DBUG_ASSERT(translog_inited == 1);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ translog_lock();
+ old_flushed= log_descriptor.flushed;
+ for (;;)
+ {
+ uint16 buffer_no= log_descriptor.bc.buffer_no;
+ uint16 buffer_start= buffer_no;
+ struct st_translog_buffer *buffer_unlock= log_descriptor.bc.buffer;
+ struct st_translog_buffer *buffer= log_descriptor.bc.buffer;
+ /* we can't flush in future */
+ DBUG_ASSERT(cmp_translog_addr(log_descriptor.horizon, lsn) >= 0);
+ if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
+ {
+ DBUG_PRINT("info", ("already flushed: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.flushed)));
+ goto out;
+ }
+ /* send to the file if it is not sent */
+ sent_to_file= translog_get_sent_to_file();
+ if (cmp_translog_addr(sent_to_file, lsn) >= 0)
+ break;
+
+ do
+ {
+ buffer_no= (buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+ buffer= log_descriptor.buffers + buffer_no;
+ translog_buffer_lock(buffer);
+ translog_buffer_unlock(buffer_unlock);
+ buffer_unlock= buffer;
+ if (buffer->file != -1)
+ {
+ buffer_unlock= NULL;
+ if (buffer_start == buffer_no)
+ {
+ /* we made a circle */
+ full_circle= 1;
+ translog_force_current_buffer_to_finish();
+ }
+ break;
+ }
+ } while ((buffer_start != buffer_no) &&
+ cmp_translog_addr(log_descriptor.flushed, lsn) < 0);
+ if (buffer_unlock != NULL && buffer_unlock != buffer)
+ translog_buffer_unlock(buffer_unlock);
+ rc= translog_buffer_flush(buffer);
+ translog_buffer_unlock(buffer);
+ if (rc)
+ {
+ rc= 1;
+ goto out;
+ }
+ if (!full_circle)
+ translog_lock();
+ }
+
+ for (i= LSN_FILE_NO(old_flushed); i <= LSN_FILE_NO(lsn); i++)
+ {
+ uint cache_index;
+ File file;
+
+ if ((cache_index= LSN_FILE_NO(log_descriptor.horizon) - i) <
+ OPENED_FILES_NUM)
+ {
+ /* file in the cache */
+ if (log_descriptor.log_file_num[cache_index] == -1)
+ {
+ if ((log_descriptor.log_file_num[cache_index]=
+ open_logfile_by_number_no_cache(i)) == -1)
+ {
+ rc= 1;
+ goto out;
+ }
+ }
+ file= log_descriptor.log_file_num[cache_index];
+ rc|= my_sync(file, MYF(MY_WME));
+ }
+ /* We sync file when we are closing it => do nothing if file closed */
+ }
+ log_descriptor.flushed= sent_to_file;
+ /** @todo LOG decide if syncing of directory is needed */
+ rc|= my_sync(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+out:
+ translog_unlock();
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(rc);
+}
+
+
+/**
+ @brief Sets transaction's rec_lsn if needed
+
+ A transaction sometimes writes a REDO even before the page is in the
+ pagecache (example: brand new head or tail pages; full pages). So, if
+ Checkpoint happens just after the REDO write, it needs to know that the
+ REDO phase must start before this REDO. Scanning the pagecache cannot
+ tell that as the page is not in the cache. So, transaction sets its rec_lsn
+ to the REDO's LSN or somewhere before, and Checkpoint reads the
+ transaction's rec_lsn.
+
+ @todo move it to a separate file
+
+ @return Operation status, always 0 (success)
+*/
+
+static my_bool write_hook_for_redo(enum translog_record_type type
+ __attribute__ ((unused)),
+ TRN *trn, MARIA_HA *tbl_info
+ __attribute__ ((unused)),
+ LSN *lsn,
+ struct st_translog_parts *parts
+ __attribute__ ((unused)))
+{
+ /*
+ Users of dummy_transaction_object must keep this TRN clean as it
+ is used by many threads (like those manipulating non-transactional
+ tables). It might be dangerous if one user sets rec_lsn or some other
+ member and it is picked up by another user (like putting this rec_lsn into
+ a page of a non-transactional table); it's safer if all members stay 0. So
+ non-transactional log records (REPAIR, CREATE, RENAME, DROP) should not
+ call this hook; we trust them but verify ;)
+ */
+ DBUG_ASSERT(trn->trid != 0);
+ /*
+ If the hook stays so simple, it would be faster to pass
+ !trn->rec_lsn ? trn->rec_lsn : some_dummy_lsn
+ to translog_write_record(), like Monty did in his original code, and not
+ have a hook. For now we keep it like this.
+ */
+ if (trn->rec_lsn == 0)
+ trn->rec_lsn= *lsn;
+ return 0;
+}
+
+
+/**
+ @brief Sets transaction's undo_lsn, first_undo_lsn if needed
+
+ @todo move it to a separate file
+
+ @return Operation status, always 0 (success)
+*/
+
+static my_bool write_hook_for_undo(enum translog_record_type type
+ __attribute__ ((unused)),
+ TRN *trn, MARIA_HA *tbl_info
+ __attribute__ ((unused)),
+ LSN *lsn,
+ struct st_translog_parts *parts
+ __attribute__ ((unused)))
+{
+ DBUG_ASSERT(trn->trid != 0);
+ trn->undo_lsn= *lsn;
+ if (unlikely(LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn) == 0))
+ trn->first_undo_lsn=
+ trn->undo_lsn | LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn);
+ return 0;
+ /*
+ when we implement purging, we will specialize this hook: UNDO_PURGE
+ records will additionally set trn->undo_purge_lsn
+ */
+}
+
+
+/**
+ @brief Sets the table's records count to 0, then calls the generic REDO
+ hook.
+
+ @todo move it to a separate file
+
+ @return Operation status, always 0 (success)
+*/
+
+static my_bool write_hook_for_redo_delete_all(enum translog_record_type type
+ __attribute__ ((unused)),
+ TRN *trn, MARIA_HA *tbl_info
+ __attribute__ ((unused)),
+ LSN *lsn,
+ struct st_translog_parts *parts
+ __attribute__ ((unused)))
+{
+ tbl_info->s->state.state.records= 0;
+ return write_hook_for_redo(type, trn, tbl_info, lsn, parts);
+}
+
+
+/**
+ @brief Upates "records" and calls the generic UNDO hook
+
+ @todo move it to a separate file
+
+ @return Operation status, always 0 (success)
+*/
+
+static my_bool write_hook_for_undo_row_insert(enum translog_record_type type
+ __attribute__ ((unused)),
+ TRN *trn, MARIA_HA *tbl_info,
+ LSN *lsn,
+ struct st_translog_parts *parts
+ __attribute__ ((unused)))
+{
+ tbl_info->s->state.state.records++;
+ return write_hook_for_undo(type, trn, tbl_info, lsn, parts);
+}
+
+
+/**
+ @brief Upates "records" and calls the generic UNDO hook
+
+ @todo move it to a separate file
+
+ @return Operation status, always 0 (success)
+*/
+
+static my_bool write_hook_for_undo_row_delete(enum translog_record_type type
+ __attribute__ ((unused)),
+ TRN *trn, MARIA_HA *tbl_info,
+ LSN *lsn,
+ struct st_translog_parts *parts
+ __attribute__ ((unused)))
+{
+ tbl_info->s->state.state.records--;
+ return write_hook_for_undo(type, trn, tbl_info, lsn, parts);
+}
+
+
+/**
+ @brief Sets transaction's undo_lsn, first_undo_lsn if needed
+
+ @todo move it to a separate file
+
+ @return Operation status, always 0 (success)
+*/
+
+static my_bool write_hook_for_clr_end(enum translog_record_type type
+ __attribute__ ((unused)),
+ TRN *trn, MARIA_HA *tbl_info
+ __attribute__ ((unused)),
+ LSN *lsn
+ __attribute__ ((unused)),
+ struct st_translog_parts *parts)
+{
+ char *ptr= parts->parts[TRANSLOG_INTERNAL_PARTS + 0].str;
+ enum translog_record_type undone_record_type=
+ ptr[LSN_STORE_SIZE + FILEID_STORE_SIZE];
+
+ DBUG_ASSERT(trn->trid != 0);
+ trn->undo_lsn= lsn_korr(ptr);
+ switch (undone_record_type) {
+ case LOGREC_UNDO_ROW_DELETE:
+ tbl_info->s->state.state.records++;
+ break;
+ case LOGREC_UNDO_ROW_INSERT:
+ tbl_info->s->state.state.records--;
+ break;
+ case LOGREC_UNDO_ROW_UPDATE:
+ break;
+ default:
+ DBUG_ASSERT(0);
+ }
+ if (trn->undo_lsn == LSN_IMPOSSIBLE) /* has fully rolled back */
+ trn->first_undo_lsn= LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn);
+ return 0;
+}
+
+
+/**
+ @brief Updates table's lsn_of_file_id.
+
+ @todo move it to a separate file
+
+ @return Operation status, always 0 (success)
+*/
+
+static my_bool write_hook_for_file_id(enum translog_record_type type
+ __attribute__ ((unused)),
+ TRN *trn
+ __attribute__ ((unused)),
+ MARIA_HA *tbl_info,
+ LSN *lsn
+ __attribute__ ((unused)),
+ struct st_translog_parts *parts
+ __attribute__ ((unused)))
+{
+ DBUG_ASSERT(cmp_translog_addr(tbl_info->s->lsn_of_file_id, *lsn) < 0);
+ tbl_info->s->lsn_of_file_id= *lsn;
+ return 0;
+}
+
+
+/**
+ @brief Gives a 2-byte-id to MARIA_SHARE and logs this fact
+
+ If a MARIA_SHARE does not yet have a 2-byte-id (unique over all currently
+ open MARIA_SHAREs), give it one and record this assignment in the log
+ (LOGREC_FILE_ID log record).
+
+ @param tbl_info table
+ @param trn calling transaction
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+
+ @note Can be called even if share already has an id (then will do nothing)
+*/
+
+int translog_assign_id_to_share(MARIA_HA *tbl_info, TRN *trn)
+{
+ MARIA_SHARE *share= tbl_info->s;
+ /*
+ If you give an id to a non-BLOCK_RECORD table, you also need to release
+ this id somewhere. Then you can change the assertion.
+ */
+ DBUG_ASSERT(share->data_file_type == BLOCK_RECORD);
+ /* re-check under mutex to avoid having 2 ids for the same share */
+ pthread_mutex_lock(&share->intern_lock);
+ if (likely(share->id == 0))
+ {
+ /* Inspired by set_short_trid() of trnman.c */
+ uint i= share->kfile.file % SHARE_ID_MAX + 1;
+ do
+ {
+ my_atomic_rwlock_wrlock(&LOCK_id_to_share);
+ for ( ; i <= SHARE_ID_MAX ; i++) /* the range is [1..SHARE_ID_MAX] */
+ {
+ void *tmp= NULL;
+ if (id_to_share[i] == NULL &&
+ my_atomic_casptr((void **)&id_to_share[i], &tmp, share))
+ {
+ share->id= (uint16)i;
+ break;
+ }
+ }
+ my_atomic_rwlock_wrunlock(&LOCK_id_to_share);
+ i= 1; /* scan the whole array */
+ } while (share->id == 0);
+ DBUG_PRINT("info", ("id_to_share: 0x%lx -> %u", (ulong)share, share->id));
+ LSN lsn;
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+ uchar log_data[FILEID_STORE_SIZE];
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+ /*
+ open_file_name is an unresolved name (symlinks are not resolved, datadir
+ is not realpath-ed, etc) which is good: the log can be moved to another
+ directory and continue working.
+ */
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= share->open_file_name;
+ /**
+ @todo if we had the name's length in MARIA_SHARE we could avoid this
+ strlen()
+ */
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length=
+ strlen(share->open_file_name) + 1;
+ if (unlikely(translog_write_record(&lsn, LOGREC_FILE_ID, trn, tbl_info,
+ sizeof(log_data) +
+ log_array[TRANSLOG_INTERNAL_PARTS +
+ 1].length,
+ sizeof(log_array)/sizeof(log_array[0]),
+ log_array, log_data)))
+ return 1;
+ }
+ pthread_mutex_unlock(&share->intern_lock);
+ return 0;
+}
+
+
+/**
+ @brief Recycles a MARIA_SHARE's short id.
+
+ @param share table
+
+ @note Must be called only if share has an id (i.e. id != 0)
+*/
+
+void translog_deassign_id_from_share(MARIA_SHARE *share)
+{
+ DBUG_PRINT("info", ("id_to_share: 0x%lx id %u -> 0",
+ (ulong)share, share->id));
+ /*
+ We don't need any mutex as we are called only when closing the last
+ instance of the table or at the end of REPAIR: no writes can be
+ happening. But a Checkpoint may be reading share->id, so we require this
+ mutex:
+ */
+ safe_mutex_assert_owner(&share->intern_lock);
+ my_atomic_rwlock_rdlock(&LOCK_id_to_share);
+ my_atomic_storeptr((void **)&id_to_share[share->id], 0);
+ my_atomic_rwlock_rdunlock(&LOCK_id_to_share);
+ share->id= 0;
+ /* useless but safety: */
+ share->lsn_of_file_id= LSN_IMPOSSIBLE;
+}
+
+
+void translog_assign_id_to_share_from_recovery(MARIA_SHARE *share,
+ uint16 id)
+{
+ DBUG_ASSERT(maria_in_recovery && !maria_multi_threaded);
+ DBUG_ASSERT(share->data_file_type == BLOCK_RECORD);
+ DBUG_ASSERT(share->id == 0);
+ DBUG_ASSERT(id_to_share[id] == NULL);
+ id_to_share[share->id= id]= share;
+}
+
+
+/**
+ @brief check if such log file exists
+
+ @param file_no number of the file to test
+
+ @retval 0 no such file
+ @retval 1 there is file with such number
+*/
+
+my_bool translog_is_file(uint file_no)
+{
+ MY_STAT stat_buff;
+ char path[FN_REFLEN];
+ return (test(my_stat(translog_filename_by_fileno(file_no, path),
+ &stat_buff, MYF(0))));
+}
+
+
+/**
+ @brief returns minimum log file number
+
+ @param horizon the end of the log
+ @param is_protected true if it is under purge_log protection
+
+ @retval minimum file number
+ @retval 0 no files found
+*/
+
+static uint32 translog_first_file(TRANSLOG_ADDRESS horizon, int is_protected)
+{
+ uint min_file= 1, max_file;
+ DBUG_ENTER("translog_first_file");
+ if (!is_protected)
+ pthread_mutex_lock(&log_descriptor.purger_lock);
+ if (log_descriptor.min_file_number &&
+ translog_is_file(log_descriptor.min_file_number))
+ {
+ DBUG_PRINT("info", ("cached %lu",
+ (ulong) log_descriptor.min_file_number));
+ if (!is_protected)
+ pthread_mutex_unlock(&log_descriptor.purger_lock);
+ DBUG_RETURN(log_descriptor.min_file_number);
+ }
+
+ max_file= LSN_FILE_NO(horizon);
+
+ if (MAKE_LSN(1, TRANSLOG_PAGE_SIZE) >= horizon)
+ {
+ /* there is no first page yet */
+ DBUG_RETURN(0);
+ }
+
+ /* binary search for last file */
+ while (min_file != max_file && min_file != (max_file - 1))
+ {
+ uint test= (min_file + max_file) / 2;
+ DBUG_PRINT("info", ("min_file: %u test: %u max_file: %u",
+ min_file, test, max_file));
+ if (test == max_file)
+ test--;
+ if (translog_is_file(test))
+ max_file= test;
+ else
+ min_file= test;
+ }
+ log_descriptor.min_file_number= max_file;
+ if (!is_protected)
+ pthread_mutex_unlock(&log_descriptor.purger_lock);
+ DBUG_RETURN(max_file);
+}
+
+
+/**
+ @brief returns the most close LSN higher the given chunk address
+
+ @param addr the chunk address to start from
+ @param horizon the horizon if it is known or LSN_IMPOSSIBLE
+
+ @retval LSN_ERROR Error
+ @retval LSN_IMPOSSIBLE no LSNs after the address
+ @retval # LSN of the most close LSN higher the given chunk address
+*/
+
+LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon)
+{
+ uint chunk_type;
+ TRANSLOG_SCANNER_DATA scanner;
+ DBUG_ENTER("translog_next_LSN");
+
+ if (horizon == LSN_IMPOSSIBLE)
+ horizon= translog_get_horizon();
+
+ if (addr == horizon)
+ DBUG_RETURN(LSN_IMPOSSIBLE);
+
+ translog_init_scanner(addr, 0, &scanner);
+
+ chunk_type= scanner.page[scanner.page_offset] & TRANSLOG_CHUNK_TYPE;
+ DBUG_PRINT("info", ("type: %x byte: %x", (uint) chunk_type,
+ (uint) scanner.page[scanner.page_offset]));
+ while (chunk_type != TRANSLOG_CHUNK_LSN &&
+ chunk_type != TRANSLOG_CHUNK_FIXED &&
+ scanner.page[scanner.page_offset] != 0)
+ {
+ if (translog_get_next_chunk(&scanner))
+ DBUG_RETURN(LSN_ERROR);
+ chunk_type= scanner.page[scanner.page_offset] & TRANSLOG_CHUNK_TYPE;
+ DBUG_PRINT("info", ("type: %x byte: %x", (uint) chunk_type,
+ (uint) scanner.page[scanner.page_offset]));
+ }
+ if (scanner.page[scanner.page_offset] == 0)
+ DBUG_RETURN(LSN_IMPOSSIBLE); /* reached page filler */
+ DBUG_RETURN(scanner.page_addr + scanner.page_offset);
+}
+
+/**
+ @brief returns the LSN of the first record starting in this log
+
+ @retval LSN_ERROR Error
+ @retval LSN_IMPOSSIBLE no log or the log is empty
+ @retval # LSN of the first record
+*/
+
+LSN translog_first_lsn_in_log()
+{
+ TRANSLOG_ADDRESS addr, horizon= translog_get_horizon();
+ TRANSLOG_VALIDATOR_DATA data;
+ uint file;
+ uint16 chunk_offset;
+ uchar *page;
+ DBUG_ENTER("translog_first_lsn_in_log");
+ DBUG_PRINT("info", ("Horizon: (%lu,0x%lx)", LSN_IN_PARTS(addr)));
+ DBUG_ASSERT(translog_inited == 1);
+
+ if (!(file= translog_first_file(horizon, 0)))
+ {
+ /* log has no records yet */
+ DBUG_RETURN(LSN_IMPOSSIBLE);
+ }
+
+ addr= MAKE_LSN(file, TRANSLOG_PAGE_SIZE); /* the first page of the file */
+ data.addr= &addr;
+ {
+ uchar buffer[TRANSLOG_PAGE_SIZE];
+ if ((page= translog_get_page(&data, buffer)) == NULL ||
+ (chunk_offset= translog_get_first_chunk_offset(page)) == 0)
+ DBUG_RETURN(LSN_ERROR);
+ }
+ addr+= chunk_offset;
+
+ DBUG_RETURN(translog_next_LSN(addr, horizon));
+}
+
+
+/**
+ @brief returns theoretical first LSN if first log is present
+
+ @retval LSN_ERROR Error
+ @retval LSN_IMPOSSIBLE no log
+ @retval # LSN of the first record
+*/
+
+LSN translog_first_theoretical_lsn()
+{
+ TRANSLOG_ADDRESS addr= translog_get_horizon();
+ uchar buffer[TRANSLOG_PAGE_SIZE], *page;
+ TRANSLOG_VALIDATOR_DATA data;
+ DBUG_ENTER("translog_first_theoretical_lsn");
+ DBUG_PRINT("info", ("Horizon: (%lu,0x%lx)", LSN_IN_PARTS(addr)));
+ DBUG_ASSERT(translog_inited == 1);
+
+ if (!translog_is_file(1))
+ DBUG_RETURN(LSN_IMPOSSIBLE);
+ if (addr == MAKE_LSN(1, TRANSLOG_PAGE_SIZE))
+ {
+ /* log has no records yet */
+ DBUG_RETURN(MAKE_LSN(1, TRANSLOG_PAGE_SIZE +
+ log_descriptor.page_overhead));
+ }
+
+ addr= MAKE_LSN(1, TRANSLOG_PAGE_SIZE); /* the first page of the file */
+ data.addr= &addr;
+ if ((page= translog_get_page(&data, buffer)) == NULL)
+ DBUG_RETURN(LSN_ERROR);
+
+ DBUG_RETURN(MAKE_LSN(1, TRANSLOG_PAGE_SIZE +
+ page_overhead[page[TRANSLOG_PAGE_FLAGS]]));
+}
+
+
+/**
+ @brief Check given low water mark and purge files if it is need
+
+ @param low the last (minimum) address which is need
+
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool translog_purge(TRANSLOG_ADDRESS low)
+{
+ uint32 last_need_file= LSN_FILE_NO(low);
+ TRANSLOG_ADDRESS horizon= translog_get_horizon();
+ int rc= 0;
+ DBUG_ENTER("translog_purge");
+ DBUG_PRINT("enter", ("low: (%lu,0x%lx)", LSN_IN_PARTS(low)));
+ DBUG_ASSERT(translog_inited == 1);
+
+ pthread_mutex_lock(&log_descriptor.purger_lock);
+ if (LSN_FILE_NO(log_descriptor.last_lsn_checked) < last_need_file)
+ {
+ uint32 i;
+ uint32 min_file= translog_first_file(horizon, 1);
+ DBUG_ASSERT(min_file != 0); /* log is already started */
+
+ for(i= min_file; i < last_need_file && rc == 0; i++)
+ {
+ LSN lsn= translog_get_file_max_lsn_stored(i);
+ if (lsn == LSN_IMPOSSIBLE)
+ break; /* files are still in writing */
+ if (lsn == LSN_ERROR)
+ {
+ rc= 1;
+ break;
+ }
+ if (cmp_translog_addr(lsn, low) >= 0)
+ break;
+ DBUG_PRINT("info", ("purge file %lu", (ulong) i));
+ {
+ char path[FN_REFLEN], *file_name;
+ file_name= translog_filename_by_fileno(i, path);
+ rc= test(my_delete(file_name, MYF(MY_WME)));
+ }
+ }
+ }
+
+ pthread_mutex_unlock(&log_descriptor.purger_lock);
+ DBUG_RETURN(rc);
+}
diff --git a/storage/maria/ma_loghandler.h b/storage/maria/ma_loghandler.h
new file mode 100644
index 00000000000..164ff013b10
--- /dev/null
+++ b/storage/maria/ma_loghandler.h
@@ -0,0 +1,364 @@
+/* Copyright (C) 2007 MySQL AB & Sanja Belkin
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _ma_loghandler_h
+#define _ma_loghandler_h
+
+/* transaction log default cache size (TODO: make it global variable) */
+#define TRANSLOG_PAGECACHE_SIZE 1024*1024*2
+/* transaction log default file size (TODO: make it global variable) */
+#define TRANSLOG_FILE_SIZE 1024*1024*1024
+/* transaction log default flags (TODO: make it global variable) */
+#define TRANSLOG_DEFAULT_FLAGS 0
+
+/* Transaction log flags */
+#define TRANSLOG_PAGE_CRC 1
+#define TRANSLOG_SECTOR_PROTECTION (1<<1)
+#define TRANSLOG_RECORD_CRC (1<<2)
+#define TRANSLOG_FLAGS_NUM ((TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION | \
+ TRANSLOG_RECORD_CRC) + 1)
+
+#define RECHEADER_READ_ERROR -1
+#define RECHEADER_READ_EOF -2
+
+/*
+ Page size in transaction log
+ It should be Power of 2 and multiple of DISK_DRIVE_SECTOR_SIZE
+ (DISK_DRIVE_SECTOR_SIZE * 2^N)
+*/
+#define TRANSLOG_PAGE_SIZE (8*1024)
+
+#include "ma_loghandler_lsn.h"
+#include "trnman_public.h"
+
+/* short transaction ID type */
+typedef uint16 SHORT_TRANSACTION_ID;
+
+struct st_maria_info;
+
+/* Length of CRC at end of pages */
+#define CRC_LENGTH 4
+/* Size of file id in logs */
+#define FILEID_STORE_SIZE 2
+/* Size of page reference in log */
+#define PAGE_STORE_SIZE ROW_EXTENT_PAGE_SIZE
+/* Size of page ranges in log */
+#define PAGERANGE_STORE_SIZE ROW_EXTENT_COUNT_SIZE
+#define DIRPOS_STORE_SIZE 1
+
+/* Store methods to match the above sizes */
+#define fileid_store(T,A) int2store(T,A)
+#define page_store(T,A) int5store(T,A)
+#define dirpos_store(T,A) ((*(uchar*) (T)) = A)
+#define pagerange_store(T,A) int2store(T,A)
+#define fileid_korr(P) uint2korr(P)
+#define page_korr(P) uint5korr(P)
+#define dirpos_korr(P) ((P)[0])
+#define pagerange_korr(P) uint2korr(P)
+
+/*
+ Length of disk drive sector size (we assume that writing it
+ to disk is atomic operation)
+*/
+#define DISK_DRIVE_SECTOR_SIZE 512
+
+/*
+ Number of empty entries we need to have in LEX_STRING for
+ translog_write_record()
+*/
+#define LOG_INTERNAL_PARTS 1
+
+/* position reserved in an array of parts of a log record */
+#define TRANSLOG_INTERNAL_PARTS 2
+
+/* types of records in the transaction log */
+/* Todo: Set numbers for these when we have all entries figured out */
+
+enum translog_record_type
+{
+ LOGREC_RESERVED_FOR_CHUNKS23= 0,
+ LOGREC_REDO_INSERT_ROW_HEAD,
+ LOGREC_REDO_INSERT_ROW_TAIL,
+ LOGREC_REDO_INSERT_ROW_BLOB,
+ LOGREC_REDO_INSERT_ROW_BLOBS,
+ LOGREC_REDO_PURGE_ROW_HEAD,
+ LOGREC_REDO_PURGE_ROW_TAIL,
+ LOGREC_REDO_PURGE_BLOCKS,
+ LOGREC_REDO_DELETE_ROW,
+ LOGREC_REDO_UPDATE_ROW_HEAD,
+ LOGREC_REDO_INDEX,
+ LOGREC_REDO_UNDELETE_ROW,
+ LOGREC_CLR_END,
+ LOGREC_PURGE_END,
+ LOGREC_UNDO_ROW_INSERT,
+ LOGREC_UNDO_ROW_DELETE,
+ LOGREC_UNDO_ROW_UPDATE,
+ LOGREC_UNDO_KEY_INSERT,
+ LOGREC_UNDO_KEY_DELETE,
+ LOGREC_PREPARE,
+ LOGREC_PREPARE_WITH_UNDO_PURGE,
+ LOGREC_COMMIT,
+ LOGREC_COMMIT_WITH_UNDO_PURGE,
+ LOGREC_CHECKPOINT,
+ LOGREC_REDO_CREATE_TABLE,
+ LOGREC_REDO_RENAME_TABLE,
+ LOGREC_REDO_DROP_TABLE,
+ LOGREC_REDO_DELETE_ALL,
+ LOGREC_REDO_REPAIR_TABLE,
+ LOGREC_FILE_ID,
+ LOGREC_LONG_TRANSACTION_ID,
+ LOGREC_RESERVED_FUTURE_EXTENSION= 63
+};
+#define LOGREC_NUMBER_OF_TYPES 64 /* Maximum, can't be extended */
+
+/* Size of log file; One log file is restricted to 4G */
+typedef uint32 translog_size_t;
+
+#define TRANSLOG_RECORD_HEADER_MAX_SIZE 1024
+
+typedef struct st_translog_group_descriptor
+{
+ TRANSLOG_ADDRESS addr;
+ uint8 num;
+} TRANSLOG_GROUP;
+
+
+typedef struct st_translog_header_buffer
+{
+ /* LSN of the read record */
+ LSN lsn;
+ /* array of groups descriptors, can be used only if groups_no > 0 */
+ TRANSLOG_GROUP *groups;
+ /* short transaction ID or 0 if it has no sense for the record */
+ SHORT_TRANSACTION_ID short_trid;
+ /*
+ The Record length in buffer (including read header, but excluding
+ hidden part of record (type, short TrID, length)
+ */
+ translog_size_t record_length;
+ /*
+ Buffer for write decoded header of the record (depend on the record
+ type)
+ */
+ uchar header[TRANSLOG_RECORD_HEADER_MAX_SIZE];
+ /* number of groups listed in */
+ uint groups_no;
+ /* in multi-group number of chunk0 pages (valid only if groups_no > 0) */
+ uint chunk0_pages;
+ /* type of the read record */
+ enum translog_record_type type;
+ /* chunk 0 data address (valid only if groups_no > 0) */
+ TRANSLOG_ADDRESS chunk0_data_addr;
+ /*
+ Real compressed LSN(s) size economy (<number of LSN(s)>*7 - <real_size>)
+ */
+ int16 compressed_LSN_economy;
+ /* short transaction ID or 0 if it has no sense for the record */
+ uint16 non_header_data_start_offset;
+ /* non read body data length in this first chunk */
+ uint16 non_header_data_len;
+ /* chunk 0 data size (valid only if groups_no > 0) */
+ uint16 chunk0_data_len;
+} TRANSLOG_HEADER_BUFFER;
+
+
+typedef struct st_translog_scanner_data
+{
+ uchar buffer[TRANSLOG_PAGE_SIZE]; /* buffer for page content */
+ TRANSLOG_ADDRESS page_addr; /* current page address */
+ /* end of the log which we saw last time */
+ TRANSLOG_ADDRESS horizon;
+ TRANSLOG_ADDRESS last_file_page; /* Last page on in this file */
+ uchar *page; /* page content pointer */
+ /* offset of the chunk in the page */
+ translog_size_t page_offset;
+ /* set horizon only once at init */
+ my_bool fixed_horizon;
+} TRANSLOG_SCANNER_DATA;
+
+
+struct st_translog_reader_data
+{
+ TRANSLOG_HEADER_BUFFER header; /* Header */
+ TRANSLOG_SCANNER_DATA scanner; /* chunks scanner */
+ translog_size_t body_offset; /* current chunk body offset */
+ /* data offset from the record beginning */
+ translog_size_t current_offset;
+ /* number of bytes read in header */
+ uint16 read_header;
+ uint16 chunk_size; /* current chunk size */
+ uint current_group; /* current group */
+ uint current_chunk; /* current chunk in the group */
+ my_bool eor; /* end of the record */
+};
+
+struct st_transaction;
+C_MODE_START
+
+/* Records types for unittests */
+#define LOGREC_FIXED_RECORD_0LSN_EXAMPLE 1
+#define LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE 2
+#define LOGREC_FIXED_RECORD_1LSN_EXAMPLE 3
+#define LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE 4
+#define LOGREC_FIXED_RECORD_2LSN_EXAMPLE 5
+#define LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE 6
+
+extern void example_loghandler_init();
+
+extern my_bool translog_init(const char *directory, uint32 log_file_max_size,
+ uint32 server_version, uint32 server_id,
+ PAGECACHE *pagecache, uint flags);
+
+extern my_bool
+translog_write_record(LSN *lsn, enum translog_record_type type,
+ struct st_transaction *trn,
+ struct st_maria_info *tbl_info,
+ translog_size_t rec_len, uint part_no,
+ LEX_STRING *parts_data, uchar *store_share_id);
+
+extern void translog_destroy();
+
+extern int translog_read_record_header(LSN lsn, TRANSLOG_HEADER_BUFFER *buff);
+
+extern void translog_free_record_header(TRANSLOG_HEADER_BUFFER *buff);
+
+extern translog_size_t translog_read_record(LSN lsn,
+ translog_size_t offset,
+ translog_size_t length,
+ uchar *buffer,
+ struct st_translog_reader_data
+ *data);
+
+extern my_bool translog_flush(LSN lsn);
+
+extern my_bool translog_init_scanner(LSN lsn,
+ my_bool fixed_horizon,
+ struct st_translog_scanner_data *scanner);
+
+extern int translog_read_next_record_header(TRANSLOG_SCANNER_DATA *scanner,
+ TRANSLOG_HEADER_BUFFER *buff);
+extern LSN translog_get_file_max_lsn_stored(uint32 file);
+extern my_bool translog_purge(TRANSLOG_ADDRESS low);
+extern my_bool translog_is_file(uint file_no);
+extern my_bool translog_lock();
+extern my_bool translog_unlock();
+extern void translog_lock_assert_owner();
+extern TRANSLOG_ADDRESS translog_get_horizon();
+extern TRANSLOG_ADDRESS translog_get_horizon_no_lock();
+extern int translog_assign_id_to_share(struct st_maria_info *tbl_info,
+ struct st_transaction *trn);
+extern void translog_deassign_id_from_share(struct st_maria_share *share);
+extern void
+translog_assign_id_to_share_from_recovery(struct st_maria_share *share,
+ uint16 id);
+extern my_bool translog_inited;
+
+/*
+ all the rest added because of recovery; should we make
+ ma_loghandler_for_recovery.h ?
+*/
+
+#define SHARE_ID_MAX 65535 /* array's size */
+
+extern LSN translog_first_lsn_in_log();
+extern LSN translog_first_theoretical_lsn();
+extern LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon);
+
+/* record parts descriptor */
+struct st_translog_parts
+{
+ /* full record length */
+ translog_size_t record_length;
+ /* full record length with chunk headers */
+ translog_size_t total_record_length;
+ /* current part index */
+ uint current;
+ /* total number of elements in parts */
+ uint elements;
+ /* array of parts (LEX_STRING) */
+ LEX_STRING *parts;
+};
+
+typedef my_bool(*prewrite_rec_hook) (enum translog_record_type type,
+ TRN *trn, struct st_maria_info *tbl_info,
+ struct st_translog_parts *parts);
+
+typedef my_bool(*inwrite_rec_hook) (enum translog_record_type type,
+ TRN *trn, struct st_maria_info *tbl_info,
+ LSN *lsn,
+ struct st_translog_parts *parts);
+
+typedef uint16(*read_rec_hook) (enum translog_record_type type,
+ uint16 read_length, uchar *read_buff,
+ uchar *decoded_buff);
+
+
+/* record classes */
+enum record_class
+{
+ LOGRECTYPE_NOT_ALLOWED,
+ LOGRECTYPE_VARIABLE_LENGTH,
+ LOGRECTYPE_PSEUDOFIXEDLENGTH,
+ LOGRECTYPE_FIXEDLENGTH
+};
+
+/* C++ can't bear that a variable's name is "class" */
+#ifndef __cplusplus
+
+enum enum_record_in_group {
+ LOGREC_NOT_LAST_IN_GROUP= 0, LOGREC_LAST_IN_GROUP, LOGREC_IS_GROUP_ITSELF
+};
+
+/*
+ Descriptor of log record type
+ Note: Don't reorder because of constructs later...
+*/
+typedef struct st_log_record_type_descriptor
+{
+ /* internal class of the record */
+ enum record_class class;
+ /*
+ length for fixed-size record, pseudo-fixed record
+ length with uncompressed LSNs
+ */
+ uint16 fixed_length;
+ /* how much record body (belonged to headers too) read with headers */
+ uint16 read_header_len;
+ /* HOOK for writing the record called before lock */
+ prewrite_rec_hook prewrite_hook;
+ /* HOOK for writing the record called when LSN is known, inside lock */
+ inwrite_rec_hook inwrite_hook;
+ /* HOOK for reading headers */
+ read_rec_hook read_hook;
+ /*
+ For pseudo fixed records number of compressed LSNs followed by
+ system header
+ */
+ int16 compressed_LSN;
+ /* the rest is for maria_read_log & Recovery */
+ /** @brief for debug error messages or "maria_read_log" command-line tool */
+ const char *name;
+ enum enum_record_in_group record_in_group;
+ /* a function to execute when we see the record during the REDO phase */
+ int (*record_execute_in_redo_phase)(const TRANSLOG_HEADER_BUFFER *);
+ /* a function to execute when we see the record during the UNDO phase */
+ int (*record_execute_in_undo_phase)(const TRANSLOG_HEADER_BUFFER *, TRN *);
+} LOG_DESC;
+
+extern LOG_DESC log_record_type_descriptor[LOGREC_NUMBER_OF_TYPES];
+#endif
+
+C_MODE_END
+#endif
diff --git a/storage/maria/ma_loghandler_lsn.h b/storage/maria/ma_loghandler_lsn.h
new file mode 100644
index 00000000000..e019be16fd2
--- /dev/null
+++ b/storage/maria/ma_loghandler_lsn.h
@@ -0,0 +1,100 @@
+/* Copyright (C) 2007 MySQL AB & Sanja Belkin
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _ma_loghandler_lsn_h
+#define _ma_loghandler_lsn_h
+
+/*
+ Transaction log record address:
+ file_no << 32 | offset
+ file_no is only 3 bytes so we can use signed integer to make
+ comparison more simple.
+*/
+typedef int64 TRANSLOG_ADDRESS;
+
+/*
+ Compare addresses
+ A1 > A2 -> result > 0
+ A1 == A2 -> 0
+ A1 < A2 -> result < 0
+*/
+#define cmp_translog_addr(A1,A2) ((A1) - (A2))
+
+/* LSN type (address of certain log record chank */
+typedef TRANSLOG_ADDRESS LSN;
+
+/* Gets file number part of a LSN/log address */
+#define LSN_FILE_NO(L) ((L) >> 32)
+
+/* Gets raw file number part of a LSN/log address */
+#define LSN_FILE_NO_PART(L) ((L) & ((int64)0xFFFFFF00000000LL))
+
+/* Parts of LSN for printing */
+#define LSN_IN_PARTS(L) (ulong)LSN_FILE_NO(L),(ulong)LSN_OFFSET(L)
+
+/* Gets record offset of a LSN/log address */
+#define LSN_OFFSET(L) ((L) & 0xFFFFFFFFL)
+
+/* Makes lsn/log address from file number and record offset */
+#define MAKE_LSN(F,S) ((LSN) ((((uint64)(F)) << 32) | (S)))
+
+/* checks LSN */
+#define LSN_VALID(L) \
+ ((LSN_FILE_NO_PART(L) != FILENO_IMPOSSIBLE) && \
+ (LSN_OFFSET(L) != LOG_OFFSET_IMPOSSIBLE))
+
+/* size of stored LSN on a disk, don't change it! */
+#define LSN_STORE_SIZE 7
+
+/* Puts LSN into buffer (dst) */
+#define lsn_store(dst, lsn) \
+ do { \
+ int3store((dst), LSN_FILE_NO(lsn)); \
+ int4store((dst) + 3, LSN_OFFSET(lsn)); \
+ } while (0)
+
+/* Unpacks LSN from the buffer (P) */
+#define lsn_korr(P) MAKE_LSN(uint3korr(P), uint4korr((P) + 3))
+
+/* what we need to add to LSN to increase it on one file */
+#define LSN_ONE_FILE ((int64)0x100000000LL)
+
+#define LSN_REPLACE_OFFSET(L, S) (LSN_FILE_NO_PART(L) | (S))
+
+/*
+ an 8-byte type whose most significant uchar is used for "flags"; 7
+ other bytes are a LSN.
+*/
+typedef LSN LSN_WITH_FLAGS;
+#define LSN_WITH_FLAGS_TO_LSN(x) (x & ULL(0x00FFFFFFFFFFFFFF))
+#define LSN_WITH_FLAGS_TO_FLAGS(x) (x & ULL(0xFF00000000000000))
+
+#define FILENO_IMPOSSIBLE 0 /**< log file's numbering starts at 1 */
+#define LOG_OFFSET_IMPOSSIBLE 0 /**< log always has a header */
+#define LSN_IMPOSSIBLE 0
+/* following LSN also is impossible */
+#define LSN_ERROR 1
+
+/** @brief some impossible LSN serve as markers */
+#define LSN_REPAIRED_BY_MARIA_CHK ((LSN)2)
+
+/**
+ @brief the maximum valid LSN.
+ Unlike ULONGLONG_MAX, it can be safely used in comparison with valid LSNs
+ (ULONGLONG_MAX is too big for correctness of cmp_translog_address()).
+*/
+#define LSN_MAX (LSN)ULL(0x00FFFFFFFFFFFFFF)
+
+#endif
diff --git a/storage/maria/ma_open.c b/storage/maria/ma_open.c
new file mode 100644
index 00000000000..9b665cfb958
--- /dev/null
+++ b/storage/maria/ma_open.c
@@ -0,0 +1,1577 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* open a isam-database */
+
+#include "ma_fulltext.h"
+#include "ma_sp_defs.h"
+#include "ma_rt_index.h"
+#include "ma_blockrec.h"
+#include "trnman.h"
+#include <m_ctype.h>
+
+#if defined(MSDOS) || defined(__WIN__)
+#ifdef __WIN__
+#include <fcntl.h>
+#else
+#include <process.h> /* Prototype for getpid */
+#endif
+#endif
+#ifdef VMS
+#include "static.c"
+#endif
+
+static void setup_key_functions(MARIA_KEYDEF *keyinfo);
+static my_bool maria_scan_init_dummy(MARIA_HA *info);
+static void maria_scan_end_dummy(MARIA_HA *info);
+static my_bool maria_once_init_dummy(MARIA_SHARE *, File);
+static my_bool maria_once_end_dummy(MARIA_SHARE *);
+static uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base);
+static uchar *_ma_state_info_read(uchar *ptr, MARIA_STATE_INFO *state);
+
+#define get_next_element(to,pos,size) { memcpy((char*) to,pos,(size_t) size); \
+ pos+=size;}
+
+
+#define disk_pos_assert(pos, end_pos) \
+if (pos > end_pos) \
+{ \
+ my_errno=HA_ERR_CRASHED; \
+ goto err; \
+}
+
+
+/******************************************************************************
+** Return the shared struct if the table is already open.
+** In MySQL the server will handle version issues.
+******************************************************************************/
+
+MARIA_HA *_ma_test_if_reopen(char *filename)
+{
+ LIST *pos;
+
+ for (pos=maria_open_list ; pos ; pos=pos->next)
+ {
+ MARIA_HA *info=(MARIA_HA*) pos->data;
+ MARIA_SHARE *share=info->s;
+ if (!strcmp(share->unique_file_name,filename) && share->last_version)
+ return info;
+ }
+ return 0;
+}
+
+
+/*
+ Open a new instance of an already opened Maria table
+
+ SYNOPSIS
+ maria_clone_internal()
+ share Share of already open table
+ mode Mode of table (O_RDONLY | O_RDWR)
+ data_file Filedescriptor of data file to use < 0 if one should open
+ open it.
+
+ RETURN
+ # Maria handler
+ 0 Error
+*/
+
+
+static MARIA_HA *maria_clone_internal(MARIA_SHARE *share, int mode,
+ File data_file)
+{
+ int save_errno;
+ uint errpos;
+ MARIA_HA info,*m_info;
+ my_bitmap_map *changed_fields_bitmap;
+ DBUG_ENTER("maria_clone_internal");
+
+ errpos= 0;
+ bzero((uchar*) &info,sizeof(info));
+
+ if (mode == O_RDWR && share->mode == O_RDONLY)
+ {
+ my_errno=EACCES; /* Can't open in write mode */
+ goto err;
+ }
+ if (data_file >= 0)
+ info.dfile.file= data_file;
+ else if (_ma_open_datafile(&info, share, -1))
+ goto err;
+ errpos= 5;
+
+ /* alloc and set up private structure parts */
+ if (!my_multi_malloc(MY_WME,
+ &m_info,sizeof(MARIA_HA),
+ &info.blobs,sizeof(MARIA_BLOB)*share->base.blobs,
+ &info.buff,(share->base.max_key_block_length*2+
+ share->base.max_key_length),
+ &info.lastkey,share->base.max_key_length*3+1,
+ &info.first_mbr_key, share->base.max_key_length,
+ &info.maria_rtree_recursion_state,
+ share->have_rtree ? 1024 : 0,
+ &changed_fields_bitmap,
+ bitmap_buffer_size(share->base.fields),
+ NullS))
+ goto err;
+ errpos= 6;
+
+ memcpy(info.blobs,share->blobs,sizeof(MARIA_BLOB)*share->base.blobs);
+ info.lastkey2=info.lastkey+share->base.max_key_length;
+
+ info.s=share;
+ info.cur_row.lastpos= HA_OFFSET_ERROR;
+ info.update= (short) (HA_STATE_NEXT_FOUND+HA_STATE_PREV_FOUND);
+ info.opt_flag=READ_CHECK_USED;
+ info.this_unique= (ulong) info.dfile.file; /* Uniq number in process */
+ if (share->data_file_type == COMPRESSED_RECORD)
+ info.this_unique= share->state.unique;
+ info.this_loop=0; /* Update counter */
+ info.last_unique= share->state.unique;
+ info.last_loop= share->state.update_count;
+ info.lock_type=F_UNLCK;
+ info.quick_mode=0;
+ info.bulk_insert=0;
+ info.ft1_to_ft2=0;
+ info.errkey= -1;
+ info.page_changed=1;
+ info.keyread_buff= info.buff + share->base.max_key_block_length;
+ bitmap_init(&info.changed_fields, changed_fields_bitmap,
+ share->base.fields, 0);
+ if ((*share->init)(&info))
+ goto err;
+
+ pthread_mutex_lock(&share->intern_lock);
+ info.read_record= share->read_record;
+ share->reopen++;
+ share->write_flag=MYF(MY_NABP | MY_WAIT_IF_FULL);
+ if (share->options & HA_OPTION_READ_ONLY_DATA)
+ {
+ info.lock_type=F_RDLCK;
+ share->r_locks++;
+ share->tot_locks++;
+ }
+ if (share->options & HA_OPTION_TMP_TABLE)
+ {
+ share->temporary= share->delay_key_write= 1;
+
+ share->write_flag=MYF(MY_NABP);
+ share->w_locks++; /* We don't have to update status */
+ share->tot_locks++;
+ info.lock_type=F_WRLCK;
+ }
+ if ((share->options & HA_OPTION_DELAY_KEY_WRITE) &&
+ maria_delay_key_write)
+ share->delay_key_write=1;
+
+ info.state= &share->state.state; /* Change global values by default */
+ if (!share->base.born_transactional) /* but for transactional ones ... */
+ info.trn= &dummy_transaction_object; /* ... force crash if no trn given */
+ pthread_mutex_unlock(&share->intern_lock);
+
+ /* Allocate buffer for one record */
+ /* prerequisites: info->rec_buffer == 0 && info->rec_buff_size == 0 */
+ if (_ma_alloc_buffer(&info.rec_buff, &info.rec_buff_size,
+ share->base.default_rec_buff_size))
+ goto err;
+
+ bzero(info.rec_buff, share->base.default_rec_buff_size);
+
+ *m_info=info;
+#ifdef THREAD
+ thr_lock_data_init(&share->lock,&m_info->lock,(void*) m_info);
+#endif
+ m_info->open_list.data=(void*) m_info;
+ maria_open_list=list_add(maria_open_list,&m_info->open_list);
+
+ DBUG_RETURN(m_info);
+
+err:
+ save_errno=my_errno ? my_errno : HA_ERR_END_OF_FILE;
+ if ((save_errno == HA_ERR_CRASHED) ||
+ (save_errno == HA_ERR_CRASHED_ON_USAGE) ||
+ (save_errno == HA_ERR_CRASHED_ON_REPAIR))
+ _ma_report_error(save_errno, share->open_file_name);
+ switch (errpos) {
+ case 6:
+ (*share->end)(&info);
+ my_free((uchar*) m_info,MYF(0));
+ /* fall through */
+ case 5:
+ if (data_file < 0)
+ VOID(my_close(info.dfile.file, MYF(0)));
+ break;
+ }
+ my_errno=save_errno;
+ DBUG_RETURN (NULL);
+} /* maria_clone_internal */
+
+
+/* Make a clone of a maria table */
+
+MARIA_HA *maria_clone(MARIA_SHARE *share, int mode)
+{
+ MARIA_HA *new_info;
+ pthread_mutex_lock(&THR_LOCK_maria);
+ new_info= maria_clone_internal(share, mode,
+ share->data_file_type == BLOCK_RECORD ?
+ share->bitmap.file.file : -1);
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ return new_info;
+}
+
+
+/******************************************************************************
+ open a MARIA table
+
+ See my_base.h for the handle_locking argument
+ if handle_locking and HA_OPEN_ABORT_IF_CRASHED then abort if the table
+ is marked crashed or if we are not using locking and the table doesn't
+ have an open count of 0.
+******************************************************************************/
+
+MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
+{
+ int kfile,open_mode,save_errno;
+ uint i,j,len,errpos,head_length,base_pos,info_length,keys,
+ key_parts,unique_key_parts,fulltext_keys,uniques;
+ char name_buff[FN_REFLEN], org_name[FN_REFLEN], index_name[FN_REFLEN],
+ data_name[FN_REFLEN];
+ char *disk_cache, *disk_pos, *end_pos;
+ MARIA_HA info,*m_info,*old_info;
+ MARIA_SHARE share_buff,*share;
+ ulong rec_per_key_part[HA_MAX_POSSIBLE_KEY*HA_MAX_KEY_SEG];
+ my_off_t key_root[HA_MAX_POSSIBLE_KEY];
+ ulonglong max_key_file_length, max_data_file_length;
+ File data_file= -1;
+ DBUG_ENTER("maria_open");
+
+ LINT_INIT(m_info);
+ kfile= -1;
+ errpos= 0;
+ head_length=sizeof(share_buff.state.header);
+ bzero((uchar*) &info,sizeof(info));
+
+ my_realpath(name_buff, fn_format(org_name,name,"",MARIA_NAME_IEXT,
+ MY_UNPACK_FILENAME),MYF(0));
+ pthread_mutex_lock(&THR_LOCK_maria);
+ old_info= 0;
+ if ((open_flags & HA_OPEN_COPY) ||
+ !(old_info=_ma_test_if_reopen(name_buff)))
+ {
+ share= &share_buff;
+ bzero((uchar*) &share_buff,sizeof(share_buff));
+ share_buff.state.rec_per_key_part=rec_per_key_part;
+ share_buff.state.key_root=key_root;
+ share_buff.pagecache= multi_pagecache_search(name_buff, strlen(name_buff),
+ maria_pagecache);
+
+ DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_open",
+ if (strstr(name, "/t1"))
+ {
+ my_errno= HA_ERR_CRASHED;
+ goto err;
+ });
+ if ((kfile=my_open(name_buff,(open_mode=O_RDWR) | O_SHARE,MYF(0))) < 0)
+ {
+ if ((errno != EROFS && errno != EACCES) ||
+ mode != O_RDONLY ||
+ (kfile=my_open(name_buff,(open_mode=O_RDONLY) | O_SHARE,MYF(0))) < 0)
+ goto err;
+ }
+ share->mode=open_mode;
+ errpos= 1;
+ if (my_read(kfile,(char*) share->state.header.file_version,head_length,
+ MYF(MY_NABP)))
+ {
+ my_errno= HA_ERR_NOT_A_TABLE;
+ goto err;
+ }
+ if (memcmp((uchar*) share->state.header.file_version,
+ (uchar*) maria_file_magic, 4))
+ {
+ DBUG_PRINT("error",("Wrong header in %s",name_buff));
+ DBUG_DUMP("error_dump",(char*) share->state.header.file_version,
+ head_length);
+ my_errno=HA_ERR_NOT_A_TABLE;
+ goto err;
+ }
+ share->options= mi_uint2korr(share->state.header.options);
+ if (share->options &
+ ~(HA_OPTION_PACK_RECORD | HA_OPTION_PACK_KEYS |
+ HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA |
+ HA_OPTION_TEMP_COMPRESS_RECORD | HA_OPTION_CHECKSUM |
+ HA_OPTION_TMP_TABLE | HA_OPTION_DELAY_KEY_WRITE |
+ HA_OPTION_RELIES_ON_SQL_LAYER | HA_OPTION_NULL_FIELDS))
+ {
+ DBUG_PRINT("error",("wrong options: 0x%lx", share->options));
+ my_errno=HA_ERR_OLD_FILE;
+ goto err;
+ }
+ if ((share->options & HA_OPTION_RELIES_ON_SQL_LAYER) &&
+ ! (open_flags & HA_OPEN_FROM_SQL_LAYER))
+ {
+ DBUG_PRINT("error", ("table cannot be openned from non-sql layer"));
+ my_errno= HA_ERR_UNSUPPORTED;
+ goto err;
+ }
+ /* Don't call realpath() if the name can't be a link */
+ if (!strcmp(name_buff, org_name) ||
+ my_readlink(index_name, org_name, MYF(0)) == -1)
+ (void) strmov(index_name, org_name);
+ *strrchr(org_name, '.')= '\0';
+ (void) fn_format(data_name,org_name,"",MARIA_NAME_DEXT,
+ MY_APPEND_EXT|MY_UNPACK_FILENAME|MY_RESOLVE_SYMLINKS);
+
+ info_length=mi_uint2korr(share->state.header.header_length);
+ base_pos= mi_uint2korr(share->state.header.base_pos);
+ if (!(disk_cache=(char*) my_alloca(info_length+128)))
+ {
+ my_errno=ENOMEM;
+ goto err;
+ }
+ end_pos=disk_cache+info_length;
+ errpos= 2;
+
+ VOID(my_seek(kfile,0L,MY_SEEK_SET,MYF(0)));
+ errpos= 3;
+ if (my_read(kfile,disk_cache,info_length,MYF(MY_NABP)))
+ {
+ my_errno=HA_ERR_CRASHED;
+ goto err;
+ }
+ len=mi_uint2korr(share->state.header.state_info_length);
+ keys= (uint) share->state.header.keys;
+ uniques= (uint) share->state.header.uniques;
+ fulltext_keys= (uint) share->state.header.fulltext_keys;
+ key_parts= mi_uint2korr(share->state.header.key_parts);
+ unique_key_parts= mi_uint2korr(share->state.header.unique_key_parts);
+ if (len != MARIA_STATE_INFO_SIZE)
+ {
+ DBUG_PRINT("warning",
+ ("saved_state_info_length: %d state_info_length: %d",
+ len,MARIA_STATE_INFO_SIZE));
+ }
+ share->state_diff_length=len-MARIA_STATE_INFO_SIZE;
+
+ _ma_state_info_read(disk_cache, &share->state);
+ len= mi_uint2korr(share->state.header.base_info_length);
+ if (len != MARIA_BASE_INFO_SIZE)
+ {
+ DBUG_PRINT("warning",("saved_base_info_length: %d base_info_length: %d",
+ len,MARIA_BASE_INFO_SIZE));
+ }
+ disk_pos= _ma_base_info_read(disk_cache + base_pos, &share->base);
+ share->state.state_length=base_pos;
+
+ if (!(open_flags & HA_OPEN_FOR_REPAIR) &&
+ ((share->state.changed & STATE_CRASHED) ||
+ ((open_flags & HA_OPEN_ABORT_IF_CRASHED) &&
+ (my_disable_locking && share->state.open_count))))
+ {
+ DBUG_PRINT("error",("Table is marked as crashed. open_flags: %u "
+ "changed: %u open_count: %u !locking: %d",
+ open_flags, share->state.changed,
+ share->state.open_count, my_disable_locking));
+ my_errno=((share->state.changed & STATE_CRASHED_ON_REPAIR) ?
+ HA_ERR_CRASHED_ON_REPAIR : HA_ERR_CRASHED_ON_USAGE);
+ goto err;
+ }
+
+ /* sanity check */
+ if (share->base.keystart > 65535 || share->base.rec_reflength > 8)
+ {
+ my_errno=HA_ERR_CRASHED;
+ goto err;
+ }
+
+ key_parts+=fulltext_keys*FT_SEGS;
+ if (share->base.max_key_length > maria_max_key_length() ||
+ keys > MARIA_MAX_KEY || key_parts > MARIA_MAX_KEY * HA_MAX_KEY_SEG)
+ {
+ DBUG_PRINT("error",("Wrong key info: Max_key_length: %d keys: %d key_parts: %d", share->base.max_key_length, keys, key_parts));
+ my_errno=HA_ERR_UNSUPPORTED;
+ goto err;
+ }
+ /*
+ If page cache is not initialized, then assume we will create it
+ after the table is opened!
+ */
+ if (share->base.block_size != maria_block_size &&
+ share_buff.pagecache->inited != 0)
+ {
+ DBUG_PRINT("error", ("Wrong block size %u; Expected %u",
+ (uint) share->base.block_size,
+ (uint) maria_block_size));
+ my_errno=HA_ERR_UNSUPPORTED;
+ goto err;
+ }
+
+ /* Correct max_file_length based on length of sizeof(off_t) */
+ max_data_file_length=
+ (share->options & (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)) ?
+ (((ulonglong) 1 << (share->base.rec_reflength*8))-1) :
+ (_ma_safe_mul(share->base.pack_reclength,
+ (ulonglong) 1 << (share->base.rec_reflength*8))-1);
+
+ max_key_file_length=
+ _ma_safe_mul(MARIA_MIN_KEY_BLOCK_LENGTH,
+ ((ulonglong) 1 << (share->base.key_reflength*8))-1);
+#if SIZEOF_OFF_T == 4
+ set_if_smaller(max_data_file_length, INT_MAX32);
+ set_if_smaller(max_key_file_length, INT_MAX32);
+#endif
+ share->base.max_data_file_length=(my_off_t) max_data_file_length;
+ share->base.max_key_file_length=(my_off_t) max_key_file_length;
+
+ if (share->options & HA_OPTION_COMPRESS_RECORD)
+ share->base.max_key_length+=2; /* For safety */
+
+ if (!my_multi_malloc(MY_WME,
+ &share,sizeof(*share),
+ &share->state.rec_per_key_part,sizeof(long)*key_parts,
+ &share->keyinfo,keys*sizeof(MARIA_KEYDEF),
+ &share->uniqueinfo,uniques*sizeof(MARIA_UNIQUEDEF),
+ &share->keyparts,
+ (key_parts+unique_key_parts+keys+uniques) *
+ sizeof(HA_KEYSEG),
+ &share->columndef,
+ (share->base.fields+1)*sizeof(MARIA_COLUMNDEF),
+ &share->blobs,sizeof(MARIA_BLOB)*share->base.blobs,
+ &share->unique_file_name,strlen(name_buff)+1,
+ &share->index_file_name,strlen(index_name)+1,
+ &share->data_file_name,strlen(data_name)+1,
+ &share->open_file_name,strlen(name)+1,
+ &share->state.key_root,keys*sizeof(my_off_t),
+#ifdef THREAD
+ &share->key_root_lock,sizeof(rw_lock_t)*keys,
+#endif
+ &share->mmap_lock,sizeof(rw_lock_t),
+ NullS))
+ goto err;
+ errpos= 4;
+
+ *share=share_buff;
+ memcpy((char*) share->state.rec_per_key_part,
+ (char*) rec_per_key_part, sizeof(long)*key_parts);
+ memcpy((char*) share->state.key_root,
+ (char*) key_root, sizeof(my_off_t)*keys);
+ strmov(share->unique_file_name, name_buff);
+ share->unique_name_length= strlen(name_buff);
+ strmov(share->index_file_name, index_name);
+ strmov(share->data_file_name, data_name);
+ strmov(share->open_file_name, name);
+
+ share->block_size= share->base.block_size;
+ {
+ HA_KEYSEG *pos=share->keyparts;
+ for (i=0 ; i < keys ; i++)
+ {
+ share->keyinfo[i].share= share;
+ disk_pos=_ma_keydef_read(disk_pos, &share->keyinfo[i]);
+ disk_pos_assert(disk_pos + share->keyinfo[i].keysegs * HA_KEYSEG_SIZE,
+ end_pos);
+ if (share->keyinfo[i].key_alg == HA_KEY_ALG_RTREE)
+ share->have_rtree= 1;
+ share->keyinfo[i].seg=pos;
+ for (j=0 ; j < share->keyinfo[i].keysegs; j++,pos++)
+ {
+ disk_pos=_ma_keyseg_read(disk_pos, pos);
+ if (pos->type == HA_KEYTYPE_TEXT ||
+ pos->type == HA_KEYTYPE_VARTEXT1 ||
+ pos->type == HA_KEYTYPE_VARTEXT2)
+ {
+ if (!pos->language)
+ pos->charset=default_charset_info;
+ else if (!(pos->charset= get_charset(pos->language, MYF(MY_WME))))
+ {
+ my_errno=HA_ERR_UNKNOWN_CHARSET;
+ goto err;
+ }
+ }
+ else if (pos->type == HA_KEYTYPE_BINARY)
+ pos->charset= &my_charset_bin;
+ }
+ if (share->keyinfo[i].flag & HA_SPATIAL)
+ {
+#ifdef HAVE_SPATIAL
+ uint sp_segs=SPDIMS*2;
+ share->keyinfo[i].seg=pos-sp_segs;
+ share->keyinfo[i].keysegs--;
+#else
+ my_errno=HA_ERR_UNSUPPORTED;
+ goto err;
+#endif
+ }
+ else if (share->keyinfo[i].flag & HA_FULLTEXT)
+ {
+ if (!fulltext_keys)
+ { /* 4.0 compatibility code, to be removed in 5.0 */
+ share->keyinfo[i].seg=pos-FT_SEGS;
+ share->keyinfo[i].keysegs-=FT_SEGS;
+ }
+ else
+ {
+ uint k;
+ share->keyinfo[i].seg=pos;
+ for (k=0; k < FT_SEGS; k++)
+ {
+ *pos= ft_keysegs[k];
+ pos[0].language= pos[-1].language;
+ if (!(pos[0].charset= pos[-1].charset))
+ {
+ my_errno=HA_ERR_CRASHED;
+ goto err;
+ }
+ pos++;
+ }
+ }
+ if (!share->ft2_keyinfo.seg)
+ {
+ memcpy(& share->ft2_keyinfo, & share->keyinfo[i], sizeof(MARIA_KEYDEF));
+ share->ft2_keyinfo.keysegs=1;
+ share->ft2_keyinfo.flag=0;
+ share->ft2_keyinfo.keylength=
+ share->ft2_keyinfo.minlength=
+ share->ft2_keyinfo.maxlength=HA_FT_WLEN+share->base.rec_reflength;
+ share->ft2_keyinfo.seg=pos-1;
+ share->ft2_keyinfo.end=pos;
+ setup_key_functions(& share->ft2_keyinfo);
+ }
+ }
+ setup_key_functions(share->keyinfo+i);
+ share->keyinfo[i].end=pos;
+ pos->type=HA_KEYTYPE_END; /* End */
+ pos->length=share->base.rec_reflength;
+ pos->null_bit=0;
+ pos->flag=0; /* For purify */
+ pos++;
+ }
+ for (i=0 ; i < uniques ; i++)
+ {
+ disk_pos=_ma_uniquedef_read(disk_pos, &share->uniqueinfo[i]);
+ disk_pos_assert(disk_pos + share->uniqueinfo[i].keysegs *
+ HA_KEYSEG_SIZE, end_pos);
+ share->uniqueinfo[i].seg=pos;
+ for (j=0 ; j < share->uniqueinfo[i].keysegs; j++,pos++)
+ {
+ disk_pos=_ma_keyseg_read(disk_pos, pos);
+ if (pos->type == HA_KEYTYPE_TEXT ||
+ pos->type == HA_KEYTYPE_VARTEXT1 ||
+ pos->type == HA_KEYTYPE_VARTEXT2)
+ {
+ if (!pos->language)
+ pos->charset=default_charset_info;
+ else if (!(pos->charset= get_charset(pos->language, MYF(MY_WME))))
+ {
+ my_errno=HA_ERR_UNKNOWN_CHARSET;
+ goto err;
+ }
+ }
+ }
+ share->uniqueinfo[i].end=pos;
+ pos->type=HA_KEYTYPE_END; /* End */
+ pos->null_bit=0;
+ pos->flag=0;
+ pos++;
+ }
+ share->ftparsers= 0;
+ }
+ share->data_file_type= share->state.header.data_file_type;
+ share->base_length= (BASE_ROW_HEADER_SIZE +
+ share->base.is_nulls_extended +
+ share->base.null_bytes +
+ share->base.pack_bytes +
+ test(share->options & HA_OPTION_CHECKSUM));
+ if (open_flags & HA_OPEN_COPY)
+ {
+ /*
+ this instance will be a temporary one used just to create a data
+ file for REPAIR. Don't do logging. This base information will not go
+ to disk.
+ */
+ share->base.born_transactional= FALSE;
+ }
+ if (share->base.born_transactional)
+ {
+ share->page_type= PAGECACHE_LSN_PAGE;
+#ifdef ENABLE_WHEN_WE_HAVE_TRANS_ROW_ID /* QQ */
+ share->base_length+= TRANS_ROW_EXTRA_HEADER_SIZE;
+#endif
+ if (share->state.create_rename_lsn == LSN_REPAIRED_BY_MARIA_CHK)
+ {
+ /*
+ Was repaired with maria_chk, maybe later maria_pack-ed. Some sort of
+ import into the server. It starts its existence (from the point of
+ view of the server, including server's recovery) now.
+ */
+ if ((open_flags & HA_OPEN_FROM_SQL_LAYER) || maria_in_recovery)
+ _ma_update_create_rename_lsn_sub(share, translog_get_horizon(),
+ TRUE);
+ }
+ else if ((!LSN_VALID(share->state.create_rename_lsn) ||
+ !LSN_VALID(share->state.is_of_horizon) ||
+ (cmp_translog_addr(share->state.create_rename_lsn,
+ share->state.is_of_horizon) > 0)) &&
+ !(open_flags & HA_OPEN_FOR_REPAIR))
+ {
+ /*
+ If in Recovery, it will not work. If LSN is invalid and not
+ LSN_REPAIRED_BY_MARIA_CHK, header must be corrupted.
+ In both cases, must repair.
+ */
+ my_errno=((share->state.changed & STATE_CRASHED_ON_REPAIR) ?
+ HA_ERR_CRASHED_ON_REPAIR : HA_ERR_CRASHED_ON_USAGE);
+ goto err;
+ }
+ }
+ else
+ share->page_type= PAGECACHE_PLAIN_PAGE;
+ share->now_transactional= share->base.born_transactional;
+
+ share->base.default_rec_buff_size= max(share->base.pack_reclength,
+ share->base.max_key_length);
+ if (share->data_file_type == DYNAMIC_RECORD)
+ {
+ share->base.extra_rec_buff_size=
+ (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER) + MARIA_SPLIT_LENGTH +
+ MARIA_REC_BUFF_OFFSET);
+ share->base.default_rec_buff_size+= share->base.extra_rec_buff_size;
+ }
+ disk_pos_assert(disk_pos + share->base.fields *MARIA_COLUMNDEF_SIZE,
+ end_pos);
+ for (i= j= 0 ; i < share->base.fields ; i++)
+ {
+ disk_pos=_ma_columndef_read(disk_pos,&share->columndef[i]);
+ share->columndef[i].pack_type=0;
+ share->columndef[i].huff_tree=0;
+ if (share->columndef[i].type == (int) FIELD_BLOB)
+ {
+ share->blobs[j].pack_length=
+ share->columndef[i].length-portable_sizeof_char_ptr;;
+ share->blobs[j].offset= share->columndef[i].offset;
+ j++;
+ }
+ }
+ share->columndef[i].type=(int) FIELD_LAST; /* End marker */
+
+ if ((share->data_file_type == BLOCK_RECORD ||
+ share->data_file_type == COMPRESSED_RECORD))
+ {
+ if (_ma_open_datafile(&info, share, -1))
+ goto err;
+ data_file= info.dfile.file;
+ }
+ errpos= 5;
+
+ share->kfile.file= kfile;
+ share->this_process=(ulong) getpid();
+ share->last_process= share->state.process;
+ share->base.key_parts=key_parts;
+ share->base.all_key_parts=key_parts+unique_key_parts;
+ if (!(share->last_version=share->state.version))
+ share->last_version=1; /* Safety */
+ share->rec_reflength=share->base.rec_reflength; /* May be changed */
+ share->base.margin_key_file_length=(share->base.max_key_file_length -
+ (keys ? MARIA_INDEX_BLOCK_MARGIN *
+ share->block_size * keys : 0));
+ share->block_size= share->base.block_size;
+ my_afree((uchar*) disk_cache);
+ _ma_setup_functions(share);
+ if ((*share->once_init)(share, info.dfile.file))
+ goto err;
+ share->is_log_table= FALSE;
+ if (open_flags & HA_OPEN_TMP_TABLE)
+ share->options|= HA_OPTION_TMP_TABLE;
+ if (open_flags & HA_OPEN_DELAY_KEY_WRITE)
+ share->options|= HA_OPTION_DELAY_KEY_WRITE;
+ if (mode == O_RDONLY)
+ share->options|= HA_OPTION_READ_ONLY_DATA;
+
+#ifdef THREAD
+ thr_lock_init(&share->lock);
+ VOID(pthread_mutex_init(&share->intern_lock,MY_MUTEX_INIT_FAST));
+ for (i=0; i<keys; i++)
+ VOID(my_rwlock_init(&share->key_root_lock[i], NULL));
+ VOID(my_rwlock_init(&share->mmap_lock, NULL));
+ if (!thr_lock_inited)
+ {
+ /* Probably a single threaded program; Don't use concurrent inserts */
+ maria_concurrent_insert=0;
+ }
+ else if (maria_concurrent_insert)
+ {
+ share->concurrent_insert=
+ ((share->options & (HA_OPTION_READ_ONLY_DATA | HA_OPTION_TMP_TABLE |
+ HA_OPTION_COMPRESS_RECORD |
+ HA_OPTION_TEMP_COMPRESS_RECORD)) ||
+ (open_flags & HA_OPEN_TMP_TABLE) ||
+ share->data_file_type == BLOCK_RECORD ||
+ share->have_rtree) ? 0 : 1;
+ if (share->concurrent_insert)
+ {
+ share->lock.get_status=_ma_get_status;
+ share->lock.copy_status=_ma_copy_status;
+ /**
+ @todo RECOVERY
+ INSERT DELAYED and concurrent inserts are currently disabled for
+ transactional tables; when enabled again, we should re-evaluate
+ what problems the call to _ma_update_status() by
+ thr_reschedule_write_lock() can do (it may hurt Checkpoint as it
+ would be without intern_lock, and it modifies the state).
+ */
+ share->lock.update_status=_ma_update_status;
+ share->lock.restore_status=_ma_restore_status;
+ share->lock.check_status=_ma_check_status;
+ }
+ }
+#endif
+ /*
+ Memory mapping can only be requested after initializing intern_lock.
+ */
+ if (open_flags & HA_OPEN_MMAP)
+ {
+ info.s= share;
+ maria_extra(&info, HA_EXTRA_MMAP, 0);
+ }
+ }
+ else
+ {
+ share= old_info->s;
+ if (share->data_file_type == BLOCK_RECORD)
+ data_file= share->bitmap.file.file; /* Only opened once */
+ }
+
+ if (!(m_info= maria_clone_internal(share, mode, data_file)))
+ goto err;
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ DBUG_RETURN(m_info);
+
+err:
+ save_errno=my_errno ? my_errno : HA_ERR_END_OF_FILE;
+ if ((save_errno == HA_ERR_CRASHED) ||
+ (save_errno == HA_ERR_CRASHED_ON_USAGE) ||
+ (save_errno == HA_ERR_CRASHED_ON_REPAIR))
+ _ma_report_error(save_errno, name);
+ switch (errpos) {
+ case 5:
+ if (data_file >= 0)
+ VOID(my_close(data_file, MYF(0)));
+ if (old_info)
+ break; /* Don't remove open table */
+ (*share->once_end)(share);
+ /* fall through */
+ case 4:
+ my_free((uchar*) share,MYF(0));
+ /* fall through */
+ case 3:
+ /* fall through */
+ case 2:
+ my_afree((uchar*) disk_cache);
+ /* fall through */
+ case 1:
+ VOID(my_close(kfile,MYF(0)));
+ /* fall through */
+ case 0:
+ default:
+ break;
+ }
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ my_errno= save_errno;
+ DBUG_RETURN (NULL);
+} /* maria_open */
+
+
+/*
+ Reallocate a buffer, if the current buffer is not large enough
+*/
+
+my_bool _ma_alloc_buffer(uchar **old_addr, size_t *old_size,
+ size_t new_size)
+{
+ if (*old_size < new_size)
+ {
+ uchar *addr;
+ if (!(addr= (uchar*) my_realloc((uchar*) *old_addr, new_size,
+ MYF(MY_ALLOW_ZERO_PTR))))
+ return 1;
+ *old_addr= addr;
+ *old_size= new_size;
+ }
+ return 0;
+}
+
+
+ulonglong _ma_safe_mul(ulonglong a, ulonglong b)
+{
+ ulonglong max_val= ~ (ulonglong) 0; /* my_off_t is unsigned */
+
+ if (!a || max_val / a < b)
+ return max_val;
+ return a*b;
+}
+
+ /* Set up functions in structs */
+
+void _ma_setup_functions(register MARIA_SHARE *share)
+{
+ share->once_init= maria_once_init_dummy;
+ share->once_end= maria_once_end_dummy;
+ share->init= maria_scan_init_dummy;
+ share->end= maria_scan_end_dummy;
+ share->scan_init= maria_scan_init_dummy;/* Compat. dummy function */
+ share->scan_end= maria_scan_end_dummy;/* Compat. dummy function */
+ share->write_record_init= _ma_write_init_default;
+ share->write_record_abort= _ma_write_abort_default;
+
+ switch (share->data_file_type) {
+ case COMPRESSED_RECORD:
+ share->read_record= _ma_read_pack_record;
+ share->scan= _ma_read_rnd_pack_record;
+ share->once_init= _ma_once_init_pack_row;
+ share->once_end= _ma_once_end_pack_row;
+ /*
+ Calculate checksum according to data in the original, not compressed,
+ row.
+ */
+ if (share->state.header.org_data_file_type == STATIC_RECORD &&
+ ! (share->options & HA_OPTION_NULL_FIELDS))
+ share->calc_checksum= _ma_static_checksum;
+ else
+ share->calc_checksum= _ma_checksum;
+ share->calc_write_checksum= share->calc_checksum;
+ break;
+ case DYNAMIC_RECORD:
+ share->read_record= _ma_read_dynamic_record;
+ share->scan= _ma_read_rnd_dynamic_record;
+ share->delete_record= _ma_delete_dynamic_record;
+ share->compare_record= _ma_cmp_dynamic_record;
+ share->compare_unique= _ma_cmp_dynamic_unique;
+ share->calc_checksum= share->calc_write_checksum= _ma_checksum;
+ /* add bits used to pack data to pack_reclength for faster allocation */
+ share->base.pack_reclength+= share->base.pack_bytes;
+ if (share->base.blobs)
+ {
+ share->update_record= _ma_update_blob_record;
+ share->write_record= _ma_write_blob_record;
+ }
+ else
+ {
+ share->write_record= _ma_write_dynamic_record;
+ share->update_record= _ma_update_dynamic_record;
+ }
+ break;
+ case STATIC_RECORD:
+ share->read_record= _ma_read_static_record;
+ share->scan= _ma_read_rnd_static_record;
+ share->delete_record= _ma_delete_static_record;
+ share->compare_record= _ma_cmp_static_record;
+ share->update_record= _ma_update_static_record;
+ share->write_record= _ma_write_static_record;
+ share->compare_unique= _ma_cmp_static_unique;
+ if (share->state.header.org_data_file_type == STATIC_RECORD &&
+ ! (share->options & HA_OPTION_NULL_FIELDS))
+ share->calc_checksum= _ma_static_checksum;
+ else
+ share->calc_checksum= _ma_checksum;
+ break;
+ case BLOCK_RECORD:
+ share->once_init= _ma_once_init_block_record;
+ share->once_end= _ma_once_end_block_record;
+ share->init= _ma_init_block_record;
+ share->end= _ma_end_block_record;
+ share->write_record_init= _ma_write_init_block_record;
+ share->write_record_abort= _ma_write_abort_block_record;
+ share->scan_init= _ma_scan_init_block_record;
+ share->scan_end= _ma_scan_end_block_record;
+ share->read_record= _ma_read_block_record;
+ share->scan= _ma_scan_block_record;
+ share->delete_record= _ma_delete_block_record;
+ share->compare_record= _ma_compare_block_record;
+ share->update_record= _ma_update_block_record;
+ share->write_record= _ma_write_block_record;
+ share->compare_unique= _ma_cmp_block_unique;
+ share->calc_checksum= _ma_checksum;
+ /*
+ write_block_record() will calculate the checksum; Tell maria_write()
+ that it doesn't have to do this.
+ */
+ share->calc_write_checksum= 0;
+ break;
+ }
+ share->file_read= _ma_nommap_pread;
+ share->file_write= _ma_nommap_pwrite;
+ share->calc_check_checksum= share->calc_checksum;
+
+ if (!(share->options & HA_OPTION_CHECKSUM) &&
+ share->data_file_type != COMPRESSED_RECORD)
+ share->calc_checksum= share->calc_write_checksum= 0;
+ return;
+}
+
+
+static void setup_key_functions(register MARIA_KEYDEF *keyinfo)
+{
+ if (keyinfo->key_alg == HA_KEY_ALG_RTREE)
+ {
+#ifdef HAVE_RTREE_KEYS
+ keyinfo->ck_insert = maria_rtree_insert;
+ keyinfo->ck_delete = maria_rtree_delete;
+#else
+ DBUG_ASSERT(0); /* maria_open should check it never happens */
+#endif
+ }
+ else
+ {
+ keyinfo->ck_insert = _ma_ck_write;
+ keyinfo->ck_delete = _ma_ck_delete;
+ }
+ if (keyinfo->flag & HA_BINARY_PACK_KEY)
+ { /* Simple prefix compression */
+ keyinfo->bin_search= _ma_seq_search;
+ keyinfo->get_key= _ma_get_binary_pack_key;
+ keyinfo->pack_key= _ma_calc_bin_pack_key_length;
+ keyinfo->store_key= _ma_store_bin_pack_key;
+ }
+ else if (keyinfo->flag & HA_VAR_LENGTH_KEY)
+ {
+ keyinfo->get_key= _ma_get_pack_key;
+ if (keyinfo->seg[0].flag & HA_PACK_KEY)
+ { /* Prefix compression */
+ if (!keyinfo->seg->charset || use_strnxfrm(keyinfo->seg->charset) ||
+ (keyinfo->seg->flag & HA_NULL_PART))
+ keyinfo->bin_search= _ma_seq_search;
+ else
+ keyinfo->bin_search= _ma_prefix_search;
+ keyinfo->pack_key= _ma_calc_var_pack_key_length;
+ keyinfo->store_key= _ma_store_var_pack_key;
+ }
+ else
+ {
+ keyinfo->bin_search= _ma_seq_search;
+ keyinfo->pack_key= _ma_calc_var_key_length; /* Variable length key */
+ keyinfo->store_key= _ma_store_static_key;
+ }
+ }
+ else
+ {
+ keyinfo->bin_search= _ma_bin_search;
+ keyinfo->get_key= _ma_get_static_key;
+ keyinfo->pack_key= _ma_calc_static_key_length;
+ keyinfo->store_key= _ma_store_static_key;
+ }
+ return;
+}
+
+
+/**
+ @brief Function to save and store the header in the index file (.MYI)
+
+ Operates under MARIA_SHARE::intern_lock if requested.
+ Sets MARIA_SHARE::MARIA_STATE_INFO::is_of_horizon if transactional table.
+ Then calls _ma_state_info_write_sub().
+
+ @param share table
+ @param pWrite bitmap: if 1 is set my_pwrite() is used otherwise
+ my_write(); if 2 is set, info about keys is written
+ (should only be needed after ALTER TABLE
+ ENABLE/DISABLE KEYS, and REPAIR/OPTIMIZE); if 4 is
+ set, MARIA_SHARE::intern_lock is taken.
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+uint _ma_state_info_write(MARIA_SHARE *share, uint pWrite)
+{
+ uint res;
+ if (pWrite & 4)
+ pthread_mutex_lock(&share->intern_lock);
+ else if (maria_multi_threaded)
+ safe_mutex_assert_owner(&share->intern_lock);
+ if (share->base.born_transactional && translog_inited &&
+ !maria_in_recovery)
+ {
+ /*
+ In a recovery, we want to set is_of_horizon to the LSN of the last
+ record executed by Recovery, not the current EOF of the log (which
+ is too new). Recovery does it by itself.
+ */
+ share->state.is_of_horizon= translog_get_horizon();
+ }
+ res= _ma_state_info_write_sub(share->kfile.file, &share->state, pWrite);
+ if (pWrite & 4)
+ pthread_mutex_unlock(&share->intern_lock);
+ return res;
+}
+
+
+/**
+ @brief Function to save and store the header in the index file (.MYI).
+
+ Shortcut to use instead of _ma_state_info_write() when appropriate.
+
+ @param file descriptor of the index file to write
+ @param state state information to write to the file
+ @param pWrite bitmap: if 1 is set my_pwrite() is used otherwise
+ my_write(); if 2 is set, info about keys is written
+ (should only be needed after ALTER TABLE
+ ENABLE/DISABLE KEYS, and REPAIR/OPTIMIZE).
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+uint _ma_state_info_write_sub(File file, MARIA_STATE_INFO *state, uint pWrite)
+{
+ /** @todo RECOVERY write it only at checkpoint time */
+ uchar buff[MARIA_STATE_INFO_SIZE + MARIA_STATE_EXTRA_SIZE];
+ uchar *ptr=buff;
+ uint i, keys= (uint) state->header.keys;
+ size_t res;
+ DBUG_ENTER("_ma_state_info_write");
+
+ memcpy_fixed(ptr,&state->header,sizeof(state->header));
+ ptr+=sizeof(state->header);
+
+ /* open_count must be first because of _ma_mark_file_changed ! */
+ mi_int2store(ptr,state->open_count); ptr+= 2;
+ /*
+ if you change the offset of create_rename_lsn/is_of_horizon inside the
+ index file's header, fix ma_create + ma_rename + ma_delete_all +
+ backward-compatibility.
+ */
+ lsn_store(ptr, state->create_rename_lsn); ptr+= LSN_STORE_SIZE;
+ lsn_store(ptr, state->is_of_horizon); ptr+= LSN_STORE_SIZE;
+ *ptr++= (uchar)state->changed;
+ *ptr++= state->sortkey;
+ mi_rowstore(ptr,state->state.records); ptr+= 8;
+ mi_rowstore(ptr,state->state.del); ptr+= 8;
+ mi_rowstore(ptr,state->split); ptr+= 8;
+ mi_sizestore(ptr,state->dellink); ptr+= 8;
+ mi_sizestore(ptr,state->first_bitmap_with_space); ptr+= 8;
+ mi_sizestore(ptr,state->state.key_file_length); ptr+= 8;
+ mi_sizestore(ptr,state->state.data_file_length); ptr+= 8;
+ mi_sizestore(ptr,state->state.empty); ptr+= 8;
+ mi_sizestore(ptr,state->state.key_empty); ptr+= 8;
+ mi_int8store(ptr,state->auto_increment); ptr+= 8;
+ mi_int8store(ptr,(ulonglong) state->state.checksum); ptr+= 8;
+ mi_int4store(ptr,state->process); ptr+= 4;
+ mi_int4store(ptr,state->unique); ptr+= 4;
+ mi_int4store(ptr,state->status); ptr+= 4;
+ mi_int4store(ptr,state->update_count); ptr+= 4;
+
+ ptr+= state->state_diff_length;
+
+ for (i=0; i < keys; i++)
+ {
+ mi_sizestore(ptr,state->key_root[i]); ptr+= 8;
+ }
+ /** @todo RECOVERY BUG key_del is a problem for recovery */
+ mi_sizestore(ptr,state->key_del); ptr+= 8;
+ if (pWrite & 2) /* From maria_chk */
+ {
+ uint key_parts= mi_uint2korr(state->header.key_parts);
+ mi_int4store(ptr,state->sec_index_changed); ptr+= 4;
+ mi_int4store(ptr,state->sec_index_used); ptr+= 4;
+ mi_int4store(ptr,state->version); ptr+= 4;
+ mi_int8store(ptr,state->key_map); ptr+= 8;
+ mi_int8store(ptr,(ulonglong) state->create_time); ptr+= 8;
+ mi_int8store(ptr,(ulonglong) state->recover_time); ptr+= 8;
+ mi_int8store(ptr,(ulonglong) state->check_time); ptr+= 8;
+ mi_sizestore(ptr,state->rec_per_key_rows); ptr+= 8;
+ for (i=0 ; i < key_parts ; i++)
+ {
+ mi_int4store(ptr,state->rec_per_key_part[i]); ptr+=4;
+ }
+ }
+
+ res= (pWrite & 1) ?
+ my_pwrite(file, buff, (size_t) (ptr-buff), 0L,
+ MYF(MY_NABP | MY_THREADSAFE)) :
+ my_write(file, buff, (size_t) (ptr-buff),
+ MYF(MY_NABP));
+ DBUG_RETURN(res != 0);
+}
+
+
+static uchar *_ma_state_info_read(uchar *ptr, MARIA_STATE_INFO *state)
+{
+ uint i,keys,key_parts;
+ memcpy_fixed(&state->header,ptr, sizeof(state->header));
+ ptr+= sizeof(state->header);
+ keys= (uint) state->header.keys;
+ key_parts= mi_uint2korr(state->header.key_parts);
+
+ state->open_count = mi_uint2korr(ptr); ptr+= 2;
+ state->create_rename_lsn= lsn_korr(ptr); ptr+= LSN_STORE_SIZE;
+ state->is_of_horizon= lsn_korr(ptr); ptr+= LSN_STORE_SIZE;
+ state->changed= (my_bool) *ptr++;
+ state->sortkey= (uint) *ptr++;
+ state->state.records= mi_rowkorr(ptr); ptr+= 8;
+ state->state.del = mi_rowkorr(ptr); ptr+= 8;
+ state->split = mi_rowkorr(ptr); ptr+= 8;
+ state->dellink= mi_sizekorr(ptr); ptr+= 8;
+ state->first_bitmap_with_space= mi_sizekorr(ptr); ptr+= 8;
+ state->state.key_file_length = mi_sizekorr(ptr); ptr+= 8;
+ state->state.data_file_length= mi_sizekorr(ptr); ptr+= 8;
+ state->state.empty = mi_sizekorr(ptr); ptr+= 8;
+ state->state.key_empty= mi_sizekorr(ptr); ptr+= 8;
+ state->auto_increment=mi_uint8korr(ptr); ptr+= 8;
+ state->state.checksum=(ha_checksum) mi_uint8korr(ptr);ptr+= 8;
+ state->process= mi_uint4korr(ptr); ptr+= 4;
+ state->unique = mi_uint4korr(ptr); ptr+= 4;
+ state->status = mi_uint4korr(ptr); ptr+= 4;
+ state->update_count=mi_uint4korr(ptr); ptr+= 4;
+
+ ptr+= state->state_diff_length;
+
+ for (i=0; i < keys; i++)
+ {
+ state->key_root[i]= mi_sizekorr(ptr); ptr+= 8;
+ }
+ state->key_del= mi_sizekorr(ptr); ptr+= 8;
+ state->sec_index_changed = mi_uint4korr(ptr); ptr+= 4;
+ state->sec_index_used = mi_uint4korr(ptr); ptr+= 4;
+ state->version = mi_uint4korr(ptr); ptr+= 4;
+ state->key_map = mi_uint8korr(ptr); ptr+= 8;
+ state->create_time = (time_t) mi_sizekorr(ptr); ptr+= 8;
+ state->recover_time =(time_t) mi_sizekorr(ptr); ptr+= 8;
+ state->check_time = (time_t) mi_sizekorr(ptr); ptr+= 8;
+ state->rec_per_key_rows=mi_sizekorr(ptr); ptr+= 8;
+ for (i=0 ; i < key_parts ; i++)
+ {
+ state->rec_per_key_part[i]= mi_uint4korr(ptr); ptr+=4;
+ }
+ return ptr;
+}
+
+
+/**
+ @brief Fills the state by reading its copy on disk.
+
+ Should not be called for transactional tables, as their state on disk is
+ rarely current and so is often misleading for a reader.
+ Does nothing in single user mode.
+
+ @param file file to read from
+ @param state state which will be filled
+ @param pRead if true, use my_pread(), otherwise my_read()
+*/
+
+uint _ma_state_info_read_dsk(File file, MARIA_STATE_INFO *state)
+{
+ char buff[MARIA_STATE_INFO_SIZE + MARIA_STATE_EXTRA_SIZE];
+
+ /* trick to detect transactional tables */
+ DBUG_ASSERT(state->create_rename_lsn == LSN_IMPOSSIBLE);
+ if (!maria_single_user)
+ {
+ if (my_pread(file, buff, state->state_length, 0L, MYF(MY_NABP)))
+ return 1;
+ _ma_state_info_read(buff, state);
+ }
+ return 0;
+}
+
+
+/****************************************************************************
+** store and read of MARIA_BASE_INFO
+****************************************************************************/
+
+uint _ma_base_info_write(File file, MARIA_BASE_INFO *base)
+{
+ uchar buff[MARIA_BASE_INFO_SIZE], *ptr=buff;
+
+ mi_sizestore(ptr,base->keystart); ptr+= 8;
+ mi_sizestore(ptr,base->max_data_file_length); ptr+= 8;
+ mi_sizestore(ptr,base->max_key_file_length); ptr+= 8;
+ mi_rowstore(ptr,base->records); ptr+= 8;
+ mi_rowstore(ptr,base->reloc); ptr+= 8;
+ mi_int4store(ptr,base->mean_row_length); ptr+= 4;
+ mi_int4store(ptr,base->reclength); ptr+= 4;
+ mi_int4store(ptr,base->pack_reclength); ptr+= 4;
+ mi_int4store(ptr,base->min_pack_length); ptr+= 4;
+ mi_int4store(ptr,base->max_pack_length); ptr+= 4;
+ mi_int4store(ptr,base->min_block_length); ptr+= 4;
+ mi_int2store(ptr,base->fields); ptr+= 2;
+ mi_int2store(ptr,base->fixed_not_null_fields); ptr+= 2;
+ mi_int2store(ptr,base->fixed_not_null_fields_length); ptr+= 2;
+ mi_int2store(ptr,base->max_field_lengths); ptr+= 2;
+ mi_int2store(ptr,base->pack_fields); ptr+= 2;
+ mi_int2store(ptr,0); ptr+= 2;
+ mi_int2store(ptr,base->null_bytes); ptr+= 2;
+ mi_int2store(ptr,base->original_null_bytes); ptr+= 2;
+ mi_int2store(ptr,base->field_offsets); ptr+= 2;
+ mi_int2store(ptr,base->min_row_length); ptr+= 2;
+ mi_int2store(ptr,base->block_size); ptr+= 2;
+ *ptr++= base->rec_reflength;
+ *ptr++= base->key_reflength;
+ *ptr++= base->keys;
+ *ptr++= base->auto_key;
+ *ptr++= base->born_transactional;
+ *ptr++= 0; /* Reserved */
+ mi_int2store(ptr,base->pack_bytes); ptr+= 2;
+ mi_int2store(ptr,base->blobs); ptr+= 2;
+ mi_int2store(ptr,base->max_key_block_length); ptr+= 2;
+ mi_int2store(ptr,base->max_key_length); ptr+= 2;
+ mi_int2store(ptr,base->extra_alloc_bytes); ptr+= 2;
+ *ptr++= base->extra_alloc_procent;
+ bzero(ptr,16); ptr+= 16; /* extra */
+ DBUG_ASSERT((ptr - buff) == MARIA_BASE_INFO_SIZE);
+ return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0;
+}
+
+
+static uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base)
+{
+ base->keystart= mi_sizekorr(ptr); ptr+= 8;
+ base->max_data_file_length= mi_sizekorr(ptr); ptr+= 8;
+ base->max_key_file_length= mi_sizekorr(ptr); ptr+= 8;
+ base->records= (ha_rows) mi_sizekorr(ptr); ptr+= 8;
+ base->reloc= (ha_rows) mi_sizekorr(ptr); ptr+= 8;
+ base->mean_row_length= mi_uint4korr(ptr); ptr+= 4;
+ base->reclength= mi_uint4korr(ptr); ptr+= 4;
+ base->pack_reclength= mi_uint4korr(ptr); ptr+= 4;
+ base->min_pack_length= mi_uint4korr(ptr); ptr+= 4;
+ base->max_pack_length= mi_uint4korr(ptr); ptr+= 4;
+ base->min_block_length= mi_uint4korr(ptr); ptr+= 4;
+ base->fields= mi_uint2korr(ptr); ptr+= 2;
+ base->fixed_not_null_fields= mi_uint2korr(ptr); ptr+= 2;
+ base->fixed_not_null_fields_length= mi_uint2korr(ptr);ptr+= 2;
+ base->max_field_lengths= mi_uint2korr(ptr); ptr+= 2;
+ base->pack_fields= mi_uint2korr(ptr); ptr+= 2;
+ ptr+= 2;
+ base->null_bytes= mi_uint2korr(ptr); ptr+= 2;
+ base->original_null_bytes= mi_uint2korr(ptr); ptr+= 2;
+ base->field_offsets= mi_uint2korr(ptr); ptr+= 2;
+ base->min_row_length= mi_uint2korr(ptr); ptr+= 2;
+ base->block_size= mi_uint2korr(ptr); ptr+= 2;
+
+ base->rec_reflength= *ptr++;
+ base->key_reflength= *ptr++;
+ base->keys= *ptr++;
+ base->auto_key= *ptr++;
+ base->born_transactional= *ptr++;
+ ptr++;
+ base->pack_bytes= mi_uint2korr(ptr); ptr+= 2;
+ base->blobs= mi_uint2korr(ptr); ptr+= 2;
+ base->max_key_block_length= mi_uint2korr(ptr); ptr+= 2;
+ base->max_key_length= mi_uint2korr(ptr); ptr+= 2;
+ base->extra_alloc_bytes= mi_uint2korr(ptr); ptr+= 2;
+ base->extra_alloc_procent= *ptr++;
+ ptr+= 16;
+ return ptr;
+}
+
+/*--------------------------------------------------------------------------
+ maria_keydef
+---------------------------------------------------------------------------*/
+
+uint _ma_keydef_write(File file, MARIA_KEYDEF *keydef)
+{
+ uchar buff[MARIA_KEYDEF_SIZE];
+ uchar *ptr=buff;
+
+ *ptr++= (uchar) keydef->keysegs;
+ *ptr++= keydef->key_alg; /* Rtree or Btree */
+ mi_int2store(ptr,keydef->flag); ptr+= 2;
+ mi_int2store(ptr,keydef->block_length); ptr+= 2;
+ mi_int2store(ptr,keydef->keylength); ptr+= 2;
+ mi_int2store(ptr,keydef->minlength); ptr+= 2;
+ mi_int2store(ptr,keydef->maxlength); ptr+= 2;
+ return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0;
+}
+
+char *_ma_keydef_read(char *ptr, MARIA_KEYDEF *keydef)
+{
+ keydef->keysegs = (uint) *ptr++;
+ keydef->key_alg = *ptr++; /* Rtree or Btree */
+
+ keydef->flag = mi_uint2korr(ptr); ptr+= 2;
+ keydef->block_length = mi_uint2korr(ptr); ptr+= 2;
+ keydef->keylength = mi_uint2korr(ptr); ptr+= 2;
+ keydef->minlength = mi_uint2korr(ptr); ptr+= 2;
+ keydef->maxlength = mi_uint2korr(ptr); ptr+= 2;
+ keydef->underflow_block_length=keydef->block_length/3;
+ keydef->version = 0; /* Not saved */
+ keydef->parser = &ft_default_parser;
+ keydef->ftparser_nr = 0;
+ return ptr;
+}
+
+/***************************************************************************
+** maria_keyseg
+***************************************************************************/
+
+int _ma_keyseg_write(File file, const HA_KEYSEG *keyseg)
+{
+ uchar buff[HA_KEYSEG_SIZE];
+ uchar *ptr=buff;
+ ulong pos;
+
+ *ptr++= keyseg->type;
+ *ptr++= keyseg->language;
+ *ptr++= keyseg->null_bit;
+ *ptr++= keyseg->bit_start;
+ *ptr++= keyseg->bit_end;
+ *ptr++= keyseg->bit_length;
+ mi_int2store(ptr,keyseg->flag); ptr+= 2;
+ mi_int2store(ptr,keyseg->length); ptr+= 2;
+ mi_int4store(ptr,keyseg->start); ptr+= 4;
+ pos= keyseg->null_bit ? keyseg->null_pos : keyseg->bit_pos;
+ mi_int4store(ptr, pos);
+ ptr+=4;
+
+ return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0;
+}
+
+
+char *_ma_keyseg_read(char *ptr, HA_KEYSEG *keyseg)
+{
+ keyseg->type = *ptr++;
+ keyseg->language = *ptr++;
+ keyseg->null_bit = *ptr++;
+ keyseg->bit_start = *ptr++;
+ keyseg->bit_end = *ptr++;
+ keyseg->bit_length = *ptr++;
+ keyseg->flag = mi_uint2korr(ptr); ptr+= 2;
+ keyseg->length = mi_uint2korr(ptr); ptr+= 2;
+ keyseg->start = mi_uint4korr(ptr); ptr+= 4;
+ keyseg->null_pos = mi_uint4korr(ptr); ptr+= 4;
+ keyseg->charset=0; /* Will be filled in later */
+ if (keyseg->null_bit)
+ keyseg->bit_pos= (uint16)(keyseg->null_pos + (keyseg->null_bit == 7));
+ else
+ {
+ keyseg->bit_pos= (uint16)keyseg->null_pos;
+ keyseg->null_pos= 0;
+ }
+ return ptr;
+}
+
+/*--------------------------------------------------------------------------
+ maria_uniquedef
+---------------------------------------------------------------------------*/
+
+uint _ma_uniquedef_write(File file, MARIA_UNIQUEDEF *def)
+{
+ uchar buff[MARIA_UNIQUEDEF_SIZE];
+ uchar *ptr=buff;
+
+ mi_int2store(ptr,def->keysegs); ptr+=2;
+ *ptr++= (uchar) def->key;
+ *ptr++ = (uchar) def->null_are_equal;
+
+ return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0;
+}
+
+char *_ma_uniquedef_read(char *ptr, MARIA_UNIQUEDEF *def)
+{
+ def->keysegs = mi_uint2korr(ptr);
+ def->key = ptr[2];
+ def->null_are_equal=ptr[3];
+ return ptr+4; /* 1 extra uchar */
+}
+
+/***************************************************************************
+** MARIA_COLUMNDEF
+***************************************************************************/
+
+uint _ma_columndef_write(File file, MARIA_COLUMNDEF *columndef)
+{
+ uchar buff[MARIA_COLUMNDEF_SIZE];
+ uchar *ptr=buff;
+
+ mi_int6store(ptr,columndef->offset); ptr+= 6;
+ mi_int2store(ptr,columndef->type); ptr+= 2;
+ mi_int2store(ptr,columndef->length); ptr+= 2;
+ mi_int2store(ptr,columndef->fill_length); ptr+= 2;
+ mi_int2store(ptr,columndef->null_pos); ptr+= 2;
+ mi_int2store(ptr,columndef->empty_pos); ptr+= 2;
+ (*ptr++)= columndef->null_bit;
+ (*ptr++)= columndef->empty_bit;
+ return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0;
+}
+
+char *_ma_columndef_read(char *ptr, MARIA_COLUMNDEF *columndef)
+{
+ columndef->offset= mi_uint6korr(ptr); ptr+= 6;
+ columndef->type= mi_sint2korr(ptr); ptr+= 2;
+ columndef->length= mi_uint2korr(ptr); ptr+= 2;
+ columndef->fill_length= mi_uint2korr(ptr); ptr+= 2;
+ columndef->null_pos= mi_uint2korr(ptr); ptr+= 2;
+ columndef->empty_pos= mi_uint2korr(ptr); ptr+= 2;
+ columndef->null_bit= (uint8) *ptr++;
+ columndef->empty_bit= (uint8) *ptr++;
+ return ptr;
+}
+
+/**************************************************************************
+ Open data file
+ We can't use dup() here as the data file descriptors need to have different
+ active seek-positions.
+
+ The argument file_to_dup is here for the future if there would on some OS
+ exist a dup()-like call that would give us two different file descriptors.
+*************************************************************************/
+
+int _ma_open_datafile(MARIA_HA *info, MARIA_SHARE *share,
+ File file_to_dup __attribute__((unused)))
+{
+ info->dfile.file= share->bitmap.file.file=
+ my_open(share->data_file_name, share->mode | O_SHARE,
+ MYF(MY_WME));
+ return info->dfile.file >= 0 ? 0 : 1;
+}
+
+
+int _ma_open_keyfile(MARIA_SHARE *share)
+{
+ /*
+ Modifications to share->kfile should be under intern_lock to protect
+ against a concurrent checkpoint.
+ */
+ pthread_mutex_lock(&share->intern_lock);
+ share->kfile.file= my_open(share->unique_file_name,
+ share->mode | O_SHARE,
+ MYF(MY_WME));
+ pthread_mutex_unlock(&share->intern_lock);
+ return (share->kfile.file < 0);
+}
+
+
+/*
+ Disable all indexes.
+
+ SYNOPSIS
+ maria_disable_indexes()
+ info A pointer to the MARIA storage engine MARIA_HA struct.
+
+ DESCRIPTION
+ Disable all indexes.
+
+ RETURN
+ 0 ok
+*/
+
+int maria_disable_indexes(MARIA_HA *info)
+{
+ MARIA_SHARE *share= info->s;
+
+ maria_clear_all_keys_active(share->state.key_map);
+ return 0;
+}
+
+
+/*
+ Enable all indexes
+
+ SYNOPSIS
+ maria_enable_indexes()
+ info A pointer to the MARIA storage engine MARIA_HA struct.
+
+ DESCRIPTION
+ Enable all indexes. The indexes might have been disabled
+ by maria_disable_index() before.
+ The function works only if both data and indexes are empty,
+ otherwise a repair is required.
+ To be sure, call handler::delete_all_rows() before.
+
+ RETURN
+ 0 ok
+ HA_ERR_CRASHED data or index is non-empty.
+*/
+
+int maria_enable_indexes(MARIA_HA *info)
+{
+ int error= 0;
+ MARIA_SHARE *share= info->s;
+
+ if (share->state.state.data_file_length ||
+ (share->state.state.key_file_length != share->base.keystart))
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ error= HA_ERR_CRASHED;
+ }
+ else
+ maria_set_all_keys_active(share->state.key_map, share->base.keys);
+ return error;
+}
+
+
+/*
+ Test if indexes are disabled.
+
+ SYNOPSIS
+ maria_indexes_are_disabled()
+ info A pointer to the MARIA storage engine MARIA_HA struct.
+
+ DESCRIPTION
+ Test if indexes are disabled.
+
+ RETURN
+ 0 indexes are not disabled
+ 1 all indexes are disabled
+ 2 non-unique indexes are disabled
+*/
+
+int maria_indexes_are_disabled(MARIA_HA *info)
+{
+ MARIA_SHARE *share= info->s;
+
+ /*
+ No keys or all are enabled. keys is the number of keys. Left shifted
+ gives us only one bit set. When decreased by one, gives us all all bits
+ up to this one set and it gets unset.
+ */
+ if (!share->base.keys ||
+ (maria_is_all_keys_active(share->state.key_map, share->base.keys)))
+ return 0;
+
+ /* All are disabled */
+ if (maria_is_any_key_active(share->state.key_map))
+ return 1;
+
+ /*
+ We have keys. Some enabled, some disabled.
+ Don't check for any non-unique disabled but return directly 2
+ */
+ return 2;
+}
+
+
+static my_bool maria_scan_init_dummy(MARIA_HA *info __attribute__((unused)))
+{
+ return 0;
+}
+
+static void maria_scan_end_dummy(MARIA_HA *info __attribute__((unused)))
+{
+}
+
+static my_bool maria_once_init_dummy(MARIA_SHARE *share
+ __attribute__((unused)),
+ File dfile __attribute__((unused)))
+{
+ return 0;
+}
+
+static my_bool maria_once_end_dummy(MARIA_SHARE *share __attribute__((unused)))
+{
+ return 0;
+}
diff --git a/storage/maria/ma_packrec.c b/storage/maria/ma_packrec.c
new file mode 100644
index 00000000000..173fafaf73f
--- /dev/null
+++ b/storage/maria/ma_packrec.c
@@ -0,0 +1,1717 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+ /* Functions to compressed records */
+
+#include "maria_def.h"
+
+#define IS_CHAR ((uint) 32768) /* Bit if char (not offset) in tree */
+
+/* Some definitions to keep in sync with maria_pack.c */
+#define HEAD_LENGTH 32 /* Length of fixed header */
+
+#if INT_MAX > 32767
+#define BITS_SAVED 32
+#define MAX_QUICK_TABLE_BITS 9 /* Because we may shift in 24 bits */
+#else
+#define BITS_SAVED 16
+#define MAX_QUICK_TABLE_BITS 6
+#endif
+
+#define get_bit(BU) ((BU)->bits ? \
+ (BU)->current_byte & ((maria_bit_type) 1 << --(BU)->bits) :\
+ (fill_buffer(BU), (BU)->bits= BITS_SAVED-1,\
+ (BU)->current_byte & ((maria_bit_type) 1 << (BITS_SAVED-1))))
+#define skip_to_next_byte(BU) ((BU)->bits&=~7)
+#define get_bits(BU,count) (((BU)->bits >= count) ? (((BU)->current_byte >> ((BU)->bits-=count)) & mask[count]) : fill_and_get_bits(BU,count))
+
+#define decode_bytes_test_bit(bit) \
+ if (low_byte & (1 << (7-bit))) \
+ pos++; \
+ if (*pos & IS_CHAR) \
+ { bits-=(bit+1); break; } \
+ pos+= *pos
+
+/* Size in uint16 of a Huffman tree for uchar compression of 256 uchar values. */
+#define OFFSET_TABLE_SIZE 512
+
+static my_bool _ma_read_pack_info(MARIA_SHARE *share, File file,
+ pbool fix_keys);
+static uint read_huff_table(MARIA_BIT_BUFF *bit_buff,
+ MARIA_DECODE_TREE *decode_tree,
+ uint16 **decode_table,uchar **intervall_buff,
+ uint16 *tmp_buff);
+static void make_quick_table(uint16 *to_table,uint16 *decode_table,
+ uint *next_free,uint value,uint bits,
+ uint max_bits);
+static void fill_quick_table(uint16 *table,uint bits, uint max_bits,
+ uint value);
+static uint copy_decode_table(uint16 *to_pos,uint offset,
+ uint16 *decode_table);
+static uint find_longest_bitstream(uint16 *table, uint16 *end);
+static void (*get_unpack_function(MARIA_COLUMNDEF *rec))(MARIA_COLUMNDEF *field,
+ MARIA_BIT_BUFF *buff,
+ uchar *to,
+ uchar *end);
+static void uf_zerofill_skip_zero(MARIA_COLUMNDEF *rec,
+ MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_skip_zero(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_space_normal(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_space_endspace_selected(MARIA_COLUMNDEF *rec,
+ MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end);
+static void uf_endspace_selected(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_space_endspace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_endspace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_space_prespace_selected(MARIA_COLUMNDEF *rec,
+ MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end);
+static void uf_prespace_selected(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_space_prespace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_prespace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_zerofill_normal(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_constant(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_intervall(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_zero(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_blob(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end);
+static void uf_varchar1(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end);
+static void uf_varchar2(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end);
+static void decode_bytes(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static uint decode_pos(MARIA_BIT_BUFF *bit_buff,
+ MARIA_DECODE_TREE *decode_tree);
+static void init_bit_buffer(MARIA_BIT_BUFF *bit_buff,uchar *buffer,
+ uint length);
+static uint fill_and_get_bits(MARIA_BIT_BUFF *bit_buff,uint count);
+static void fill_buffer(MARIA_BIT_BUFF *bit_buff);
+static uint max_bit(uint value);
+static uint read_pack_length(uint version, const uchar *buf, ulong *length);
+#ifdef HAVE_MMAP
+static uchar *_ma_mempack_get_block_info(MARIA_HA *maria,
+ MARIA_BIT_BUFF *bit_buff,
+ MARIA_BLOCK_INFO *info,
+ uchar **rec_buff_p,
+ size_t *rec_buff_size_p,
+ uchar *header);
+#endif
+
+static maria_bit_type mask[]=
+{
+ 0x00000000,
+ 0x00000001, 0x00000003, 0x00000007, 0x0000000f,
+ 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff,
+ 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
+ 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff,
+#if BITS_SAVED > 16
+ 0x0001ffff, 0x0003ffff, 0x0007ffff, 0x000fffff,
+ 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
+ 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff,
+ 0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff,
+#endif
+};
+
+
+my_bool _ma_once_init_pack_row(MARIA_SHARE *share, File dfile)
+{
+ share->options|= HA_OPTION_READ_ONLY_DATA;
+ return (_ma_read_pack_info(share, dfile,
+ (pbool)
+ test(!(share->options &
+ (HA_OPTION_PACK_RECORD |
+ HA_OPTION_TEMP_COMPRESS_RECORD)))));
+}
+
+
+my_bool _ma_once_end_pack_row(MARIA_SHARE *share)
+{
+ if (share->decode_trees)
+ {
+ my_free((uchar*) share->decode_trees,MYF(0));
+ my_free((uchar*) share->decode_tables,MYF(0));
+ }
+ return 0;
+}
+
+
+/* Read all packed info, allocate memory and fix field structs */
+
+static my_bool _ma_read_pack_info(MARIA_SHARE *share, File file,
+ pbool fix_keys)
+{
+ int diff_length;
+ uint i,trees,huff_tree_bits,rec_reflength,length;
+ uint16 *decode_table,*tmp_buff;
+ ulong elements,intervall_length;
+ char *disk_cache;
+ uchar *intervall_buff;
+ uchar header[HEAD_LENGTH];
+ MARIA_BIT_BUFF bit_buff;
+ DBUG_ENTER("_ma_read_pack_info");
+
+ if (maria_quick_table_bits < 4)
+ maria_quick_table_bits=4;
+ else if (maria_quick_table_bits > MAX_QUICK_TABLE_BITS)
+ maria_quick_table_bits=MAX_QUICK_TABLE_BITS;
+
+ my_errno=0;
+ if (my_read(file,(uchar*) header,sizeof(header),MYF(MY_NABP)))
+ {
+ if (!my_errno)
+ my_errno=HA_ERR_END_OF_FILE;
+ goto err0;
+ }
+ /* Only the first three bytes of magic number are independent of version. */
+ if (memcmp((uchar*) header, (uchar*) maria_pack_file_magic, 3))
+ {
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+ goto err0;
+ }
+ share->pack.version= header[3]; /* fourth uchar of magic number */
+ share->pack.header_length= uint4korr(header+4);
+ share->min_pack_length=(uint) uint4korr(header+8);
+ share->max_pack_length=(uint) uint4korr(header+12);
+ set_if_bigger(share->base.pack_reclength,share->max_pack_length);
+ elements=uint4korr(header+16);
+ intervall_length=uint4korr(header+20);
+ trees=uint2korr(header+24);
+ share->pack.ref_length=header[26];
+ rec_reflength=header[27];
+ diff_length=(int) rec_reflength - (int) share->base.rec_reflength;
+ if (fix_keys)
+ share->rec_reflength=rec_reflength;
+ share->base.min_block_length=share->min_pack_length+1;
+ if (share->min_pack_length > 254)
+ share->base.min_block_length+=2;
+ DBUG_PRINT("info", ("fixed header length: %u", HEAD_LENGTH));
+ DBUG_PRINT("info", ("total header length: %lu", share->pack.header_length));
+ DBUG_PRINT("info", ("pack file version: %u", share->pack.version));
+ DBUG_PRINT("info", ("min pack length: %lu", share->min_pack_length));
+ DBUG_PRINT("info", ("max pack length: %lu", share->max_pack_length));
+ DBUG_PRINT("info", ("elements of all trees: %lu", elements));
+ DBUG_PRINT("info", ("distinct values bytes: %lu", intervall_length));
+ DBUG_PRINT("info", ("number of code trees: %u", trees));
+ DBUG_PRINT("info", ("bytes for record lgt: %u", share->pack.ref_length));
+ DBUG_PRINT("info", ("record pointer length: %u", rec_reflength));
+
+
+ /*
+ Memory segment #1:
+ - Decode tree heads
+ - Distinct column values
+ */
+ if (!(share->decode_trees=(MARIA_DECODE_TREE*)
+ my_malloc((uint) (trees*sizeof(MARIA_DECODE_TREE)+
+ intervall_length*sizeof(uchar)),
+ MYF(MY_WME))))
+ goto err0;
+ intervall_buff=(uchar*) (share->decode_trees+trees);
+
+ /*
+ Memory segment #2:
+ - Decode tables
+ - Quick decode tables
+ - Temporary decode table
+ - Compressed data file header cache
+ This segment will be reallocated after construction of the tables.
+ */
+ length=(uint) (elements*2+trees*(1 << maria_quick_table_bits));
+ if (!(share->decode_tables=(uint16*)
+ my_malloc((length+OFFSET_TABLE_SIZE)*sizeof(uint16)+
+ (uint) (share->pack.header_length - sizeof(header)),
+ MYF(MY_WME | MY_ZEROFILL))))
+ goto err1;
+ tmp_buff=share->decode_tables+length;
+ disk_cache=(uchar*) (tmp_buff+OFFSET_TABLE_SIZE);
+
+ if (my_read(file,disk_cache,
+ (uint) (share->pack.header_length-sizeof(header)),
+ MYF(MY_NABP)))
+ goto err2;
+
+ huff_tree_bits=max_bit(trees ? trees-1 : 0);
+ init_bit_buffer(&bit_buff, (uchar*) disk_cache,
+ (uint) (share->pack.header_length-sizeof(header)));
+ /* Read new info for each field */
+ for (i=0 ; i < share->base.fields ; i++)
+ {
+ share->columndef[i].base_type=(enum en_fieldtype) get_bits(&bit_buff,5);
+ share->columndef[i].pack_type=(uint) get_bits(&bit_buff,6);
+ share->columndef[i].space_length_bits=get_bits(&bit_buff,5);
+ share->columndef[i].huff_tree=share->decode_trees+(uint) get_bits(&bit_buff,
+ huff_tree_bits);
+ share->columndef[i].unpack= get_unpack_function(share->columndef + i);
+ DBUG_PRINT("info", ("col: %2u type: %2u pack: %u slbits: %2u",
+ i, share->columndef[i].base_type,
+ share->columndef[i].pack_type,
+ share->columndef[i].space_length_bits));
+ }
+ skip_to_next_byte(&bit_buff);
+ /*
+ Construct the decoding tables from the file header. Keep track of
+ the used memory.
+ */
+ decode_table=share->decode_tables;
+ for (i=0 ; i < trees ; i++)
+ if (read_huff_table(&bit_buff,share->decode_trees+i,&decode_table,
+ &intervall_buff,tmp_buff))
+ goto err3;
+ /* Reallocate the decoding tables to the used size. */
+ decode_table=(uint16*)
+ my_realloc((uchar*) share->decode_tables,
+ (uint) ((uchar*) decode_table - (uchar*) share->decode_tables),
+ MYF(MY_HOLD_ON_ERROR));
+ /* Fix the table addresses in the tree heads. */
+ {
+ long diff=PTR_BYTE_DIFF(decode_table,share->decode_tables);
+ share->decode_tables=decode_table;
+ for (i=0 ; i < trees ; i++)
+ share->decode_trees[i].table=ADD_TO_PTR(share->decode_trees[i].table,
+ diff, uint16*);
+ }
+
+ /* Fix record-ref-length for keys */
+ if (fix_keys)
+ {
+ for (i=0 ; i < share->base.keys ; i++)
+ {
+ MARIA_KEYDEF *keyinfo= &share->keyinfo[i];
+ keyinfo->keylength+= (uint16) diff_length;
+ keyinfo->minlength+= (uint16) diff_length;
+ keyinfo->maxlength+= (uint16) diff_length;
+ keyinfo->seg[keyinfo->flag & HA_FULLTEXT ?
+ FT_SEGS : keyinfo->keysegs].length= (uint16) rec_reflength;
+ }
+ if (share->ft2_keyinfo.seg)
+ {
+ MARIA_KEYDEF *ft2_keyinfo= &share->ft2_keyinfo;
+ ft2_keyinfo->keylength+= (uint16) diff_length;
+ ft2_keyinfo->minlength+= (uint16) diff_length;
+ ft2_keyinfo->maxlength+= (uint16) diff_length;
+ }
+ }
+
+ if (bit_buff.error || bit_buff.pos < bit_buff.end)
+ goto err3;
+
+ DBUG_RETURN(0);
+
+err3:
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+err2:
+ my_free((uchar*) share->decode_tables,MYF(0));
+err1:
+ my_free((uchar*) share->decode_trees,MYF(0));
+err0:
+ DBUG_RETURN(1);
+}
+
+
+/*
+ Read a huff-code-table from datafile.
+
+ SYNOPSIS
+ read_huff_table()
+ bit_buff Bit buffer pointing at start of the
+ decoding table in the file header cache.
+ decode_tree Pointer to the decode tree head.
+ decode_table IN/OUT Address of a pointer to the next free space.
+ intervall_buff IN/OUT Address of a pointer to the next unused values.
+ tmp_buff Buffer for temporary extraction of a full
+ decoding table as read from bit_buff.
+
+ RETURN
+ 0 OK.
+ 1 Error.
+*/
+static uint read_huff_table(MARIA_BIT_BUFF *bit_buff,
+ MARIA_DECODE_TREE *decode_tree,
+ uint16 **decode_table, uchar **intervall_buff,
+ uint16 *tmp_buff)
+{
+ uint min_chr,elements,char_bits,offset_bits,size,intervall_length,table_bits,
+ next_free_offset;
+ uint16 *ptr,*end;
+ DBUG_ENTER("read_huff_table");
+
+ if (!get_bits(bit_buff,1))
+ {
+ /* Byte value compression. */
+ min_chr=get_bits(bit_buff,8);
+ elements=get_bits(bit_buff,9);
+ char_bits=get_bits(bit_buff,5);
+ offset_bits=get_bits(bit_buff,5);
+ intervall_length=0;
+ ptr=tmp_buff;
+ ptr=tmp_buff;
+ DBUG_PRINT("info", ("byte value compression"));
+ DBUG_PRINT("info", ("minimum uchar value: %u", min_chr));
+ DBUG_PRINT("info", ("number of tree nodes: %u", elements));
+ DBUG_PRINT("info", ("bits for values: %u", char_bits));
+ DBUG_PRINT("info", ("bits for tree offsets: %u", offset_bits));
+ if (elements > 256)
+ {
+ DBUG_PRINT("error", ("ERROR: illegal number of tree elements: %u",
+ elements));
+ DBUG_RETURN(1);
+ }
+ }
+ else
+ {
+ /* Distinct column value compression. */
+ min_chr=0;
+ elements=get_bits(bit_buff,15);
+ intervall_length=get_bits(bit_buff,16);
+ char_bits=get_bits(bit_buff,5);
+ offset_bits=get_bits(bit_buff,5);
+ decode_tree->quick_table_bits=0;
+ ptr= *decode_table;
+ DBUG_PRINT("info", ("distinct column value compression"));
+ DBUG_PRINT("info", ("number of tree nodes: %u", elements));
+ DBUG_PRINT("info", ("value buffer length: %u", intervall_length));
+ DBUG_PRINT("info", ("bits for value index: %u", char_bits));
+ DBUG_PRINT("info", ("bits for tree offsets: %u", offset_bits));
+ }
+ size=elements*2-2;
+ DBUG_PRINT("info", ("tree size in uint16: %u", size));
+ DBUG_PRINT("info", ("tree size in bytes: %u",
+ size * (uint) sizeof(uint16)));
+
+ for (end=ptr+size ; ptr < end ; ptr++)
+ {
+ if (get_bit(bit_buff))
+ {
+ *ptr= (uint16) get_bits(bit_buff,offset_bits);
+ if ((ptr + *ptr >= end) || !*ptr)
+ {
+ DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree"));
+ DBUG_RETURN(1);
+ }
+ }
+ else
+ *ptr= (uint16) (IS_CHAR + (get_bits(bit_buff,char_bits) + min_chr));
+ }
+ skip_to_next_byte(bit_buff);
+
+ decode_tree->table= *decode_table;
+ decode_tree->intervalls= *intervall_buff;
+ if (! intervall_length)
+ {
+ /* Byte value compression. ptr started from tmp_buff. */
+ /* Find longest Huffman code from begin to end of tree in bits. */
+ table_bits= find_longest_bitstream(tmp_buff, ptr);
+ if (table_bits >= OFFSET_TABLE_SIZE)
+ DBUG_RETURN(1);
+ if (table_bits > maria_quick_table_bits)
+ table_bits=maria_quick_table_bits;
+ DBUG_PRINT("info", ("table bits: %u", table_bits));
+
+ next_free_offset= (1 << table_bits);
+ make_quick_table(*decode_table,tmp_buff,&next_free_offset,0,table_bits,
+ table_bits);
+ (*decode_table)+= next_free_offset;
+ decode_tree->quick_table_bits=table_bits;
+ }
+ else
+ {
+ /* Distinct column value compression. ptr started from *decode_table */
+ (*decode_table)=end;
+ /*
+ get_bits() moves some bytes to a cache buffer in advance. May need
+ to step back.
+ */
+ bit_buff->pos-= bit_buff->bits/8;
+ /* Copy the distinct column values from the buffer. */
+ memcpy(*intervall_buff,bit_buff->pos,(size_t) intervall_length);
+ (*intervall_buff)+=intervall_length;
+ bit_buff->pos+=intervall_length;
+ bit_buff->bits=0;
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Make a quick_table for faster decoding.
+
+ SYNOPSIS
+ make_quick_table()
+ to_table Target quick_table and remaining decode table.
+ decode_table Source Huffman (sub-)tree within tmp_buff.
+ next_free_offset IN/OUT Next free offset from to_table.
+ Starts behind quick_table on the top-level.
+ value Huffman bits found so far.
+ bits Remaining bits to be collected.
+ max_bits Total number of bits to collect (table_bits).
+
+ DESCRIPTION
+
+ The quick table is an array of 16-bit values. There exists one value
+ for each possible code representable by max_bits (table_bits) bits.
+ In most cases table_bits is 9. So there are 512 16-bit values.
+
+ If the high-order bit (16) is set (IS_CHAR) then the array slot for
+ this value is a valid Huffman code for a resulting uchar value.
+
+ The low-order 8 bits (1..8) are the resulting uchar value.
+
+ Bits 9..14 are the length of the Huffman code for this uchar value.
+ This means so many bits from the input stream were needed to
+ represent this uchar value. The remaining bits belong to later
+ Huffman codes. This also means that for every Huffman code shorter
+ than table_bits there are multiple entires in the array, which
+ differ just in the unused bits.
+
+ If the high-order bit (16) is clear (0) then the remaining bits are
+ the position of the remaining Huffman decode tree segment behind the
+ quick table.
+
+ RETURN
+ void
+*/
+
+static void make_quick_table(uint16 *to_table, uint16 *decode_table,
+ uint *next_free_offset, uint value, uint bits,
+ uint max_bits)
+{
+ DBUG_ENTER("make_quick_table");
+
+ /*
+ When down the table to the requested maximum, copy the rest of the
+ Huffman table.
+ */
+ if (!bits--)
+ {
+ /*
+ Remaining left Huffman tree segment starts behind quick table.
+ Remaining right Huffman tree segment starts behind left segment.
+ */
+ to_table[value]= (uint16) *next_free_offset;
+ /*
+ Re-construct the remaining Huffman tree segment at
+ next_free_offset in to_table.
+ */
+ *next_free_offset=copy_decode_table(to_table, *next_free_offset,
+ decode_table);
+ DBUG_VOID_RETURN;
+ }
+
+ /* Descent on the left side. Left side bits are clear (0). */
+ if (!(*decode_table & IS_CHAR))
+ {
+ /* Not a leaf. Follow the pointer. */
+ make_quick_table(to_table,decode_table+ *decode_table,
+ next_free_offset,value,bits,max_bits);
+ }
+ else
+ {
+ /*
+ A leaf. A Huffman code is complete. Fill the quick_table
+ array for all possible bit strings starting with this Huffman
+ code.
+ */
+ fill_quick_table(to_table+value,bits,max_bits,(uint) *decode_table);
+ }
+
+ /* Descent on the right side. Right side bits are set (1). */
+ decode_table++;
+ value|= (1 << bits);
+ if (!(*decode_table & IS_CHAR))
+ {
+ /* Not a leaf. Follow the pointer. */
+ make_quick_table(to_table,decode_table+ *decode_table,
+ next_free_offset,value,bits,max_bits);
+ }
+ else
+ {
+ /*
+ A leaf. A Huffman code is complete. Fill the quick_table
+ array for all possible bit strings starting with this Huffman
+ code.
+ */
+ fill_quick_table(to_table+value,bits,max_bits,(uint) *decode_table);
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Fill quick_table for all possible values starting with this Huffman code.
+
+ SYNOPSIS
+ fill_quick_table()
+ table Target quick_table position.
+ bits Unused bits from max_bits.
+ max_bits Total number of bits to collect (table_bits).
+ value The uchar encoded by the found Huffman code.
+
+ DESCRIPTION
+
+ Fill the segment (all slots) of the quick_table array with the
+ resulting value for the found Huffman code. There are as many slots
+ as there are combinations representable by the unused bits.
+
+ In most cases we use 9 table bits. Assume a 3-bit Huffman code. Then
+ there are 6 unused bits. Hence we fill 2**6 = 64 slots with the
+ value.
+
+ RETURN
+ void
+*/
+
+static void fill_quick_table(uint16 *table, uint bits, uint max_bits,
+ uint value)
+{
+ uint16 *end;
+ DBUG_ENTER("fill_quick_table");
+
+ /*
+ Bits 1..8 of value represent the decoded uchar value.
+ Bits 9..14 become the length of the Huffman code for this uchar value.
+ Bit 16 flags a valid code (IS_CHAR).
+ */
+ value|= (max_bits - bits) << 8 | IS_CHAR;
+
+ for (end= table + (uint) (((uint) 1 << bits)); table < end; table++)
+ {
+ *table= (uint16) value;
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Reconstruct a decode subtree at the target position.
+
+ SYNOPSIS
+ copy_decode_table()
+ to_pos Target quick_table and remaining decode table.
+ offset Next free offset from to_pos.
+ decode_table Source Huffman subtree within tmp_buff.
+
+ NOTE
+ Pointers in the decode tree are relative to the pointers position.
+
+ RETURN
+ next free offset from to_pos.
+*/
+
+static uint copy_decode_table(uint16 *to_pos, uint offset,
+ uint16 *decode_table)
+{
+ uint prev_offset= offset;
+ DBUG_ENTER("copy_decode_table");
+
+ /* Descent on the left side. */
+ if (!(*decode_table & IS_CHAR))
+ {
+ /* Set a pointer to the next target node. */
+ to_pos[offset]=2;
+ /* Copy the left hand subtree there. */
+ offset=copy_decode_table(to_pos,offset+2,decode_table+ *decode_table);
+ }
+ else
+ {
+ /* Copy the uchar value. */
+ to_pos[offset]= *decode_table;
+ /* Step behind this node. */
+ offset+=2;
+ }
+
+ /* Descent on the right side. */
+ decode_table++;
+ if (!(*decode_table & IS_CHAR))
+ {
+ /* Set a pointer to the next free target node. */
+ to_pos[prev_offset+1]=(uint16) (offset-prev_offset-1);
+ /* Copy the right hand subtree to the entry of that node. */
+ offset=copy_decode_table(to_pos,offset,decode_table+ *decode_table);
+ }
+ else
+ {
+ /* Copy the uchar value. */
+ to_pos[prev_offset+1]= *decode_table;
+ }
+ DBUG_RETURN(offset);
+}
+
+
+/*
+ Find the length of the longest Huffman code in this table in bits.
+
+ SYNOPSIS
+ find_longest_bitstream()
+ table Code (sub-)table start.
+ end End of code table.
+
+ IMPLEMENTATION
+
+ Recursively follow the branch(es) of the code pair on every level of
+ the tree until two uchar values (and no branch) are found. Add one to
+ each level when returning back from each recursion stage.
+
+ 'end' is used for error checking only. A clean tree terminates
+ before reaching 'end'. Hence the exact value of 'end' is not too
+ important. However having it higher than necessary could lead to
+ misbehaviour should 'next' jump into the dirty area.
+
+ RETURN
+ length Length of longest Huffman code in bits.
+ >= OFFSET_TABLE_SIZE Error, broken tree. It does not end before 'end'.
+*/
+
+static uint find_longest_bitstream(uint16 *table, uint16 *end)
+{
+ uint length=1;
+ uint length2;
+ if (!(*table & IS_CHAR))
+ {
+ uint16 *next= table + *table;
+ if (next > end || next == table)
+ {
+ DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree"));
+ return OFFSET_TABLE_SIZE;
+ }
+ length=find_longest_bitstream(next, end)+1;
+ }
+ table++;
+ if (!(*table & IS_CHAR))
+ {
+ uint16 *next= table + *table;
+ if (next > end || next == table)
+ {
+ DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree"));
+ return OFFSET_TABLE_SIZE;
+ }
+ length2= find_longest_bitstream(next, end) + 1;
+ length=max(length,length2);
+ }
+ return length;
+}
+
+
+/*
+ Read record from datafile.
+
+ SYNOPSIS
+ _ma_read_pack_record()
+ info A pointer to MARIA_HA.
+ filepos File offset of the record.
+ buf RETURN The buffer to receive the record.
+
+ RETURN
+ 0 On success
+ # Error number
+*/
+
+int _ma_read_pack_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos)
+{
+ MARIA_BLOCK_INFO block_info;
+ File file;
+ DBUG_ENTER("maria_read_pack_record");
+
+ if (filepos == HA_OFFSET_ERROR)
+ DBUG_RETURN(my_errno); /* _search() didn't find record */
+
+ file= info->dfile.file;
+ if (_ma_pack_get_block_info(info, &info->bit_buff, &block_info,
+ &info->rec_buff, &info->rec_buff_size, file,
+ filepos))
+ goto err;
+ if (my_read(file,(uchar*) info->rec_buff + block_info.offset ,
+ block_info.rec_len - block_info.offset, MYF(MY_NABP)))
+ goto panic;
+ info->update|= HA_STATE_AKTIV;
+ DBUG_RETURN(_ma_pack_rec_unpack(info,&info->bit_buff, buf,
+ info->rec_buff, block_info.rec_len));
+panic:
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+err:
+ DBUG_RETURN(my_errno);
+}
+
+
+
+int _ma_pack_rec_unpack(register MARIA_HA *info, MARIA_BIT_BUFF *bit_buff,
+ register uchar *to, uchar *from, ulong reclength)
+{
+ uchar *end_field;
+ reg3 MARIA_COLUMNDEF *end;
+ MARIA_COLUMNDEF *current_field;
+ MARIA_SHARE *share=info->s;
+ DBUG_ENTER("_ma_pack_rec_unpack");
+
+ if (info->s->base.null_bytes)
+ {
+ memcpy(to, from, info->s->base.null_bytes);
+ to+= info->s->base.null_bytes;
+ from+= info->s->base.null_bytes;
+ reclength-= info->s->base.null_bytes;
+ }
+ init_bit_buffer(bit_buff, (uchar*) from, reclength);
+ for (current_field=share->columndef, end=current_field+share->base.fields ;
+ current_field < end ;
+ current_field++,to=end_field)
+ {
+ end_field=to+current_field->length;
+ (*current_field->unpack)(current_field, bit_buff, to, end_field);
+ }
+ if (!bit_buff->error &&
+ bit_buff->pos - bit_buff->bits / 8 == bit_buff->end)
+ DBUG_RETURN(0);
+ info->update&= ~HA_STATE_AKTIV;
+ DBUG_RETURN(my_errno=HA_ERR_WRONG_IN_RECORD);
+} /* _ma_pack_rec_unpack */
+
+
+ /* Return function to unpack field */
+
+static void (*get_unpack_function(MARIA_COLUMNDEF *rec))
+ (MARIA_COLUMNDEF *, MARIA_BIT_BUFF *, uchar *, uchar *)
+{
+ switch (rec->base_type) {
+ case FIELD_SKIP_ZERO:
+ if (rec->pack_type & PACK_TYPE_ZERO_FILL)
+ return &uf_zerofill_skip_zero;
+ return &uf_skip_zero;
+ case FIELD_NORMAL:
+ if (rec->pack_type & PACK_TYPE_SPACE_FIELDS)
+ return &uf_space_normal;
+ if (rec->pack_type & PACK_TYPE_ZERO_FILL)
+ return &uf_zerofill_normal;
+ return &decode_bytes;
+ case FIELD_SKIP_ENDSPACE:
+ if (rec->pack_type & PACK_TYPE_SPACE_FIELDS)
+ {
+ if (rec->pack_type & PACK_TYPE_SELECTED)
+ return &uf_space_endspace_selected;
+ return &uf_space_endspace;
+ }
+ if (rec->pack_type & PACK_TYPE_SELECTED)
+ return &uf_endspace_selected;
+ return &uf_endspace;
+ case FIELD_SKIP_PRESPACE:
+ if (rec->pack_type & PACK_TYPE_SPACE_FIELDS)
+ {
+ if (rec->pack_type & PACK_TYPE_SELECTED)
+ return &uf_space_prespace_selected;
+ return &uf_space_prespace;
+ }
+ if (rec->pack_type & PACK_TYPE_SELECTED)
+ return &uf_prespace_selected;
+ return &uf_prespace;
+ case FIELD_CONSTANT:
+ return &uf_constant;
+ case FIELD_INTERVALL:
+ return &uf_intervall;
+ case FIELD_ZERO:
+ case FIELD_CHECK:
+ return &uf_zero;
+ case FIELD_BLOB:
+ return &uf_blob;
+ case FIELD_VARCHAR:
+ if (rec->length <= 256) /* 255 + 1 uchar length */
+ return &uf_varchar1;
+ return &uf_varchar2;
+ case FIELD_LAST:
+ default:
+ return 0; /* This should never happend */
+ }
+}
+
+ /* The different functions to unpack a field */
+
+static void uf_zerofill_skip_zero(MARIA_COLUMNDEF *rec,
+ MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end)
+{
+ if (get_bit(bit_buff))
+ bzero((char*) to,(uint) (end-to));
+ else
+ {
+ end-=rec->space_length_bits;
+ decode_bytes(rec,bit_buff,to,end);
+ bzero((char*) end,rec->space_length_bits);
+ }
+}
+
+static void uf_skip_zero(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end)
+{
+ if (get_bit(bit_buff))
+ bzero((char*) to,(uint) (end-to));
+ else
+ decode_bytes(rec,bit_buff,to,end);
+}
+
+static void uf_space_normal(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end)
+{
+ if (get_bit(bit_buff))
+ bfill((uchar*) to,(end-to),' ');
+ else
+ decode_bytes(rec,bit_buff,to,end);
+}
+
+static void uf_space_endspace_selected(MARIA_COLUMNDEF *rec,
+ MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end)
+{
+ uint spaces;
+ if (get_bit(bit_buff))
+ bfill((uchar*) to,(end-to),' ');
+ else
+ {
+ if (get_bit(bit_buff))
+ {
+ if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end)
+ {
+ bit_buff->error=1;
+ return;
+ }
+ if (to+spaces != end)
+ decode_bytes(rec,bit_buff,to,end-spaces);
+ bfill((uchar*) end-spaces,spaces,' ');
+ }
+ else
+ decode_bytes(rec,bit_buff,to,end);
+ }
+}
+
+static void uf_endspace_selected(MARIA_COLUMNDEF *rec,
+ MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end)
+{
+ uint spaces;
+ if (get_bit(bit_buff))
+ {
+ if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end)
+ {
+ bit_buff->error=1;
+ return;
+ }
+ if (to+spaces != end)
+ decode_bytes(rec,bit_buff,to,end-spaces);
+ bfill((uchar*) end-spaces,spaces,' ');
+ }
+ else
+ decode_bytes(rec,bit_buff,to,end);
+}
+
+static void uf_space_endspace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end)
+{
+ uint spaces;
+ if (get_bit(bit_buff))
+ bfill((uchar*) to,(end-to),' ');
+ else
+ {
+ if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end)
+ {
+ bit_buff->error=1;
+ return;
+ }
+ if (to+spaces != end)
+ decode_bytes(rec,bit_buff,to,end-spaces);
+ bfill((uchar*) end-spaces,spaces,' ');
+ }
+}
+
+static void uf_endspace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end)
+{
+ uint spaces;
+ if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end)
+ {
+ bit_buff->error=1;
+ return;
+ }
+ if (to+spaces != end)
+ decode_bytes(rec,bit_buff,to,end-spaces);
+ bfill((uchar*) end-spaces,spaces,' ');
+}
+
+static void uf_space_prespace_selected(MARIA_COLUMNDEF *rec,
+ MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end)
+{
+ uint spaces;
+ if (get_bit(bit_buff))
+ bfill((uchar*) to,(end-to),' ');
+ else
+ {
+ if (get_bit(bit_buff))
+ {
+ if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end)
+ {
+ bit_buff->error=1;
+ return;
+ }
+ bfill((uchar*) to,spaces,' ');
+ if (to+spaces != end)
+ decode_bytes(rec,bit_buff,to+spaces,end);
+ }
+ else
+ decode_bytes(rec,bit_buff,to,end);
+ }
+}
+
+
+static void uf_prespace_selected(MARIA_COLUMNDEF *rec,
+ MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end)
+{
+ uint spaces;
+ if (get_bit(bit_buff))
+ {
+ if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end)
+ {
+ bit_buff->error=1;
+ return;
+ }
+ bfill((uchar*) to,spaces,' ');
+ if (to+spaces != end)
+ decode_bytes(rec,bit_buff,to+spaces,end);
+ }
+ else
+ decode_bytes(rec,bit_buff,to,end);
+}
+
+
+static void uf_space_prespace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end)
+{
+ uint spaces;
+ if (get_bit(bit_buff))
+ bfill((uchar*) to,(end-to),' ');
+ else
+ {
+ if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end)
+ {
+ bit_buff->error=1;
+ return;
+ }
+ bfill((uchar*) to,spaces,' ');
+ if (to+spaces != end)
+ decode_bytes(rec,bit_buff,to+spaces,end);
+ }
+}
+
+static void uf_prespace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end)
+{
+ uint spaces;
+ if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end)
+ {
+ bit_buff->error=1;
+ return;
+ }
+ bfill((uchar*) to,spaces,' ');
+ if (to+spaces != end)
+ decode_bytes(rec,bit_buff,to+spaces,end);
+}
+
+static void uf_zerofill_normal(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end)
+{
+ end-=rec->space_length_bits;
+ decode_bytes(rec,bit_buff, to, end);
+ bzero((char*) end,rec->space_length_bits);
+}
+
+static void uf_constant(MARIA_COLUMNDEF *rec,
+ MARIA_BIT_BUFF *bit_buff __attribute__((unused)),
+ uchar *to, uchar *end)
+{
+ memcpy(to,rec->huff_tree->intervalls,(size_t) (end-to));
+}
+
+static void uf_intervall(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to,
+ uchar *end)
+{
+ reg1 uint field_length=(uint) (end-to);
+ memcpy(to,rec->huff_tree->intervalls+field_length*decode_pos(bit_buff,
+ rec->huff_tree),
+ (size_t) field_length);
+}
+
+
+/*ARGSUSED*/
+static void uf_zero(MARIA_COLUMNDEF *rec __attribute__((unused)),
+ MARIA_BIT_BUFF *bit_buff __attribute__((unused)),
+ uchar *to, uchar *end)
+{
+ bzero(to, (uint) (end-to));
+}
+
+static void uf_blob(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end)
+{
+ if (get_bit(bit_buff))
+ bzero(to, (uint) (end-to));
+ else
+ {
+ ulong length=get_bits(bit_buff,rec->space_length_bits);
+ uint pack_length=(uint) (end-to)-portable_sizeof_char_ptr;
+ if (bit_buff->blob_pos+length > bit_buff->blob_end)
+ {
+ bit_buff->error=1;
+ bzero((uchar*) to,(end-to));
+ return;
+ }
+ decode_bytes(rec,bit_buff,(uchar*) bit_buff->blob_pos,
+ (uchar*) bit_buff->blob_pos+length);
+ _ma_store_blob_length((uchar*) to,pack_length,length);
+ memcpy_fixed((char*) to+pack_length,(char*) &bit_buff->blob_pos,
+ sizeof(char*));
+ bit_buff->blob_pos+=length;
+ }
+}
+
+
+static void uf_varchar1(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end __attribute__((unused)))
+{
+ if (get_bit(bit_buff))
+ to[0]= 0; /* Zero lengths */
+ else
+ {
+ ulong length=get_bits(bit_buff,rec->space_length_bits);
+ *to= (char) length;
+ decode_bytes(rec,bit_buff,to+1,to+1+length);
+ }
+}
+
+
+static void uf_varchar2(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end __attribute__((unused)))
+{
+ if (get_bit(bit_buff))
+ to[0]=to[1]=0; /* Zero lengths */
+ else
+ {
+ ulong length=get_bits(bit_buff,rec->space_length_bits);
+ int2store(to,length);
+ decode_bytes(rec,bit_buff,to+2,to+2+length);
+ }
+}
+
+ /* Functions to decode of buffer of bits */
+
+#if BITS_SAVED == 64
+
+static void decode_bytes(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end)
+{
+ reg1 uint bits,low_byte;
+ reg3 uint16 *pos;
+ reg4 uint table_bits,table_and;
+ MARIA_DECODE_TREE *decode_tree;
+
+ decode_tree=rec->decode_tree;
+ bits=bit_buff->bits; /* Save in reg for quicker access */
+ table_bits=decode_tree->quick_table_bits;
+ table_and= (1 << table_bits)-1;
+
+ do
+ {
+ if (bits <= 32)
+ {
+ if (bit_buff->pos > bit_buff->end+4)
+ {
+ bit_buff->error=1;
+ return; /* Can't be right */
+ }
+ bit_buff->current_byte= (bit_buff->current_byte << 32) +
+ ((((uint) bit_buff->pos[3])) +
+ (((uint) bit_buff->pos[2]) << 8) +
+ (((uint) bit_buff->pos[1]) << 16) +
+ (((uint) bit_buff->pos[0]) << 24));
+ bit_buff->pos+=4;
+ bits+=32;
+ }
+ /*
+ First use info in quick_table.
+
+ The quick table is an array of 16-bit values. There exists one
+ value for each possible code representable by table_bits bits.
+ In most cases table_bits is 9. So there are 512 16-bit values.
+
+ If the high-order bit (16) is set (IS_CHAR) then the array slot
+ for this value is a valid Huffman code for a resulting uchar value.
+
+ The low-order 8 bits (1..8) are the resulting uchar value.
+
+ Bits 9..14 are the length of the Huffman code for this uchar value.
+ This means so many bits from the input stream were needed to
+ represent this uchar value. The remaining bits belong to later
+ Huffman codes. This also means that for every Huffman code shorter
+ than table_bits there are multiple entires in the array, which
+ differ just in the unused bits.
+
+ If the high-order bit (16) is clear (0) then the remaining bits are
+ the position of the remaining Huffman decode tree segment behind the
+ quick table.
+ */
+ low_byte=(uint) (bit_buff->current_byte >> (bits - table_bits)) & table_and;
+ low_byte=decode_tree->table[low_byte];
+ if (low_byte & IS_CHAR)
+ {
+ /*
+ All Huffman codes of less or equal table_bits length are in the
+ quick table. This is one of them.
+ */
+ *to++ = (char) (low_byte & 255); /* Found char in quick table */
+ bits-= ((low_byte >> 8) & 31); /* Remove bits used */
+ }
+ else
+ { /* Map through rest of decode-table */
+ /* This means that the Huffman code must be longer than table_bits. */
+ pos=decode_tree->table+low_byte;
+ bits-=table_bits;
+ /* NOTE: decode_bytes_test_bit() is a macro wich contains a break !!! */
+ for (;;)
+ {
+ low_byte=(uint) (bit_buff->current_byte >> (bits-8));
+ decode_bytes_test_bit(0);
+ decode_bytes_test_bit(1);
+ decode_bytes_test_bit(2);
+ decode_bytes_test_bit(3);
+ decode_bytes_test_bit(4);
+ decode_bytes_test_bit(5);
+ decode_bytes_test_bit(6);
+ decode_bytes_test_bit(7);
+ bits-=8;
+ }
+ *to++ = (char) *pos;
+ }
+ } while (to != end);
+
+ bit_buff->bits=bits;
+ return;
+}
+
+#else
+
+static void decode_bytes(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end)
+{
+ reg1 uint bits,low_byte;
+ reg3 uint16 *pos;
+ reg4 uint table_bits,table_and;
+ MARIA_DECODE_TREE *decode_tree;
+
+ decode_tree=rec->huff_tree;
+ bits=bit_buff->bits; /* Save in reg for quicker access */
+ table_bits=decode_tree->quick_table_bits;
+ table_and= (1 << table_bits)-1;
+
+ do
+ {
+ if (bits < table_bits)
+ {
+ if (bit_buff->pos > bit_buff->end+1)
+ {
+ bit_buff->error=1;
+ return; /* Can't be right */
+ }
+#if BITS_SAVED == 32
+ bit_buff->current_byte= (bit_buff->current_byte << 24) +
+ (((uint) ((uchar) bit_buff->pos[2]))) +
+ (((uint) ((uchar) bit_buff->pos[1])) << 8) +
+ (((uint) ((uchar) bit_buff->pos[0])) << 16);
+ bit_buff->pos+=3;
+ bits+=24;
+#else
+ if (bits) /* We must have at leasts 9 bits */
+ {
+ bit_buff->current_byte= (bit_buff->current_byte << 8) +
+ (uint) ((uchar) bit_buff->pos[0]);
+ bit_buff->pos++;
+ bits+=8;
+ }
+ else
+ {
+ bit_buff->current_byte= ((uint) ((uchar) bit_buff->pos[0]) << 8) +
+ ((uint) ((uchar) bit_buff->pos[1]));
+ bit_buff->pos+=2;
+ bits+=16;
+ }
+#endif
+ }
+ /* First use info in quick_table */
+ low_byte=(bit_buff->current_byte >> (bits - table_bits)) & table_and;
+ low_byte=decode_tree->table[low_byte];
+ if (low_byte & IS_CHAR)
+ {
+ *to++ = (low_byte & 255); /* Found char in quick table */
+ bits-= ((low_byte >> 8) & 31); /* Remove bits used */
+ }
+ else
+ { /* Map through rest of decode-table */
+ pos=decode_tree->table+low_byte;
+ bits-=table_bits;
+ for (;;)
+ {
+ if (bits < 8)
+ { /* We don't need to check end */
+#if BITS_SAVED == 32
+ bit_buff->current_byte= (bit_buff->current_byte << 24) +
+ (((uint) ((uchar) bit_buff->pos[2]))) +
+ (((uint) ((uchar) bit_buff->pos[1])) << 8) +
+ (((uint) ((uchar) bit_buff->pos[0])) << 16);
+ bit_buff->pos+=3;
+ bits+=24;
+#else
+ bit_buff->current_byte= (bit_buff->current_byte << 8) +
+ (uint) ((uchar) bit_buff->pos[0]);
+ bit_buff->pos+=1;
+ bits+=8;
+#endif
+ }
+ low_byte=(uint) (bit_buff->current_byte >> (bits-8));
+ decode_bytes_test_bit(0);
+ decode_bytes_test_bit(1);
+ decode_bytes_test_bit(2);
+ decode_bytes_test_bit(3);
+ decode_bytes_test_bit(4);
+ decode_bytes_test_bit(5);
+ decode_bytes_test_bit(6);
+ decode_bytes_test_bit(7);
+ bits-=8;
+ }
+ *to++ = (char) *pos;
+ }
+ } while (to != end);
+
+ bit_buff->bits=bits;
+ return;
+}
+#endif /* BIT_SAVED == 64 */
+
+
+static uint decode_pos(MARIA_BIT_BUFF *bit_buff,
+ MARIA_DECODE_TREE *decode_tree)
+{
+ uint16 *pos=decode_tree->table;
+ for (;;)
+ {
+ if (get_bit(bit_buff))
+ pos++;
+ if (*pos & IS_CHAR)
+ return (uint) (*pos & ~IS_CHAR);
+ pos+= *pos;
+ }
+}
+
+
+int _ma_read_rnd_pack_record(MARIA_HA *info,
+ uchar *buf,
+ register MARIA_RECORD_POS filepos,
+ my_bool skip_deleted_blocks)
+{
+ File file;
+ MARIA_BLOCK_INFO block_info;
+ MARIA_SHARE *share=info->s;
+ DBUG_ENTER("_ma_read_rnd_pack_record");
+
+ if (filepos >= info->state->data_file_length)
+ {
+ my_errno= HA_ERR_END_OF_FILE;
+ goto err;
+ }
+
+ file= info->dfile.file;
+ if (info->opt_flag & READ_CACHE_USED)
+ {
+ if (_ma_read_cache(&info->rec_cache, (uchar*) block_info.header,
+ filepos, share->pack.ref_length,
+ skip_deleted_blocks ? READING_NEXT : 0))
+ goto err;
+ file= -1;
+ }
+ if (_ma_pack_get_block_info(info, &info->bit_buff, &block_info,
+ &info->rec_buff, &info->rec_buff_size,
+ file, filepos))
+ goto err; /* Error code is already set */
+#ifndef DBUG_OFF
+ if (block_info.rec_len > share->max_pack_length)
+ {
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+ goto err;
+ }
+#endif
+
+ if (info->opt_flag & READ_CACHE_USED)
+ {
+ if (_ma_read_cache(&info->rec_cache, (uchar*) info->rec_buff,
+ block_info.filepos, block_info.rec_len,
+ skip_deleted_blocks ? READING_NEXT : 0))
+ goto err;
+ }
+ else
+ {
+ if (my_read(info->dfile.file, (uchar*)info->rec_buff + block_info.offset,
+ block_info.rec_len-block_info.offset,
+ MYF(MY_NABP)))
+ goto err;
+ }
+ info->packed_length= block_info.rec_len;
+ info->cur_row.lastpos= filepos;
+ info->cur_row.nextpos= block_info.filepos+block_info.rec_len;
+ info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED;
+
+ DBUG_RETURN (_ma_pack_rec_unpack(info, &info->bit_buff, buf,
+ info->rec_buff, block_info.rec_len));
+ err:
+ DBUG_RETURN(my_errno);
+}
+
+
+ /* Read and process header from a huff-record-file */
+
+uint _ma_pack_get_block_info(MARIA_HA *maria, MARIA_BIT_BUFF *bit_buff,
+ MARIA_BLOCK_INFO *info,
+ uchar **rec_buff_p, size_t *rec_buff_size_p,
+ File file, my_off_t filepos)
+{
+ uchar *header= info->header;
+ uint head_length,ref_length;
+ LINT_INIT(ref_length);
+
+ if (file >= 0)
+ {
+ ref_length=maria->s->pack.ref_length;
+ /*
+ We can't use my_pread() here because _ma_read_rnd_pack_record assumes
+ position is ok
+ */
+ VOID(my_seek(file,filepos,MY_SEEK_SET,MYF(0)));
+ if (my_read(file,(char*) header,ref_length,MYF(MY_NABP)))
+ return BLOCK_FATAL_ERROR;
+ DBUG_DUMP("header",(uchar*) header,ref_length);
+ }
+ head_length= read_pack_length((uint) maria->s->pack.version, header,
+ &info->rec_len);
+ if (maria->s->base.blobs)
+ {
+ head_length+= read_pack_length((uint) maria->s->pack.version,
+ header + head_length, &info->blob_len);
+ /*
+ Ensure that the record buffer is big enough for the compressed
+ record plus all expanded blobs. [We do not have an extra buffer
+ for the resulting blobs. Sigh.]
+ */
+ if (_ma_alloc_buffer(rec_buff_p, rec_buff_size_p,
+ info->rec_len + info->blob_len +
+ maria->s->base.extra_rec_buff_size))
+ return BLOCK_FATAL_ERROR; /* not enough memory */
+ bit_buff->blob_pos= (uchar*) *rec_buff_p + info->rec_len;
+ bit_buff->blob_end= bit_buff->blob_pos + info->blob_len;
+ maria->blob_length=info->blob_len;
+ }
+ info->filepos=filepos+head_length;
+ if (file > 0)
+ {
+ info->offset=min(info->rec_len, ref_length - head_length);
+ memcpy(*rec_buff_p, header + head_length, info->offset);
+ }
+ return 0;
+}
+
+
+ /* rutines for bit buffer */
+ /* Note buffer must be 6 uchar bigger than longest row */
+
+static void init_bit_buffer(MARIA_BIT_BUFF *bit_buff, uchar *buffer,
+ uint length)
+{
+ bit_buff->pos=buffer;
+ bit_buff->end=buffer+length;
+ bit_buff->bits=bit_buff->error=0;
+ bit_buff->current_byte=0; /* Avoid purify errors */
+}
+
+static uint fill_and_get_bits(MARIA_BIT_BUFF *bit_buff, uint count)
+{
+ uint tmp;
+ count-=bit_buff->bits;
+ tmp=(bit_buff->current_byte & mask[bit_buff->bits]) << count;
+ fill_buffer(bit_buff);
+ bit_buff->bits=BITS_SAVED - count;
+ return tmp+(bit_buff->current_byte >> (BITS_SAVED - count));
+}
+
+ /* Fill in empty bit_buff->current_byte from buffer */
+ /* Sets bit_buff->error if buffer is exhausted */
+
+static void fill_buffer(MARIA_BIT_BUFF *bit_buff)
+{
+ if (bit_buff->pos >= bit_buff->end)
+ {
+ bit_buff->error= 1;
+ bit_buff->current_byte=0;
+ return;
+ }
+#if BITS_SAVED == 64
+ bit_buff->current_byte= ((((uint) ((uchar) bit_buff->pos[7]))) +
+ (((uint) ((uchar) bit_buff->pos[6])) << 8) +
+ (((uint) ((uchar) bit_buff->pos[5])) << 16) +
+ (((uint) ((uchar) bit_buff->pos[4])) << 24) +
+ ((ulonglong)
+ ((((uint) ((uchar) bit_buff->pos[3]))) +
+ (((uint) ((uchar) bit_buff->pos[2])) << 8) +
+ (((uint) ((uchar) bit_buff->pos[1])) << 16) +
+ (((uint) ((uchar) bit_buff->pos[0])) << 24)) << 32));
+ bit_buff->pos+=8;
+#else
+#if BITS_SAVED == 32
+ bit_buff->current_byte= (((uint) ((uchar) bit_buff->pos[3])) +
+ (((uint) ((uchar) bit_buff->pos[2])) << 8) +
+ (((uint) ((uchar) bit_buff->pos[1])) << 16) +
+ (((uint) ((uchar) bit_buff->pos[0])) << 24));
+ bit_buff->pos+=4;
+#else
+ bit_buff->current_byte= (uint) (((uint) ((uchar) bit_buff->pos[1]))+
+ (((uint) ((uchar) bit_buff->pos[0])) << 8));
+ bit_buff->pos+=2;
+#endif
+#endif
+}
+
+ /* Get number of bits neaded to represent value */
+
+static uint max_bit(register uint value)
+{
+ reg2 uint power=1;
+
+ while ((value>>=1))
+ power++;
+ return (power);
+}
+
+
+/*****************************************************************************
+ Some redefined functions to handle files when we are using memmap
+*****************************************************************************/
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+#ifdef HAVE_MMAP
+
+static int _ma_read_mempack_record(MARIA_HA *info, uchar *buf,
+ MARIA_RECORD_POS filepos);
+static int _ma_read_rnd_mempack_record(MARIA_HA*, uchar *, MARIA_RECORD_POS,
+ my_bool);
+
+my_bool _ma_memmap_file(MARIA_HA *info)
+{
+ MARIA_SHARE *share=info->s;
+ DBUG_ENTER("maria_memmap_file");
+
+ if (!info->s->file_map)
+ {
+ if (my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)) <
+ share->state.state.data_file_length+MEMMAP_EXTRA_MARGIN)
+ {
+ DBUG_PRINT("warning",("File isn't extended for memmap"));
+ DBUG_RETURN(0);
+ }
+ if (_ma_dynmap_file(info, share->state.state.data_file_length))
+ DBUG_RETURN(0);
+ }
+ info->opt_flag|= MEMMAP_USED;
+ info->read_record= share->read_record= _ma_read_mempack_record;
+ share->scan= _ma_read_rnd_mempack_record;
+ DBUG_RETURN(1);
+}
+
+
+void _ma_unmap_file(MARIA_HA *info)
+{
+ VOID(my_munmap(info->s->file_map,
+ (size_t) info->s->mmaped_length + MEMMAP_EXTRA_MARGIN));
+}
+
+
+static uchar *
+_ma_mempack_get_block_info(MARIA_HA *maria,
+ MARIA_BIT_BUFF *bit_buff,
+ MARIA_BLOCK_INFO *info,
+ uchar **rec_buff_p,
+ size_t *rec_buff_size_p,
+ uchar *header)
+{
+ header+= read_pack_length((uint) maria->s->pack.version, header,
+ &info->rec_len);
+ if (maria->s->base.blobs)
+ {
+ header+= read_pack_length((uint) maria->s->pack.version, header,
+ &info->blob_len);
+ /* _ma_alloc_rec_buff sets my_errno on error */
+ if (_ma_alloc_buffer(rec_buff_p, rec_buff_size_p,
+ info->blob_len + maria->s->base.extra_rec_buff_size))
+ return 0; /* not enough memory */
+ bit_buff->blob_pos= (uchar*) *rec_buff_p;
+ bit_buff->blob_end= (uchar*) *rec_buff_p + info->blob_len;
+ }
+ return header;
+}
+
+
+static int _ma_read_mempack_record(MARIA_HA *info, uchar *buf,
+ MARIA_RECORD_POS filepos)
+{
+ MARIA_BLOCK_INFO block_info;
+ MARIA_SHARE *share=info->s;
+ uchar *pos;
+ DBUG_ENTER("maria_read_mempack_record");
+
+ if (filepos == HA_OFFSET_ERROR)
+ DBUG_RETURN(my_errno); /* _search() didn't find record */
+
+ if (!(pos= (uchar*) _ma_mempack_get_block_info(info, &info->bit_buff,
+ &block_info, &info->rec_buff,
+ &info->rec_buff_size,
+ (uchar*) share->file_map+
+ filepos)))
+ DBUG_RETURN(my_errno);
+ DBUG_RETURN(_ma_pack_rec_unpack(info, &info->bit_buff, buf,
+ pos, block_info.rec_len));
+}
+
+
+/*ARGSUSED*/
+static int _ma_read_rnd_mempack_record(MARIA_HA *info,
+ uchar *buf,
+ register MARIA_RECORD_POS filepos,
+ my_bool skip_deleted_blocks
+ __attribute__((unused)))
+{
+ MARIA_BLOCK_INFO block_info;
+ MARIA_SHARE *share=info->s;
+ uchar *pos,*start;
+ DBUG_ENTER("_ma_read_rnd_mempack_record");
+
+ if (filepos >= share->state.state.data_file_length)
+ {
+ my_errno=HA_ERR_END_OF_FILE;
+ goto err;
+ }
+ if (!(pos= (uchar*) _ma_mempack_get_block_info(info, &info->bit_buff,
+ &block_info,
+ &info->rec_buff,
+ &info->rec_buff_size,
+ (uchar*)
+ (start= share->file_map +
+ filepos))))
+ goto err;
+#ifndef DBUG_OFF
+ if (block_info.rec_len > info->s->max_pack_length)
+ {
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+ goto err;
+ }
+#endif
+ info->packed_length=block_info.rec_len;
+ info->cur_row.lastpos= filepos;
+ info->cur_row.nextpos= filepos+(uint) (pos-start)+block_info.rec_len;
+ info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED;
+
+ DBUG_RETURN (_ma_pack_rec_unpack(info, &info->bit_buff, buf,
+ pos, block_info.rec_len));
+ err:
+ DBUG_RETURN(my_errno);
+}
+
+#endif /* HAVE_MMAP */
+
+ /* Save length of row */
+
+uint _ma_save_pack_length(uint version, uchar *block_buff, ulong length)
+{
+ if (length < 254)
+ {
+ *(uchar*) block_buff= (uchar) length;
+ return 1;
+ }
+ if (length <= 65535)
+ {
+ *(uchar*) block_buff=254;
+ int2store(block_buff+1,(uint) length);
+ return 3;
+ }
+ *(uchar*) block_buff=255;
+ if (version == 1) /* old format */
+ {
+ DBUG_ASSERT(length <= 0xFFFFFF);
+ int3store(block_buff + 1, (ulong) length);
+ return 4;
+ }
+ else
+ {
+ int4store(block_buff + 1, (ulong) length);
+ return 5;
+ }
+}
+
+
+static uint read_pack_length(uint version, const uchar *buf, ulong *length)
+{
+ if (buf[0] < 254)
+ {
+ *length= buf[0];
+ return 1;
+ }
+ else if (buf[0] == 254)
+ {
+ *length= uint2korr(buf + 1);
+ return 3;
+ }
+ if (version == 1) /* old format */
+ {
+ *length= uint3korr(buf + 1);
+ return 4;
+ }
+ else
+ {
+ *length= uint4korr(buf + 1);
+ return 5;
+ }
+}
+
+
+uint _ma_calc_pack_length(uint version, ulong length)
+{
+ return (length < 254) ? 1 : (length < 65536) ? 3 : (version == 1) ? 4 : 5;
+}
diff --git a/storage/maria/ma_page.c b/storage/maria/ma_page.c
new file mode 100644
index 00000000000..f749414474f
--- /dev/null
+++ b/storage/maria/ma_page.c
@@ -0,0 +1,188 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Read and write key blocks */
+
+#include "maria_def.h"
+
+ /* Fetch a key-page in memory */
+
+uchar *_ma_fetch_keypage(register MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ my_off_t page, int level,
+ uchar *buff,
+ int return_buffer __attribute__ ((unused)))
+{
+ uchar *tmp;
+ uint page_size;
+ DBUG_ENTER("_ma_fetch_keypage");
+ DBUG_PRINT("enter",("page: %ld", (long) page));
+
+ DBUG_ASSERT(info->s->pagecache->block_size == keyinfo->block_length);
+ /*
+ TODO: replace PAGECACHE_PLAIN_PAGE with PAGECACHE_LSN_PAGE when
+ LSN on the pages will be implemented
+ */
+ tmp= pagecache_read(info->s->pagecache, &info->s->kfile,
+ page / keyinfo->block_length, level, buff,
+ PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_LEFT_UNLOCKED, 0);
+ if (tmp == info->buff)
+ info->keyread_buff_used=1;
+ else if (!tmp)
+ {
+ DBUG_PRINT("error",("Got errno: %d from pagecache_read",my_errno));
+ info->last_keypage=HA_OFFSET_ERROR;
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ DBUG_RETURN(0);
+ }
+ info->last_keypage=page;
+ page_size= maria_data_on_page(tmp);
+ if (page_size < 4 || page_size > keyinfo->block_length)
+ {
+ DBUG_PRINT("error",("page %lu had wrong page length: %u",
+ (ulong) page, page_size));
+ DBUG_DUMP("page", (char*) tmp, keyinfo->block_length);
+ info->last_keypage = HA_OFFSET_ERROR;
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ my_errno= HA_ERR_CRASHED;
+ tmp= 0;
+ }
+ DBUG_RETURN(tmp);
+} /* _ma_fetch_keypage */
+
+
+ /* Write a key-page on disk */
+
+int _ma_write_keypage(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo,
+ my_off_t page, int level, uchar *buff)
+{
+ DBUG_ENTER("_ma_write_keypage");
+
+#ifdef EXTRA_DEBUG /* Safety check */
+ if (page < info->s->base.keystart ||
+ page+keyinfo->block_length > info->state->key_file_length ||
+ (page & (MARIA_MIN_KEY_BLOCK_LENGTH-1)))
+ {
+ DBUG_PRINT("error",("Trying to write inside key status region: "
+ "key_start: %lu length: %lu page: %lu",
+ (long) info->s->base.keystart,
+ (long) info->state->key_file_length,
+ (long) page));
+ my_errno=EINVAL;
+ DBUG_RETURN((-1));
+ }
+ DBUG_PRINT("page",("write page at: %lu",(long) page));
+ DBUG_DUMP("buff",(uchar*) buff,maria_data_on_page(buff));
+#endif
+
+#ifdef HAVE_purify
+ {
+ /* Clear unitialized part of page to avoid valgrind/purify warnings */
+ uint length= maria_data_on_page(buff);
+ bzero((uchar*) buff+length,keyinfo->block_length-length);
+ length=keyinfo->block_length;
+ }
+#endif
+
+ DBUG_ASSERT(info->s->pagecache->block_size == keyinfo->block_length);
+ /*
+ TODO: replace PAGECACHE_PLAIN_PAGE with PAGECACHE_LSN_PAGE when
+ LSN on the pages will be implemented
+ */
+ DBUG_RETURN(pagecache_write(info->s->pagecache,
+ &info->s->kfile, page / keyinfo->block_length,
+ level, buff, PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY, 0));
+} /* maria_write_keypage */
+
+
+ /* Remove page from disk */
+
+int _ma_dispose(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, my_off_t pos,
+ int level)
+{
+ my_off_t old_link;
+ char buff[8];
+ uint offset;
+ pgcache_page_no_t page_no;
+ DBUG_ENTER("_ma_dispose");
+ DBUG_PRINT("enter",("pos: %ld", (long) pos));
+
+ old_link= info->s->state.key_del;
+ info->s->state.key_del= pos;
+ page_no= pos / keyinfo->block_length;
+ offset= pos % keyinfo->block_length;
+ mi_sizestore(buff,old_link);
+ info->s->state.changed|= STATE_NOT_SORTED_PAGES;
+
+ DBUG_ASSERT(info->s->pagecache->block_size == keyinfo->block_length &&
+ info->s->pagecache->block_size == info->s->block_size);
+ /*
+ TODO: replace PAGECACHE_PLAIN_PAGE with PAGECACHE_LSN_PAGE when
+ LSN on the pages will be implemented
+ */
+ DBUG_RETURN(pagecache_write_part(info->s->pagecache,
+ &info->s->kfile, page_no, level, buff,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY, 0,
+ offset, sizeof(buff), 0, 0));
+} /* _ma_dispose */
+
+
+ /* Make new page on disk */
+
+my_off_t _ma_new(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, int level)
+{
+ my_off_t pos;
+ uchar *buff;
+ DBUG_ENTER("_ma_new");
+
+ if ((pos= info->s->state.key_del) == HA_OFFSET_ERROR)
+ {
+ if (info->state->key_file_length >=
+ info->s->base.max_key_file_length - keyinfo->block_length)
+ {
+ my_errno=HA_ERR_INDEX_FILE_FULL;
+ DBUG_RETURN(HA_OFFSET_ERROR);
+ }
+ pos=info->state->key_file_length;
+ info->state->key_file_length+= keyinfo->block_length;
+ }
+ else
+ {
+ buff= alloca(info->s->block_size);
+ DBUG_ASSERT(info->s->pagecache->block_size == keyinfo->block_length &&
+ info->s->pagecache->block_size == info->s->block_size);
+ /*
+ TODO: replace PAGECACHE_PLAIN_PAGE with PAGECACHE_LSN_PAGE when
+ LSN on the pages will be implemented
+ */
+ DBUG_ASSERT(info->s->pagecache->block_size == keyinfo->block_length);
+ if (!pagecache_read(info->s->pagecache,
+ &info->s->kfile, pos / keyinfo->block_length, level,
+ buff, PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, 0))
+ pos= HA_OFFSET_ERROR;
+ else
+ info->s->state.key_del= mi_sizekorr(buff);
+ }
+ info->s->state.changed|= STATE_NOT_SORTED_PAGES;
+ DBUG_PRINT("exit",("Pos: %ld",(long) pos));
+ DBUG_RETURN(pos);
+} /* _ma_new */
diff --git a/storage/maria/ma_pagecache.c b/storage/maria/ma_pagecache.c
new file mode 100755
index 00000000000..9f450d25c50
--- /dev/null
+++ b/storage/maria/ma_pagecache.c
@@ -0,0 +1,4197 @@
+/* Copyright (C) 2000-2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ These functions handle page cacheing for Maria tables.
+
+ One cache can handle many files.
+ It must contain buffers of the same blocksize.
+ init_pagecache() should be used to init cache handler.
+
+ The free list (free_block_list) is a stack like structure.
+ When a block is freed by free_block(), it is pushed onto the stack.
+ When a new block is required it is first tried to pop one from the stack.
+ If the stack is empty, it is tried to get a never-used block from the pool.
+ If this is empty too, then a block is taken from the LRU ring, flushing it
+ to disk, if necessary. This is handled in find_block().
+ With the new free list, the blocks can have three temperatures:
+ hot, warm and cold (which is free). This is remembered in the block header
+ by the enum PCBLOCK_TEMPERATURE temperature variable. Remembering the
+ temperature is necessary to correctly count the number of warm blocks,
+ which is required to decide when blocks are allowed to become hot. Whenever
+ a block is inserted to another (sub-)chain, we take the old and new
+ temperature into account to decide if we got one more or less warm block.
+ blocks_unused is the sum of never used blocks in the pool and of currently
+ free blocks. blocks_used is the number of blocks fetched from the pool and
+ as such gives the maximum number of in-use blocks at any time.
+*/
+
+#include "maria_def.h"
+#include <m_string.h>
+#include "ma_pagecache.h"
+#include <my_bit.h>
+#include <errno.h>
+#include <stdarg.h>
+
+/*
+ Some compilation flags have been added specifically for this module
+ to control the following:
+ - not to let a thread to yield the control when reading directly
+ from page cache, which might improve performance in many cases;
+ to enable this add:
+ #define SERIALIZED_READ_FROM_CACHE
+ - to set an upper bound for number of threads simultaneously
+ using the page cache; this setting helps to determine an optimal
+ size for hash table and improve performance when the number of
+ blocks in the page cache much less than the number of threads
+ accessing it;
+ to set this number equal to <N> add
+ #define MAX_THREADS <N>
+ - to substitute calls of pthread_cond_wait for calls of
+ pthread_cond_timedwait (wait with timeout set up);
+ this setting should be used only when you want to trap a deadlock
+ situation, which theoretically should not happen;
+ to set timeout equal to <T> seconds add
+ #define PAGECACHE_TIMEOUT <T>
+ - to enable the module traps and to send debug information from
+ page cache module to a special debug log add:
+ #define PAGECACHE_DEBUG
+ the name of this debug log file <LOG NAME> can be set through:
+ #define PAGECACHE_DEBUG_LOG <LOG NAME>
+ if the name is not defined, it's set by default;
+ if the PAGECACHE_DEBUG flag is not set up and we are in a debug
+ mode, i.e. when ! defined(DBUG_OFF), the debug information from the
+ module is sent to the regular debug log.
+
+ Example of the settings:
+ #define SERIALIZED_READ_FROM_CACHE
+ #define MAX_THREADS 100
+ #define PAGECACHE_TIMEOUT 1
+ #define PAGECACHE_DEBUG
+ #define PAGECACHE_DEBUG_LOG "my_pagecache_debug.log"
+*/
+
+/*
+ In key cache we have external raw locking here we use
+ SERIALIZED_READ_FROM_CACHE to avoid problem of reading
+ not consistent data from the page.
+ (keycache functions (key_cache_read(), key_cache_insert() and
+ key_cache_write()) rely on external MyISAM lock, we don't)
+*/
+#define SERIALIZED_READ_FROM_CACHE yes
+
+#define PCBLOCK_INFO(B) \
+ DBUG_PRINT("info", \
+ ("block: 0x%lx file: %lu page: %lu s: %0x hshL: 0x%lx req: %u/%u " \
+ "wrlocks: %u", \
+ (ulong)(B), \
+ (ulong)((B)->hash_link ? \
+ (B)->hash_link->file.file : \
+ 0), \
+ (ulong)((B)->hash_link ? \
+ (B)->hash_link->pageno : \
+ 0), \
+ (B)->status, \
+ (ulong)(B)->hash_link, \
+ (uint) (B)->requests, \
+ (uint)((B)->hash_link ? \
+ (B)->hash_link->requests : \
+ 0), \
+ block->wlocks))
+
+/* TODO: put it to my_static.c */
+my_bool my_disable_flush_pagecache_blocks= 0;
+/**
+ when flushing pages of a file, it can happen that we take some dirty blocks
+ out of changed_blocks[]; Checkpoint must not run at this moment.
+*/
+uint changed_blocks_is_incomplete= 0;
+
+#define STRUCT_PTR(TYPE, MEMBER, a) \
+ (TYPE *) ((char *) (a) - offsetof(TYPE, MEMBER))
+
+/* types of condition variables */
+#define COND_FOR_REQUESTED 0 /* queue of thread waiting for read operation */
+#define COND_FOR_SAVED 1 /* queue of thread waiting for flush */
+#define COND_FOR_WRLOCK 2 /* queue of write lock */
+#define COND_SIZE 3 /* number of COND_* queues */
+
+/* offset of LSN on the page */
+#define PAGE_LSN_OFFSET 0
+
+typedef pthread_cond_t KEYCACHE_CONDVAR;
+
+/* descriptor of the page in the page cache block buffer */
+struct st_pagecache_page
+{
+ PAGECACHE_FILE file; /* file to which the page belongs to */
+ pgcache_page_no_t pageno; /* number of the page in the file */
+};
+
+/* element in the chain of a hash table bucket */
+struct st_pagecache_hash_link
+{
+ struct st_pagecache_hash_link
+ *next, **prev; /* to connect links in the same bucket */
+ struct st_pagecache_block_link
+ *block; /* reference to the block for the page: */
+ PAGECACHE_FILE file; /* from such a file */
+ pgcache_page_no_t pageno; /* this page */
+ uint requests; /* number of requests for the page */
+};
+
+/* simple states of a block */
+#define PCBLOCK_ERROR 1 /* an error occurred when performing disk i/o */
+#define PCBLOCK_READ 2 /* the is page in the block buffer */
+#define PCBLOCK_IN_SWITCH 4 /* block is preparing to read new page */
+#define PCBLOCK_REASSIGNED 8 /* block does not accept requests for old page */
+#define PCBLOCK_IN_FLUSH 16 /* block is in flush operation */
+#define PCBLOCK_CHANGED 32 /* block buffer contains a dirty page */
+
+/* page status, returned by find_block */
+#define PAGE_READ 0
+#define PAGE_TO_BE_READ 1
+#define PAGE_WAIT_TO_BE_READ 2
+
+/* block temperature determines in which (sub-)chain the block currently is */
+enum PCBLOCK_TEMPERATURE { PCBLOCK_COLD /*free*/ , PCBLOCK_WARM , PCBLOCK_HOT };
+
+/* debug info */
+#ifndef DBUG_OFF
+static const char *page_cache_page_type_str[]=
+{
+ /* used only for control page type changing during debugging */
+ "EMPTY",
+ "PLAIN",
+ "LSN",
+ "READ_UNKNOWN"
+};
+
+static const char *page_cache_page_write_mode_str[]=
+{
+ "DELAY",
+ "NOW",
+ "DONE"
+};
+
+static const char *page_cache_page_lock_str[]=
+{
+ "free -> free",
+ "read -> read",
+ "write -> write",
+ "free -> read",
+ "free -> write",
+ "read -> free",
+ "write -> free",
+ "write -> read"
+};
+
+static const char *page_cache_page_pin_str[]=
+{
+ "pinned -> pinned",
+ "unpinned -> unpinned",
+ "unpinned -> pinned",
+ "pinned -> unpinned"
+};
+
+
+typedef struct st_pagecache_pin_info
+{
+ struct st_pagecache_pin_info *next, **prev;
+ struct st_my_thread_var *thread;
+} PAGECACHE_PIN_INFO;
+
+/*
+ st_pagecache_lock_info structure should be kept in next, prev, thread part
+ compatible with st_pagecache_pin_info to be compatible in functions.
+*/
+
+typedef struct st_pagecache_lock_info
+{
+ struct st_pagecache_lock_info *next, **prev;
+ struct st_my_thread_var *thread;
+ my_bool write_lock;
+} PAGECACHE_LOCK_INFO;
+
+
+/* service functions maintain debugging info about pin & lock */
+
+
+/*
+ Links information about thread pinned/locked the block to the list
+
+ SYNOPSIS
+ info_link()
+ list the list to link in
+ node the node which should be linked
+*/
+
+static void info_link(PAGECACHE_PIN_INFO **list, PAGECACHE_PIN_INFO *node)
+{
+ if ((node->next= *list))
+ node->next->prev= &(node->next);
+ *list= node;
+ node->prev= list;
+}
+
+
+/*
+ Unlinks information about thread pinned/locked the block from the list
+
+ SYNOPSIS
+ info_unlink()
+ node the node which should be unlinked
+*/
+
+static void info_unlink(PAGECACHE_PIN_INFO *node)
+{
+ if ((*node->prev= node->next))
+ node->next->prev= node->prev;
+}
+
+
+/*
+ Finds information about given thread in the list of threads which
+ pinned/locked this block.
+
+ SYNOPSIS
+ info_find()
+ list the list where to find the thread
+ thread thread ID (reference to the st_my_thread_var
+ of the thread)
+
+ RETURN
+ 0 - the thread was not found
+ pointer to the information node of the thread in the list
+*/
+
+static PAGECACHE_PIN_INFO *info_find(PAGECACHE_PIN_INFO *list,
+ struct st_my_thread_var *thread)
+{
+ register PAGECACHE_PIN_INFO *i= list;
+ for(; i != 0; i= i->next)
+ if (i->thread == thread)
+ return i;
+ return 0;
+}
+
+#endif /* !DBUG_OFF */
+
+/* page cache block */
+struct st_pagecache_block_link
+{
+ struct st_pagecache_block_link
+ *next_used, **prev_used; /* to connect links in the LRU chain (ring) */
+ struct st_pagecache_block_link
+ *next_changed, **prev_changed; /* for lists of file dirty/clean blocks */
+ struct st_pagecache_hash_link
+ *hash_link; /* backward ptr to referring hash_link */
+#ifndef DBUG_OFF
+ PAGECACHE_PIN_INFO *pin_list;
+ PAGECACHE_LOCK_INFO *lock_list;
+#endif
+ KEYCACHE_CONDVAR *condvar; /* condition variable for 'no readers' event */
+ uchar *buffer; /* buffer for the block page */
+ PAGECACHE_FILE *write_locker;
+ ulonglong last_hit_time; /* timestamp of the last hit */
+ WQUEUE
+ wqueue[COND_SIZE]; /* queues on waiting requests for new/old pages */
+ uint requests; /* number of requests for the block */
+ uint status; /* state of the block */
+ uint pins; /* pin counter */
+ uint wlocks; /* write locks counter */
+ enum PCBLOCK_TEMPERATURE temperature; /* block temperature: cold, warm, hot */
+ enum pagecache_page_type type; /* type of the block */
+ uint hits_left; /* number of hits left until promotion */
+ /** @brief LSN when first became dirty; LSN_MAX means "not yet set" */
+ LSN rec_lsn;
+};
+
+#ifndef DBUG_OFF
+/* debug checks */
+
+#ifdef NOT_USED
+static my_bool info_check_pin(PAGECACHE_BLOCK_LINK *block,
+ enum pagecache_page_pin mode
+ __attribute__((unused)))
+{
+ struct st_my_thread_var *thread= my_thread_var;
+ PAGECACHE_PIN_INFO *info= info_find(block->pin_list, thread);
+ DBUG_ENTER("info_check_pin");
+ DBUG_PRINT("enter", ("thread: 0x%lx pin: %s",
+ (ulong) thread, page_cache_page_pin_str[mode]));
+ if (info)
+ {
+ if (mode == PAGECACHE_PIN_LEFT_UNPINNED)
+ {
+ DBUG_PRINT("info",
+ ("info_check_pin: thread: 0x%lx block: 0x%lx ; LEFT_UNPINNED!!!",
+ (ulong)thread, (ulong)block));
+ DBUG_RETURN(1);
+ }
+ else if (mode == PAGECACHE_PIN)
+ {
+ DBUG_PRINT("info",
+ ("info_check_pin: thread: 0x%lx block: 0x%lx ; PIN!!!",
+ (ulong)thread, (ulong)block));
+ DBUG_RETURN(1);
+ }
+ }
+ else
+ {
+ if (mode == PAGECACHE_PIN_LEFT_PINNED)
+ {
+ DBUG_PRINT("info",
+ ("info_check_pin: thread: 0x%lx block: 0x%lx ; LEFT_PINNED!!!",
+ (ulong)thread, (ulong)block));
+ DBUG_RETURN(1);
+ }
+ else if (mode == PAGECACHE_UNPIN)
+ {
+ DBUG_PRINT("info",
+ ("info_check_pin: thread: 0x%lx block: 0x%lx ; UNPIN!!!",
+ (ulong)thread, (ulong)block));
+ DBUG_RETURN(1);
+ }
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Debug function which checks current lock/pin state and requested changes
+
+ SYNOPSIS
+ info_check_lock()
+ lock requested lock changes
+ pin requested pin changes
+
+ RETURN
+ 0 - OK
+ 1 - Error
+*/
+
+static my_bool info_check_lock(PAGECACHE_BLOCK_LINK *block,
+ enum pagecache_page_lock lock,
+ enum pagecache_page_pin pin)
+{
+ struct st_my_thread_var *thread= my_thread_var;
+ PAGECACHE_LOCK_INFO *info=
+ (PAGECACHE_LOCK_INFO *) info_find((PAGECACHE_PIN_INFO *) block->lock_list,
+ thread);
+ DBUG_ENTER("info_check_lock");
+ switch(lock)
+ {
+ case PAGECACHE_LOCK_LEFT_UNLOCKED:
+ if (pin != PAGECACHE_PIN_LEFT_UNPINNED ||
+ info)
+ goto error;
+ break;
+ case PAGECACHE_LOCK_LEFT_READLOCKED:
+ if ((pin != PAGECACHE_PIN_LEFT_UNPINNED &&
+ pin != PAGECACHE_PIN_LEFT_PINNED) ||
+ info == 0 || info->write_lock)
+ goto error;
+ break;
+ case PAGECACHE_LOCK_LEFT_WRITELOCKED:
+ if (pin != PAGECACHE_PIN_LEFT_PINNED ||
+ info == 0 || !info->write_lock)
+ goto error;
+ break;
+ case PAGECACHE_LOCK_READ:
+ if ((pin != PAGECACHE_PIN_LEFT_UNPINNED &&
+ pin != PAGECACHE_PIN) ||
+ info != 0)
+ goto error;
+ break;
+ case PAGECACHE_LOCK_WRITE:
+ if (pin != PAGECACHE_PIN ||
+ info != 0)
+ goto error;
+ break;
+ case PAGECACHE_LOCK_READ_UNLOCK:
+ if ((pin != PAGECACHE_PIN_LEFT_UNPINNED &&
+ pin != PAGECACHE_UNPIN) ||
+ info == 0 || info->write_lock)
+ goto error;
+ break;
+ case PAGECACHE_LOCK_WRITE_UNLOCK:
+ if (pin != PAGECACHE_UNPIN ||
+ info == 0 || !info->write_lock)
+ goto error;
+ break;
+ case PAGECACHE_LOCK_WRITE_TO_READ:
+ if ((pin != PAGECACHE_PIN_LEFT_PINNED &&
+ pin != PAGECACHE_UNPIN) ||
+ info == 0 || !info->write_lock)
+ goto error;
+ break;
+ }
+ DBUG_RETURN(0);
+error:
+ DBUG_PRINT("info",
+ ("info_check_lock: thread: 0x%lx block 0x%lx: info: %d wrt: %d,"
+ "to lock: %s, to pin: %s",
+ (ulong)thread, (ulong)block, test(info),
+ (info ? info->write_lock : 0),
+ page_cache_page_lock_str[lock],
+ page_cache_page_pin_str[pin]));
+ DBUG_RETURN(1);
+}
+#endif /* NOT_USED */
+#endif /* !DBUG_OFF */
+
+#define FLUSH_CACHE 2000 /* sort this many blocks at once */
+
+static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block);
+static void test_key_cache(PAGECACHE *pagecache,
+ const char *where, my_bool lock);
+
+#define PAGECACHE_HASH(p, f, pos) (((ulong) (pos) + \
+ (ulong) (f).file) & (p->hash_entries-1))
+#define FILE_HASH(f) ((uint) (f).file & (PAGECACHE_CHANGED_BLOCKS_HASH - 1))
+
+#define DEFAULT_PAGECACHE_DEBUG_LOG "pagecache_debug.log"
+
+#if defined(PAGECACHE_DEBUG) && ! defined(PAGECACHE_DEBUG_LOG)
+#define PAGECACHE_DEBUG_LOG DEFAULT_PAGECACHE_DEBUG_LOG
+#endif
+
+#if defined(PAGECACHE_DEBUG_LOG)
+static FILE *pagecache_debug_log= NULL;
+static void pagecache_debug_print _VARARGS((const char *fmt, ...));
+#define PAGECACHE_DEBUG_OPEN \
+ if (!pagecache_debug_log) \
+ { \
+ pagecache_debug_log= fopen(PAGECACHE_DEBUG_LOG, "w"); \
+ (void) setvbuf(pagecache_debug_log, NULL, _IOLBF, BUFSIZ); \
+ }
+
+#define PAGECACHE_DEBUG_CLOSE \
+ if (pagecache_debug_log) \
+ { \
+ fclose(pagecache_debug_log); \
+ pagecache_debug_log= 0; \
+ }
+#else
+#define PAGECACHE_DEBUG_OPEN
+#define PAGECACHE_DEBUG_CLOSE
+#endif /* defined(PAGECACHE_DEBUG_LOG) */
+
+#if defined(PAGECACHE_DEBUG_LOG) && defined(PAGECACHE_DEBUG)
+#define KEYCACHE_DBUG_PRINT(l, m) \
+ { if (pagecache_debug_log) \
+ fprintf(pagecache_debug_log, "%s: ", l); \
+ pagecache_debug_print m; }
+
+#define KEYCACHE_DBUG_ASSERT(a) \
+ { if (! (a) && pagecache_debug_log) \
+ fclose(pagecache_debug_log); \
+ assert(a); }
+#else
+#define KEYCACHE_DBUG_PRINT(l, m) DBUG_PRINT(l, m)
+#define KEYCACHE_DBUG_ASSERT(a) DBUG_ASSERT(a)
+#endif /* defined(PAGECACHE_DEBUG_LOG) && defined(PAGECACHE_DEBUG) */
+
+#if defined(PAGECACHE_DEBUG) || !defined(DBUG_OFF)
+#ifdef THREAD
+static long pagecache_thread_id;
+#define KEYCACHE_THREAD_TRACE(l) \
+ KEYCACHE_DBUG_PRINT(l,("|thread %ld",pagecache_thread_id))
+
+#define KEYCACHE_THREAD_TRACE_BEGIN(l) \
+ { struct st_my_thread_var *thread_var= my_thread_var; \
+ pagecache_thread_id= thread_var->id; \
+ KEYCACHE_DBUG_PRINT(l,("[thread %ld",pagecache_thread_id)) }
+
+#define KEYCACHE_THREAD_TRACE_END(l) \
+ KEYCACHE_DBUG_PRINT(l,("]thread %ld",pagecache_thread_id))
+#else /* THREAD */
+#define KEYCACHE_THREAD_TRACE(l) KEYCACHE_DBUG_PRINT(l,(""))
+#define KEYCACHE_THREAD_TRACE_BEGIN(l) KEYCACHE_DBUG_PRINT(l,(""))
+#define KEYCACHE_THREAD_TRACE_END(l) KEYCACHE_DBUG_PRINT(l,(""))
+#endif /* THREAD */
+#else
+#define KEYCACHE_THREAD_TRACE_BEGIN(l)
+#define KEYCACHE_THREAD_TRACE_END(l)
+#define KEYCACHE_THREAD_TRACE(l)
+#endif /* defined(PAGECACHE_DEBUG) || !defined(DBUG_OFF) */
+
+#define PCBLOCK_NUMBER(p, b) \
+ ((uint) (((char*)(b)-(char *) p->block_root)/sizeof(PAGECACHE_BLOCK_LINK)))
+#define PAGECACHE_HASH_LINK_NUMBER(p, h) \
+ ((uint) (((char*)(h)-(char *) p->hash_link_root)/ \
+ sizeof(PAGECACHE_HASH_LINK)))
+
+#if (defined(PAGECACHE_TIMEOUT) && !defined(__WIN__)) || defined(PAGECACHE_DEBUG)
+static int pagecache_pthread_cond_wait(pthread_cond_t *cond,
+ pthread_mutex_t *mutex);
+#else
+#define pagecache_pthread_cond_wait pthread_cond_wait
+#endif
+
+#if defined(PAGECACHE_DEBUG)
+static int ___pagecache_pthread_mutex_lock(pthread_mutex_t *mutex);
+static void ___pagecache_pthread_mutex_unlock(pthread_mutex_t *mutex);
+static int ___pagecache_pthread_cond_signal(pthread_cond_t *cond);
+#define pagecache_pthread_mutex_lock(M) \
+{ DBUG_PRINT("lock", ("mutex lock 0x%lx %u", (ulong)(M), __LINE__)); \
+ ___pagecache_pthread_mutex_lock(M);}
+#define pagecache_pthread_mutex_unlock(M) \
+{ DBUG_PRINT("lock", ("mutex unlock 0x%lx %u", (ulong)(M), __LINE__)); \
+ ___pagecache_pthread_mutex_unlock(M);}
+#define pagecache_pthread_cond_signal(M) \
+{ DBUG_PRINT("lock", ("signal 0x%lx %u", (ulong)(M), __LINE__)); \
+ ___pagecache_pthread_cond_signal(M);}
+#else
+#define pagecache_pthread_mutex_lock pthread_mutex_lock
+#define pagecache_pthread_mutex_unlock pthread_mutex_unlock
+#define pagecache_pthread_cond_signal pthread_cond_signal
+#endif /* defined(PAGECACHE_DEBUG) */
+
+extern my_bool translog_flush(LSN lsn);
+
+/*
+ Write page to the disk
+
+ SYNOPSIS
+ pagecache_fwrite()
+ pagecache - page cache pointer
+ filedesc - pagecache file descriptor structure
+ buffer - buffer which we will write
+ type - page type (plain or with LSN)
+ flags - MYF() flags
+
+ RETURN
+ 0 - OK
+ !=0 - Error
+*/
+
+static uint pagecache_fwrite(PAGECACHE *pagecache,
+ PAGECACHE_FILE *filedesc,
+ uchar *buffer,
+ pgcache_page_no_t pageno,
+ enum pagecache_page_type type,
+ myf flags)
+{
+ DBUG_ENTER("pagecache_fwrite");
+ DBUG_ASSERT(type != PAGECACHE_READ_UNKNOWN_PAGE);
+ if (type == PAGECACHE_LSN_PAGE)
+ {
+ LSN lsn;
+ DBUG_PRINT("info", ("Log handler call"));
+ /* TODO: integrate with page format */
+ lsn= lsn_korr(buffer + PAGE_LSN_OFFSET);
+ DBUG_ASSERT(LSN_VALID(lsn));
+ translog_flush(lsn);
+ }
+ DBUG_RETURN(my_pwrite(filedesc->file, buffer, pagecache->block_size,
+ (pageno)<<(pagecache->shift), flags));
+}
+
+
+/*
+ Read page from the disk
+
+ SYNOPSIS
+ pagecache_fread()
+ pagecache - page cache pointer
+ filedesc - pagecache file descriptor structure
+ buffer - buffer in which we will read
+ pageno - page number
+ flags - MYF() flags
+*/
+#define pagecache_fread(pagecache, filedesc, buffer, pageno, flags) \
+ my_pread((filedesc)->file, buffer, pagecache->block_size, \
+ (pageno)<<(pagecache->shift), flags)
+
+
+/*
+ next_power(value) is 2 at the power of (1+floor(log2(value)));
+ e.g. next_power(2)=4, next_power(3)=4.
+*/
+static inline uint next_power(uint value)
+{
+ return (uint) my_round_up_to_next_power((uint32) value) << 1;
+}
+
+
+/*
+ Initialize a page cache
+
+ SYNOPSIS
+ init_pagecache()
+ pagecache pointer to a page cache data structure
+ key_cache_block_size size of blocks to keep cached data
+ use_mem total memory to use for the key cache
+ division_limit division limit (may be zero)
+ age_threshold age threshold (may be zero)
+ block_size size of block (should be power of 2)
+
+ RETURN VALUE
+ number of blocks in the key cache, if successful,
+ 0 - otherwise.
+
+ NOTES.
+ if pagecache->inited != 0 we assume that the key cache
+ is already initialized. This is for now used by myisamchk, but shouldn't
+ be something that a program should rely on!
+
+ It's assumed that no two threads call this function simultaneously
+ referring to the same key cache handle.
+
+*/
+
+int init_pagecache(PAGECACHE *pagecache, size_t use_mem,
+ uint division_limit, uint age_threshold,
+ uint block_size)
+{
+ uint blocks, hash_links, length;
+ int error;
+ DBUG_ENTER("init_pagecache");
+ DBUG_ASSERT(block_size >= 512);
+
+ PAGECACHE_DEBUG_OPEN;
+ if (pagecache->inited && pagecache->disk_blocks > 0)
+ {
+ DBUG_PRINT("warning",("key cache already in use"));
+ DBUG_RETURN(0);
+ }
+
+ pagecache->global_cache_w_requests= pagecache->global_cache_r_requests= 0;
+ pagecache->global_cache_read= pagecache->global_cache_write= 0;
+ pagecache->disk_blocks= -1;
+ if (! pagecache->inited)
+ {
+ pagecache->inited= 1;
+ pagecache->in_init= 0;
+ pthread_mutex_init(&pagecache->cache_lock, MY_MUTEX_INIT_FAST);
+ pagecache->resize_queue.last_thread= NULL;
+ }
+
+ pagecache->mem_size= use_mem;
+ pagecache->block_size= block_size;
+ pagecache->shift= my_bit_log2(block_size);
+ DBUG_PRINT("info", ("block_size: %u",
+ block_size));
+ DBUG_ASSERT(((uint)(1 << pagecache->shift)) == block_size);
+
+ blocks= (int) (use_mem / (sizeof(PAGECACHE_BLOCK_LINK) +
+ 2 * sizeof(PAGECACHE_HASH_LINK) +
+ sizeof(PAGECACHE_HASH_LINK*) *
+ 5/4 + block_size));
+ /*
+ We need to support page cache with just one block to be able to do
+ scanning of rows-in-block files
+ */
+ if (blocks >= 1)
+ {
+ for ( ; ; )
+ {
+ /* Set my_hash_entries to the next bigger 2 power */
+ if ((pagecache->hash_entries= next_power(blocks)) <
+ (blocks) * 5/4)
+ pagecache->hash_entries<<= 1;
+ hash_links= 2 * blocks;
+#if defined(MAX_THREADS)
+ if (hash_links < MAX_THREADS + blocks - 1)
+ hash_links= MAX_THREADS + blocks - 1;
+#endif
+ while ((length= (ALIGN_SIZE(blocks * sizeof(PAGECACHE_BLOCK_LINK)) +
+ ALIGN_SIZE(hash_links * sizeof(PAGECACHE_HASH_LINK)) +
+ ALIGN_SIZE(sizeof(PAGECACHE_HASH_LINK*) *
+ pagecache->hash_entries))) +
+ (((ulong) blocks) << pagecache->shift) > use_mem)
+ blocks--;
+ /* Allocate memory for cache page buffers */
+ if ((pagecache->block_mem=
+ my_large_malloc((ulong) blocks * pagecache->block_size,
+ MYF(MY_WME))))
+ {
+ /*
+ Allocate memory for blocks, hash_links and hash entries;
+ For each block 2 hash links are allocated
+ */
+ if ((pagecache->block_root=
+ (PAGECACHE_BLOCK_LINK*) my_malloc((uint) length,
+ MYF(0))))
+ break;
+ my_large_free(pagecache->block_mem, MYF(0));
+ pagecache->block_mem= 0;
+ }
+ if (blocks < 8)
+ {
+ my_errno= ENOMEM;
+ goto err;
+ }
+ blocks= blocks / 4*3;
+ }
+ pagecache->blocks_unused= (ulong) blocks;
+ pagecache->disk_blocks= (int) blocks;
+ pagecache->hash_links= hash_links;
+ pagecache->hash_root=
+ (PAGECACHE_HASH_LINK**) ((char*) pagecache->block_root +
+ ALIGN_SIZE(blocks*sizeof(PAGECACHE_BLOCK_LINK)));
+ pagecache->hash_link_root=
+ (PAGECACHE_HASH_LINK*) ((char*) pagecache->hash_root +
+ ALIGN_SIZE((sizeof(PAGECACHE_HASH_LINK*) *
+ pagecache->hash_entries)));
+ bzero((uchar*) pagecache->block_root,
+ pagecache->disk_blocks * sizeof(PAGECACHE_BLOCK_LINK));
+ bzero((uchar*) pagecache->hash_root,
+ pagecache->hash_entries * sizeof(PAGECACHE_HASH_LINK*));
+ bzero((uchar*) pagecache->hash_link_root,
+ pagecache->hash_links * sizeof(PAGECACHE_HASH_LINK));
+ pagecache->hash_links_used= 0;
+ pagecache->free_hash_list= NULL;
+ pagecache->blocks_used= pagecache->blocks_changed= 0;
+
+ pagecache->global_blocks_changed= 0;
+ pagecache->blocks_available=0; /* For debugging */
+
+ /* The LRU chain is empty after initialization */
+ pagecache->used_last= NULL;
+ pagecache->used_ins= NULL;
+ pagecache->free_block_list= NULL;
+ pagecache->time= 0;
+ pagecache->warm_blocks= 0;
+ pagecache->min_warm_blocks= (division_limit ?
+ blocks * division_limit / 100 + 1 :
+ blocks);
+ pagecache->age_threshold= (age_threshold ?
+ blocks * age_threshold / 100 :
+ blocks);
+
+ pagecache->cnt_for_resize_op= 0;
+ pagecache->resize_in_flush= 0;
+ pagecache->can_be_used= 1;
+
+ pagecache->waiting_for_hash_link.last_thread= NULL;
+ pagecache->waiting_for_block.last_thread= NULL;
+ DBUG_PRINT("exit",
+ ("disk_blocks: %d block_root: 0x%lx hash_entries: %d\
+ hash_root: 0x%lx hash_links: %d hash_link_root: 0x%lx",
+ pagecache->disk_blocks, (long) pagecache->block_root,
+ pagecache->hash_entries, (long) pagecache->hash_root,
+ pagecache->hash_links, (long) pagecache->hash_link_root));
+ bzero((uchar*) pagecache->changed_blocks,
+ sizeof(pagecache->changed_blocks[0]) *
+ PAGECACHE_CHANGED_BLOCKS_HASH);
+ bzero((uchar*) pagecache->file_blocks,
+ sizeof(pagecache->file_blocks[0]) *
+ PAGECACHE_CHANGED_BLOCKS_HASH);
+ }
+
+ pagecache->blocks= pagecache->disk_blocks > 0 ? pagecache->disk_blocks : 0;
+ DBUG_RETURN((int) pagecache->disk_blocks);
+
+err:
+ error= my_errno;
+ pagecache->disk_blocks= 0;
+ pagecache->blocks= 0;
+ if (pagecache->block_mem)
+ {
+ my_large_free((uchar*) pagecache->block_mem, MYF(0));
+ pagecache->block_mem= NULL;
+ }
+ if (pagecache->block_root)
+ {
+ my_free((uchar*) pagecache->block_root, MYF(0));
+ pagecache->block_root= NULL;
+ }
+ my_errno= error;
+ pagecache->can_be_used= 0;
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Flush all blocks in the key cache to disk
+*/
+
+#ifdef NOT_USED
+static int flush_all_key_blocks(PAGECACHE *pagecache)
+{
+#if defined(PAGECACHE_DEBUG)
+ uint cnt=0;
+#endif
+ while (pagecache->blocks_changed > 0)
+ {
+ PAGECACHE_BLOCK_LINK *block;
+ for (block= pagecache->used_last->next_used ; ; block=block->next_used)
+ {
+ if (block->hash_link)
+ {
+#if defined(PAGECACHE_DEBUG)
+ cnt++;
+ KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used);
+#endif
+ if (flush_pagecache_blocks_int(pagecache, &block->hash_link->file,
+ FLUSH_RELEASE))
+ return 1;
+ break;
+ }
+ if (block == pagecache->used_last)
+ break;
+ }
+ }
+ return 0;
+}
+#endif /* NOT_USED */
+
+/*
+ Resize a key cache
+
+ SYNOPSIS
+ resize_pagecache()
+ pagecache pointer to a page cache data structure
+ use_mem total memory to use for the new key cache
+ division_limit new division limit (if not zero)
+ age_threshold new age threshold (if not zero)
+
+ RETURN VALUE
+ number of blocks in the key cache, if successful,
+ 0 - otherwise.
+
+ NOTES.
+ The function first compares the memory size parameter
+ with the key cache value.
+
+ If they differ the function free the the memory allocated for the
+ old key cache blocks by calling the end_pagecache function and
+ then rebuilds the key cache with new blocks by calling
+ init_key_cache.
+
+ The function starts the operation only when all other threads
+ performing operations with the key cache let her to proceed
+ (when cnt_for_resize=0).
+
+ Before being usable, this function needs:
+ - to receive fixes for BUG#17332 "changing key_buffer_size on a running
+ server can crash under load" similar to those done to the key cache
+ - to have us (Sanja) look at the additional constraints placed on
+ resizing, due to the page locking specific to this page cache.
+ So we disable it for now.
+*/
+#if NOT_USED /* keep disabled until code is fixed see above !! */
+int resize_pagecache(PAGECACHE *pagecache,
+ size_t use_mem, uint division_limit,
+ uint age_threshold)
+{
+ int blocks;
+#ifdef THREAD
+ struct st_my_thread_var *thread;
+ WQUEUE *wqueue;
+
+#endif
+ DBUG_ENTER("resize_pagecache");
+
+ if (!pagecache->inited)
+ DBUG_RETURN(pagecache->disk_blocks);
+
+ if(use_mem == pagecache->mem_size)
+ {
+ change_pagecache_param(pagecache, division_limit, age_threshold);
+ DBUG_RETURN(pagecache->disk_blocks);
+ }
+
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+
+#ifdef THREAD
+ wqueue= &pagecache->resize_queue;
+ thread= my_thread_var;
+ wqueue_link_into_queue(wqueue, thread);
+
+ while (wqueue->last_thread->next != thread)
+ {
+ pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock);
+ }
+#endif
+
+ pagecache->resize_in_flush= 1;
+ if (flush_all_key_blocks(pagecache))
+ {
+ /* TODO: if this happens, we should write a warning in the log file ! */
+ pagecache->resize_in_flush= 0;
+ blocks= 0;
+ pagecache->can_be_used= 0;
+ goto finish;
+ }
+ pagecache->resize_in_flush= 0;
+ pagecache->can_be_used= 0;
+#ifdef THREAD
+ while (pagecache->cnt_for_resize_op)
+ {
+ KEYCACHE_DBUG_PRINT("resize_pagecache: wait",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock);
+ }
+#else
+ KEYCACHE_DBUG_ASSERT(pagecache->cnt_for_resize_op == 0);
+#endif
+
+ end_pagecache(pagecache, 0); /* Don't free mutex */
+ /* The following will work even if use_mem is 0 */
+ blocks= init_pagecache(pagecache, pagecache->block_size, use_mem,
+ division_limit, age_threshold);
+
+finish:
+#ifdef THREAD
+ wqueue_unlink_from_queue(wqueue, thread);
+ /* Signal for the next resize request to proceeed if any */
+ if (wqueue->last_thread)
+ {
+ KEYCACHE_DBUG_PRINT("resize_pagecache: signal",
+ ("thread %ld", wqueue->last_thread->next->id));
+ pagecache_pthread_cond_signal(&wqueue->last_thread->next->suspend);
+ }
+#endif
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ DBUG_RETURN(blocks);
+}
+#endif /* 0 */
+
+
+/*
+ Increment counter blocking resize key cache operation
+*/
+static inline void inc_counter_for_resize_op(PAGECACHE *pagecache)
+{
+ pagecache->cnt_for_resize_op++;
+}
+
+
+/*
+ Decrement counter blocking resize key cache operation;
+ Signal the operation to proceed when counter becomes equal zero
+*/
+static inline void dec_counter_for_resize_op(PAGECACHE *pagecache)
+{
+#ifdef THREAD
+ struct st_my_thread_var *last_thread;
+ if (!--pagecache->cnt_for_resize_op &&
+ (last_thread= pagecache->resize_queue.last_thread))
+ {
+ KEYCACHE_DBUG_PRINT("dec_counter_for_resize_op: signal",
+ ("thread %ld", last_thread->next->id));
+ pagecache_pthread_cond_signal(&last_thread->next->suspend);
+ }
+#else
+ pagecache->cnt_for_resize_op--;
+#endif
+}
+
+/*
+ Change the page cache parameters
+
+ SYNOPSIS
+ change_pagecache_param()
+ pagecache pointer to a page cache data structure
+ division_limit new division limit (if not zero)
+ age_threshold new age threshold (if not zero)
+
+ RETURN VALUE
+ none
+
+ NOTES.
+ Presently the function resets the key cache parameters
+ concerning midpoint insertion strategy - division_limit and
+ age_threshold.
+*/
+
+void change_pagecache_param(PAGECACHE *pagecache, uint division_limit,
+ uint age_threshold)
+{
+ DBUG_ENTER("change_pagecache_param");
+
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ if (division_limit)
+ pagecache->min_warm_blocks= (pagecache->disk_blocks *
+ division_limit / 100 + 1);
+ if (age_threshold)
+ pagecache->age_threshold= (pagecache->disk_blocks *
+ age_threshold / 100);
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Removes page cache from memory. Does NOT flush pages to disk.
+
+ SYNOPSIS
+ end_pagecache()
+ pagecache page cache handle
+ cleanup Complete free (Free also mutex for key cache)
+
+ RETURN VALUE
+ none
+*/
+
+void end_pagecache(PAGECACHE *pagecache, my_bool cleanup)
+{
+ DBUG_ENTER("end_pagecache");
+ DBUG_PRINT("enter", ("key_cache: 0x%lx", (long) pagecache));
+
+ if (!pagecache->inited)
+ DBUG_VOID_RETURN;
+
+ if (pagecache->disk_blocks > 0)
+ {
+ if (pagecache->block_mem)
+ {
+ my_large_free((uchar*) pagecache->block_mem, MYF(0));
+ pagecache->block_mem= NULL;
+ my_free((uchar*) pagecache->block_root, MYF(0));
+ pagecache->block_root= NULL;
+ }
+ pagecache->disk_blocks= -1;
+ /* Reset blocks_changed to be safe if flush_all_key_blocks is called */
+ pagecache->blocks_changed= 0;
+ }
+
+ DBUG_PRINT("status", ("used: %lu changed: %lu w_requests: %lu "
+ "writes: %lu r_requests: %lu reads: %lu",
+ pagecache->blocks_used, pagecache->global_blocks_changed,
+ (ulong) pagecache->global_cache_w_requests,
+ (ulong) pagecache->global_cache_write,
+ (ulong) pagecache->global_cache_r_requests,
+ (ulong) pagecache->global_cache_read));
+
+ if (cleanup)
+ {
+ pthread_mutex_destroy(&pagecache->cache_lock);
+ pagecache->inited= pagecache->can_be_used= 0;
+ PAGECACHE_DEBUG_CLOSE;
+ }
+ DBUG_VOID_RETURN;
+} /* end_pagecache */
+
+
+/*
+ Unlink a block from the chain of dirty/clean blocks
+*/
+
+static inline void unlink_changed(PAGECACHE_BLOCK_LINK *block)
+{
+ if (block->next_changed)
+ block->next_changed->prev_changed= block->prev_changed;
+ *block->prev_changed= block->next_changed;
+}
+
+
+/*
+ Link a block into the chain of dirty/clean blocks
+*/
+
+static inline void link_changed(PAGECACHE_BLOCK_LINK *block,
+ PAGECACHE_BLOCK_LINK **phead)
+{
+ block->prev_changed= phead;
+ if ((block->next_changed= *phead))
+ (*phead)->prev_changed= &block->next_changed;
+ *phead= block;
+}
+
+
+/*
+ Unlink a block from the chain of dirty/clean blocks, if it's asked for,
+ and link it to the chain of clean blocks for the specified file
+*/
+
+static void link_to_file_list(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block,
+ PAGECACHE_FILE *file, my_bool unlink)
+{
+ if (unlink)
+ unlink_changed(block);
+ link_changed(block, &pagecache->file_blocks[FILE_HASH(*file)]);
+ if (block->status & PCBLOCK_CHANGED)
+ {
+ block->status&= ~PCBLOCK_CHANGED;
+ block->rec_lsn= LSN_MAX;
+ pagecache->blocks_changed--;
+ pagecache->global_blocks_changed--;
+ }
+}
+
+
+/*
+ Unlink a block from the chain of clean blocks for the specified
+ file and link it to the chain of dirty blocks for this file
+*/
+
+static inline void link_to_changed_list(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block)
+{
+ unlink_changed(block);
+ link_changed(block,
+ &pagecache->changed_blocks[FILE_HASH(block->hash_link->file)]);
+ block->status|=PCBLOCK_CHANGED;
+ pagecache->blocks_changed++;
+ pagecache->global_blocks_changed++;
+}
+
+
+/*
+ Link a block to the LRU chain at the beginning or at the end of
+ one of two parts.
+
+ SYNOPSIS
+ link_block()
+ pagecache pointer to a page cache data structure
+ block pointer to the block to link to the LRU chain
+ hot <-> to link the block into the hot subchain
+ at_end <-> to link the block at the end of the subchain
+
+ RETURN VALUE
+ none
+
+ NOTES.
+ The LRU chain is represented by a curcular list of block structures.
+ The list is double-linked of the type (**prev,*next) type.
+ The LRU chain is divided into two parts - hot and warm.
+ There are two pointers to access the last blocks of these two
+ parts. The beginning of the warm part follows right after the
+ end of the hot part.
+ Only blocks of the warm part can be used for replacement.
+ The first block from the beginning of this subchain is always
+ taken for eviction (pagecache->last_used->next)
+
+ LRU chain: +------+ H O T +------+
+ +----| end |----...<----| beg |----+
+ | +------+last +------+ |
+ v<-link in latest hot (new end) |
+ | link in latest warm (new end)->^
+ | +------+ W A R M +------+ |
+ +----| beg |---->...----| end |----+
+ +------+ +------+ins
+ first for eviction
+*/
+
+static void link_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block,
+ my_bool hot, my_bool at_end)
+{
+ PAGECACHE_BLOCK_LINK *ins;
+ PAGECACHE_BLOCK_LINK **ptr_ins;
+
+ PCBLOCK_INFO(block);
+ KEYCACHE_DBUG_ASSERT(! (block->hash_link && block->hash_link->requests));
+#ifdef THREAD
+ if (!hot && pagecache->waiting_for_block.last_thread)
+ {
+ /* Signal that in the LRU warm sub-chain an available block has appeared */
+ struct st_my_thread_var *last_thread=
+ pagecache->waiting_for_block.last_thread;
+ struct st_my_thread_var *first_thread= last_thread->next;
+ struct st_my_thread_var *next_thread= first_thread;
+ PAGECACHE_HASH_LINK *hash_link=
+ (PAGECACHE_HASH_LINK *) first_thread->opt_info;
+ struct st_my_thread_var *thread;
+ do
+ {
+ thread= next_thread;
+ next_thread= thread->next;
+ /*
+ We notify about the event all threads that ask
+ for the same page as the first thread in the queue
+ */
+ if ((PAGECACHE_HASH_LINK *) thread->opt_info == hash_link)
+ {
+ KEYCACHE_DBUG_PRINT("link_block: signal", ("thread: %ld", thread->id));
+ pagecache_pthread_cond_signal(&thread->suspend);
+ wqueue_unlink_from_queue(&pagecache->waiting_for_block, thread);
+ block->requests++;
+ }
+ }
+ while (thread != last_thread);
+ hash_link->block= block;
+ KEYCACHE_THREAD_TRACE("link_block: after signaling");
+#if defined(PAGECACHE_DEBUG)
+ KEYCACHE_DBUG_PRINT("link_block",
+ ("linked,unlinked block: %u status: %x #requests: %u #available: %u",
+ PCBLOCK_NUMBER(pagecache, block), block->status,
+ block->requests, pagecache->blocks_available));
+#endif
+ return;
+ }
+#else /* THREAD */
+ KEYCACHE_DBUG_ASSERT(! (!hot && pagecache->waiting_for_block.last_thread));
+ /* Condition not transformed using DeMorgan, to keep the text identical */
+#endif /* THREAD */
+ ptr_ins= hot ? &pagecache->used_ins : &pagecache->used_last;
+ ins= *ptr_ins;
+ if (ins)
+ {
+ ins->next_used->prev_used= &block->next_used;
+ block->next_used= ins->next_used;
+ block->prev_used= &ins->next_used;
+ ins->next_used= block;
+ if (at_end)
+ *ptr_ins= block;
+ }
+ else
+ {
+ /* The LRU chain is empty */
+ pagecache->used_last= pagecache->used_ins= block->next_used= block;
+ block->prev_used= &block->next_used;
+ }
+ KEYCACHE_THREAD_TRACE("link_block");
+#if defined(PAGECACHE_DEBUG)
+ pagecache->blocks_available++;
+ KEYCACHE_DBUG_PRINT("link_block",
+ ("linked block: %u:%1u status: %x #requests: %u #available: %u",
+ PCBLOCK_NUMBER(pagecache, block), at_end, block->status,
+ block->requests, pagecache->blocks_available));
+ KEYCACHE_DBUG_ASSERT((ulong) pagecache->blocks_available <=
+ pagecache->blocks_used);
+#endif
+}
+
+
+/*
+ Unlink a block from the LRU chain
+
+ SYNOPSIS
+ unlink_block()
+ pagecache pointer to a page cache data structure
+ block pointer to the block to unlink from the LRU chain
+
+ RETURN VALUE
+ none
+
+ NOTES.
+ See NOTES for link_block
+*/
+
+static void unlink_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block)
+{
+ DBUG_ENTER("unlink_block");
+ DBUG_PRINT("unlink_block", ("unlink 0x%lx", (ulong)block));
+ if (block->next_used == block)
+ /* The list contains only one member */
+ pagecache->used_last= pagecache->used_ins= NULL;
+ else
+ {
+ block->next_used->prev_used= block->prev_used;
+ *block->prev_used= block->next_used;
+ if (pagecache->used_last == block)
+ pagecache->used_last= STRUCT_PTR(PAGECACHE_BLOCK_LINK,
+ next_used, block->prev_used);
+ if (pagecache->used_ins == block)
+ pagecache->used_ins= STRUCT_PTR(PAGECACHE_BLOCK_LINK,
+ next_used, block->prev_used);
+ }
+ block->next_used= NULL;
+
+ KEYCACHE_THREAD_TRACE("unlink_block");
+#if defined(PAGECACHE_DEBUG)
+ KEYCACHE_DBUG_ASSERT(pagecache->blocks_available != 0);
+ pagecache->blocks_available--;
+ KEYCACHE_DBUG_PRINT("unlink_block",
+ ("unlinked block: 0x%lx (%u) status: %x #requests: %u #available: %u",
+ (ulong)block, PCBLOCK_NUMBER(pagecache, block),
+ block->status,
+ block->requests, pagecache->blocks_available));
+ PCBLOCK_INFO(block);
+#endif
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Register requests for a block
+
+ SYNOPSIS
+ reg_requests()
+ pagecache this page cache reference
+ block the block we request reference
+ count how many requests we register (it is 1 everywhere)
+
+ NOTE
+ Registration of request means we are going to use this block so we exclude
+ it from the LRU if it is first request
+*/
+static void reg_requests(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block,
+ int count)
+{
+ DBUG_ENTER("reg_requests");
+ DBUG_PRINT("enter", ("block: 0x%lx (%u) status: %x reqs: %u",
+ (ulong)block, PCBLOCK_NUMBER(pagecache, block),
+ block->status, block->requests));
+ PCBLOCK_INFO(block);
+ if (! block->requests)
+ /* First request for the block unlinks it */
+ unlink_block(pagecache, block);
+ block->requests+= count;
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Unregister request for a block
+ linking it to the LRU chain if it's the last request
+
+ SYNOPSIS
+ unreg_request()
+ pagecache pointer to a page cache data structure
+ block pointer to the block to link to the LRU chain
+ at_end <-> to link the block at the end of the LRU chain
+
+ RETURN VALUE
+ none
+
+ NOTES.
+ Every linking to the LRU chain decrements by one a special block
+ counter (if it's positive). If the at_end parameter is TRUE the block is
+ added either at the end of warm sub-chain or at the end of hot sub-chain.
+ It is added to the hot subchain if its counter is zero and number of
+ blocks in warm sub-chain is not less than some low limit (determined by
+ the division_limit parameter). Otherwise the block is added to the warm
+ sub-chain. If the at_end parameter is FALSE the block is always added
+ at beginning of the warm sub-chain.
+ Thus a warm block can be promoted to the hot sub-chain when its counter
+ becomes zero for the first time.
+ At the same time the block at the very beginning of the hot subchain
+ might be moved to the beginning of the warm subchain if it stays untouched
+ for a too long time (this time is determined by parameter age_threshold).
+*/
+
+static void unreg_request(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block, int at_end)
+{
+ DBUG_ENTER("unreg_request");
+ DBUG_PRINT("enter", ("block 0x%lx (%u) status: %x reqs: %u",
+ (ulong)block, PCBLOCK_NUMBER(pagecache, block),
+ block->status, block->requests));
+ PCBLOCK_INFO(block);
+ DBUG_ASSERT(block->requests > 0);
+ if (! --block->requests)
+ {
+ my_bool hot;
+ if (block->hits_left)
+ block->hits_left--;
+ hot= !block->hits_left && at_end &&
+ pagecache->warm_blocks > pagecache->min_warm_blocks;
+ if (hot)
+ {
+ if (block->temperature == PCBLOCK_WARM)
+ pagecache->warm_blocks--;
+ block->temperature= PCBLOCK_HOT;
+ KEYCACHE_DBUG_PRINT("unreg_request", ("#warm_blocks: %lu",
+ pagecache->warm_blocks));
+ }
+ link_block(pagecache, block, hot, (my_bool)at_end);
+ block->last_hit_time= pagecache->time;
+ pagecache->time++;
+
+ block= pagecache->used_ins;
+ /* Check if we should link a hot block to the warm block */
+ if (block && pagecache->time - block->last_hit_time >
+ pagecache->age_threshold)
+ {
+ unlink_block(pagecache, block);
+ link_block(pagecache, block, 0, 0);
+ if (block->temperature != PCBLOCK_WARM)
+ {
+ pagecache->warm_blocks++;
+ block->temperature= PCBLOCK_WARM;
+ }
+ KEYCACHE_DBUG_PRINT("unreg_request", ("#warm_blocks: %lu",
+ pagecache->warm_blocks));
+ }
+ }
+ DBUG_VOID_RETURN;
+}
+
+/*
+ Remove a reader of the page in block
+*/
+
+static inline void remove_reader(PAGECACHE_BLOCK_LINK *block)
+{
+ DBUG_ENTER("remove_reader");
+ PCBLOCK_INFO(block);
+ DBUG_ASSERT(block->hash_link->requests > 0);
+#ifdef THREAD
+ if (! --block->hash_link->requests && block->condvar)
+ pagecache_pthread_cond_signal(block->condvar);
+#else
+ --block->hash_link->requests;
+#endif
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Wait until the last reader of the page in block
+ signals on its termination
+*/
+
+static inline void wait_for_readers(PAGECACHE *pagecache
+ __attribute__((unused)),
+ PAGECACHE_BLOCK_LINK *block)
+{
+#ifdef THREAD
+ struct st_my_thread_var *thread= my_thread_var;
+ while (block->hash_link->requests)
+ {
+ KEYCACHE_DBUG_PRINT("wait_for_readers: wait",
+ ("suspend thread: %ld block: %u",
+ thread->id, PCBLOCK_NUMBER(pagecache, block)));
+ block->condvar= &thread->suspend;
+ pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock);
+ block->condvar= NULL;
+ }
+#else
+ KEYCACHE_DBUG_ASSERT(block->hash_link->requests == 0);
+#endif
+}
+
+
+/*
+ Add a hash link to a bucket in the hash_table
+*/
+
+static inline void link_hash(PAGECACHE_HASH_LINK **start,
+ PAGECACHE_HASH_LINK *hash_link)
+{
+ if (*start)
+ (*start)->prev= &hash_link->next;
+ hash_link->next= *start;
+ hash_link->prev= start;
+ *start= hash_link;
+}
+
+
+/*
+ Remove a hash link from the hash table
+*/
+
+static void unlink_hash(PAGECACHE *pagecache, PAGECACHE_HASH_LINK *hash_link)
+{
+ KEYCACHE_DBUG_PRINT("unlink_hash", ("fd: %u pos_ %lu #requests=%u",
+ (uint) hash_link->file.file, (ulong) hash_link->pageno,
+ hash_link->requests));
+ KEYCACHE_DBUG_ASSERT(hash_link->requests == 0);
+ if ((*hash_link->prev= hash_link->next))
+ hash_link->next->prev= hash_link->prev;
+ hash_link->block= NULL;
+#ifdef THREAD
+ if (pagecache->waiting_for_hash_link.last_thread)
+ {
+ /* Signal that a free hash link has appeared */
+ struct st_my_thread_var *last_thread=
+ pagecache->waiting_for_hash_link.last_thread;
+ struct st_my_thread_var *first_thread= last_thread->next;
+ struct st_my_thread_var *next_thread= first_thread;
+ PAGECACHE_PAGE *first_page= (PAGECACHE_PAGE *) (first_thread->opt_info);
+ struct st_my_thread_var *thread;
+
+ hash_link->file= first_page->file;
+ hash_link->pageno= first_page->pageno;
+ do
+ {
+ PAGECACHE_PAGE *page;
+ thread= next_thread;
+ page= (PAGECACHE_PAGE *) thread->opt_info;
+ next_thread= thread->next;
+ /*
+ We notify about the event all threads that ask
+ for the same page as the first thread in the queue
+ */
+ if (page->file.file == hash_link->file.file &&
+ page->pageno == hash_link->pageno)
+ {
+ KEYCACHE_DBUG_PRINT("unlink_hash: signal", ("thread %ld", thread->id));
+ pagecache_pthread_cond_signal(&thread->suspend);
+ wqueue_unlink_from_queue(&pagecache->waiting_for_hash_link, thread);
+ }
+ }
+ while (thread != last_thread);
+ link_hash(&pagecache->hash_root[PAGECACHE_HASH(pagecache,
+ hash_link->file,
+ hash_link->pageno)],
+ hash_link);
+ return;
+ }
+#else /* THREAD */
+ KEYCACHE_DBUG_ASSERT(! (pagecache->waiting_for_hash_link.last_thread));
+#endif /* THREAD */
+ hash_link->next= pagecache->free_hash_list;
+ pagecache->free_hash_list= hash_link;
+}
+
+
+/*
+ Get the hash link for the page if it is in the cache (do not put the
+ page in the cache if it is absent there)
+
+ SYNOPSIS
+ get_present_hash_link()
+ pagecache Pagecache reference
+ file file ID
+ pageno page number in the file
+ start where to put pointer to found hash bucket (for
+ direct referring it)
+
+ RETURN
+ found hashlink pointer
+*/
+
+static PAGECACHE_HASH_LINK *get_present_hash_link(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ PAGECACHE_HASH_LINK ***start)
+{
+ reg1 PAGECACHE_HASH_LINK *hash_link;
+#if defined(PAGECACHE_DEBUG)
+ int cnt;
+#endif
+ DBUG_ENTER("get_present_hash_link");
+
+ KEYCACHE_DBUG_PRINT("get_present_hash_link", ("fd: %u pos: %lu",
+ (uint) file->file, (ulong) pageno));
+
+ /*
+ Find the bucket in the hash table for the pair (file, pageno);
+ start contains the head of the bucket list,
+ hash_link points to the first member of the list
+ */
+ hash_link= *(*start= &pagecache->hash_root[PAGECACHE_HASH(pagecache,
+ *file, pageno)]);
+#if defined(PAGECACHE_DEBUG)
+ cnt= 0;
+#endif
+ /* Look for an element for the pair (file, pageno) in the bucket chain */
+ while (hash_link &&
+ (hash_link->pageno != pageno ||
+ hash_link->file.file != file->file))
+ {
+ hash_link= hash_link->next;
+#if defined(PAGECACHE_DEBUG)
+ cnt++;
+ if (! (cnt <= pagecache->hash_links_used))
+ {
+ int i;
+ for (i=0, hash_link= **start ;
+ i < cnt ; i++, hash_link= hash_link->next)
+ {
+ KEYCACHE_DBUG_PRINT("get_present_hash_link", ("fd: %u pos: %lu",
+ (uint) hash_link->file.file, (ulong) hash_link->pageno));
+ }
+ }
+ KEYCACHE_DBUG_ASSERT(cnt <= pagecache->hash_links_used);
+#endif
+ }
+ if (hash_link)
+ {
+ /* Register the request for the page */
+ hash_link->requests++;
+ }
+ /*
+ As soon as the caller will release the page cache's lock, "hash_link"
+ will be potentially obsolete (unusable) information.
+ */
+ DBUG_RETURN(hash_link);
+}
+
+
+/*
+ Get the hash link for a page
+*/
+
+static PAGECACHE_HASH_LINK *get_hash_link(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno)
+{
+ reg1 PAGECACHE_HASH_LINK *hash_link;
+ PAGECACHE_HASH_LINK **start;
+
+ KEYCACHE_DBUG_PRINT("get_hash_link", ("fd: %u pos: %lu",
+ (uint) file->file, (ulong) pageno));
+
+restart:
+ /* try to find the page in the cache */
+ hash_link= get_present_hash_link(pagecache, file, pageno,
+ &start);
+ if (!hash_link)
+ {
+ /* There is no hash link in the hash table for the pair (file, pageno) */
+ if (pagecache->free_hash_list)
+ {
+ hash_link= pagecache->free_hash_list;
+ pagecache->free_hash_list= hash_link->next;
+ }
+ else if (pagecache->hash_links_used < pagecache->hash_links)
+ {
+ hash_link= &pagecache->hash_link_root[pagecache->hash_links_used++];
+ }
+ else
+ {
+#ifdef THREAD
+ /* Wait for a free hash link */
+ struct st_my_thread_var *thread= my_thread_var;
+ PAGECACHE_PAGE page;
+ KEYCACHE_DBUG_PRINT("get_hash_link", ("waiting"));
+ page.file= *file;
+ page.pageno= pageno;
+ thread->opt_info= (void *) &page;
+ wqueue_link_into_queue(&pagecache->waiting_for_hash_link, thread);
+ KEYCACHE_DBUG_PRINT("get_hash_link: wait",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend,
+ &pagecache->cache_lock);
+ thread->opt_info= NULL;
+#else
+ KEYCACHE_DBUG_ASSERT(0);
+#endif
+ DBUG_PRINT("info", ("restarting..."));
+ goto restart;
+ }
+ hash_link->file= *file;
+ hash_link->pageno= pageno;
+ link_hash(start, hash_link);
+ /* Register the request for the page */
+ hash_link->requests++;
+ }
+
+ return hash_link;
+}
+
+
+/*
+ Get a block for the file page requested by a pagecache read/write operation;
+ If the page is not in the cache return a free block, if there is none
+ return the lru block after saving its buffer if the page is dirty.
+
+ SYNOPSIS
+
+ find_block()
+ pagecache pointer to a page cache data structure
+ file handler for the file to read page from
+ pageno number of the page in the file
+ init_hits_left how initialize the block counter for the page
+ wrmode <-> get for writing
+ reg_req Register request to thye page
+ page_st out {PAGE_READ,PAGE_TO_BE_READ,PAGE_WAIT_TO_BE_READ}
+
+ RETURN VALUE
+ Pointer to the found block if successful, 0 - otherwise
+
+ NOTES.
+ For the page from file positioned at pageno the function checks whether
+ the page is in the key cache specified by the first parameter.
+ If this is the case it immediately returns the block.
+ If not, the function first chooses a block for this page. If there is
+ no not used blocks in the key cache yet, the function takes the block
+ at the very beginning of the warm sub-chain. It saves the page in that
+ block if it's dirty before returning the pointer to it.
+ The function returns in the page_st parameter the following values:
+ PAGE_READ - if page already in the block,
+ PAGE_TO_BE_READ - if it is to be read yet by the current thread
+ WAIT_TO_BE_READ - if it is to be read by another thread
+ If an error occurs THE PCBLOCK_ERROR bit is set in the block status.
+ It might happen that there are no blocks in LRU chain (in warm part) -
+ all blocks are unlinked for some read/write operations. Then the function
+ waits until first of this operations links any block back.
+*/
+
+static PAGECACHE_BLOCK_LINK *find_block(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ int init_hits_left,
+ my_bool wrmode,
+ my_bool reg_req,
+ int *page_st)
+{
+ PAGECACHE_HASH_LINK *hash_link;
+ PAGECACHE_BLOCK_LINK *block;
+ int error= 0;
+ int page_status;
+
+ DBUG_ENTER("find_block");
+ KEYCACHE_THREAD_TRACE("find_block:begin");
+ DBUG_PRINT("enter", ("fd: %d pos: %lu wrmode: %d",
+ file->file, (ulong) pageno, wrmode));
+ KEYCACHE_DBUG_PRINT("find_block", ("fd: %d pos: %lu wrmode: %d",
+ file->file, (ulong) pageno,
+ wrmode));
+#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
+ DBUG_EXECUTE("check_pagecache",
+ test_key_cache(pagecache, "start of find_block", 0););
+#endif
+
+restart:
+ /* Find the hash link for the requested page (file, pageno) */
+ hash_link= get_hash_link(pagecache, file, pageno);
+
+ page_status= -1;
+ if ((block= hash_link->block) &&
+ block->hash_link == hash_link && (block->status & PCBLOCK_READ))
+ page_status= PAGE_READ;
+
+ if (wrmode && pagecache->resize_in_flush)
+ {
+ /* This is a write request during the flush phase of a resize operation */
+
+ if (page_status != PAGE_READ)
+ {
+ /* We don't need the page in the cache: we are going to write on disk */
+ DBUG_ASSERT(hash_link->requests > 0);
+ hash_link->requests--;
+ unlink_hash(pagecache, hash_link);
+ return 0;
+ }
+ if (!(block->status & PCBLOCK_IN_FLUSH))
+ {
+ DBUG_ASSERT(hash_link->requests > 0);
+ hash_link->requests--;
+ /*
+ Remove block to invalidate the page in the block buffer
+ as we are going to write directly on disk.
+ Although we have an exclusive lock for the updated key part
+ the control can be yielded by the current thread as we might
+ have unfinished readers of other key parts in the block
+ buffer. Still we are guaranteed not to have any readers
+ of the key part we are writing into until the block is
+ removed from the cache as we set the PCBLOCK_REASSIGNED
+ flag (see the code below that handles reading requests).
+ */
+ free_block(pagecache, block);
+ return 0;
+ }
+ /* Wait until the page is flushed on disk */
+ DBUG_ASSERT(hash_link->requests > 0);
+ hash_link->requests--;
+ {
+#ifdef THREAD
+ struct st_my_thread_var *thread= my_thread_var;
+ wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread);
+ do
+ {
+ KEYCACHE_DBUG_PRINT("find_block: wait",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend,
+ &pagecache->cache_lock);
+ }
+ while(thread->next);
+#else
+ KEYCACHE_DBUG_ASSERT(0);
+ /*
+ Given the use of "resize_in_flush", it seems impossible
+ that this whole branch is ever entered in single-threaded case
+ because "(wrmode && pagecache->resize_in_flush)" cannot be true.
+ TODO: Check this, and then put the whole branch into the
+ "#ifdef THREAD" guard.
+ */
+#endif
+ }
+ /* Invalidate page in the block if it has not been done yet */
+ if (block->status)
+ free_block(pagecache, block);
+ return 0;
+ }
+
+ if (page_status == PAGE_READ &&
+ (block->status & (PCBLOCK_IN_SWITCH | PCBLOCK_REASSIGNED)))
+ {
+ /* This is a request for a page to be removed from cache */
+
+ KEYCACHE_DBUG_PRINT("find_block",
+ ("request for old page in block: %u "
+ "wrmode: %d block->status: %d",
+ PCBLOCK_NUMBER(pagecache, block), wrmode,
+ block->status));
+ /*
+ Only reading requests can proceed until the old dirty page is flushed,
+ all others are to be suspended, then resubmitted
+ */
+ if (!wrmode && !(block->status & PCBLOCK_REASSIGNED))
+ {
+ if (reg_req)
+ reg_requests(pagecache, block, 1);
+ }
+ else
+ {
+ DBUG_ASSERT(hash_link->requests > 0);
+ hash_link->requests--;
+ KEYCACHE_DBUG_PRINT("find_block",
+ ("request waiting for old page to be saved"));
+ {
+#ifdef THREAD
+ struct st_my_thread_var *thread= my_thread_var;
+ /* Put the request into the queue of those waiting for the old page */
+ wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread);
+ /* Wait until the request can be resubmitted */
+ do
+ {
+ KEYCACHE_DBUG_PRINT("find_block: wait",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend,
+ &pagecache->cache_lock);
+ }
+ while(thread->next);
+#else
+ KEYCACHE_DBUG_ASSERT(0);
+ /* No parallel requests in single-threaded case */
+#endif
+ }
+ KEYCACHE_DBUG_PRINT("find_block",
+ ("request for old page resubmitted"));
+ DBUG_PRINT("info", ("restarting..."));
+ /* Resubmit the request */
+ goto restart;
+ }
+ block->status&= ~PCBLOCK_IN_SWITCH;
+ }
+ else
+ {
+ /* This is a request for a new page or for a page not to be removed */
+ if (! block)
+ {
+ /* No block is assigned for the page yet */
+ if (pagecache->blocks_unused)
+ {
+ if (pagecache->free_block_list)
+ {
+ /* There is a block in the free list. */
+ block= pagecache->free_block_list;
+ pagecache->free_block_list= block->next_used;
+ block->next_used= NULL;
+ }
+ else
+ {
+ /* There are some never used blocks, take first of them */
+ block= &pagecache->block_root[pagecache->blocks_used];
+ block->buffer= ADD_TO_PTR(pagecache->block_mem,
+ ((ulong) pagecache->blocks_used*
+ pagecache->block_size),
+ uchar*);
+ pagecache->blocks_used++;
+ }
+ pagecache->blocks_unused--;
+ DBUG_ASSERT(block->wlocks == 0);
+ DBUG_ASSERT(block->pins == 0);
+ block->status= 0;
+#ifndef DBUG_OFF
+ block->type= PAGECACHE_EMPTY_PAGE;
+#endif
+ block->requests= 1;
+ block->temperature= PCBLOCK_COLD;
+ block->hits_left= init_hits_left;
+ block->last_hit_time= 0;
+ block->rec_lsn= LSN_MAX;
+ link_to_file_list(pagecache, block, file, 0);
+ block->hash_link= hash_link;
+ hash_link->block= block;
+ page_status= PAGE_TO_BE_READ;
+ DBUG_PRINT("info", ("page to be read set for page 0x%lx",
+ (ulong)block));
+ KEYCACHE_DBUG_PRINT("find_block",
+ ("got free or never used block %u",
+ PCBLOCK_NUMBER(pagecache, block)));
+ }
+ else
+ {
+ /* There are no never used blocks, use a block from the LRU chain */
+
+ /*
+ Wait until a new block is added to the LRU chain;
+ several threads might wait here for the same page,
+ all of them must get the same block
+ */
+
+#ifdef THREAD
+ if (! pagecache->used_last)
+ {
+ struct st_my_thread_var *thread= my_thread_var;
+ thread->opt_info= (void *) hash_link;
+ wqueue_link_into_queue(&pagecache->waiting_for_block, thread);
+ do
+ {
+ KEYCACHE_DBUG_PRINT("find_block: wait",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend,
+ &pagecache->cache_lock);
+ }
+ while (thread->next);
+ thread->opt_info= NULL;
+ }
+#else
+ KEYCACHE_DBUG_ASSERT(pagecache->used_last);
+#endif
+ block= hash_link->block;
+ if (! block)
+ {
+ /*
+ Take the first block from the LRU chain
+ unlinking it from the chain
+ */
+ block= pagecache->used_last->next_used;
+ block->hits_left= init_hits_left;
+ block->last_hit_time= 0;
+ if (reg_req)
+ reg_requests(pagecache, block, 1);
+ hash_link->block= block;
+ }
+ PCBLOCK_INFO(block);
+ DBUG_ASSERT(block->wlocks == 0);
+ DBUG_ASSERT(block->pins == 0);
+
+ if (block->hash_link != hash_link &&
+ ! (block->status & PCBLOCK_IN_SWITCH) )
+ {
+ /* this is a primary request for a new page */
+ DBUG_ASSERT(block->wlocks == 0);
+ DBUG_ASSERT(block->pins == 0);
+ block->status|= PCBLOCK_IN_SWITCH;
+
+ KEYCACHE_DBUG_PRINT("find_block",
+ ("got block %u for new page",
+ PCBLOCK_NUMBER(pagecache, block)));
+
+ if (block->status & PCBLOCK_CHANGED)
+ {
+ /* The block contains a dirty page - push it out of the cache */
+
+ KEYCACHE_DBUG_PRINT("find_block", ("block is dirty"));
+
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ /*
+ The call is thread safe because only the current
+ thread might change the block->hash_link value
+ */
+ DBUG_ASSERT(block->pins == 0);
+ error= pagecache_fwrite(pagecache,
+ &block->hash_link->file,
+ block->buffer,
+ block->hash_link->pageno,
+ block->type,
+ MYF(MY_NABP | MY_WAIT_IF_FULL));
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ pagecache->global_cache_write++;
+ }
+
+ block->status|= PCBLOCK_REASSIGNED;
+ if (block->hash_link)
+ {
+ /*
+ Wait until all pending read requests
+ for this page are executed
+ (we could have avoided this waiting, if we had read
+ a page in the cache in a sweep, without yielding control)
+ */
+ wait_for_readers(pagecache, block);
+
+ /* Remove the hash link for this page from the hash table */
+ unlink_hash(pagecache, block->hash_link);
+ /* All pending requests for this page must be resubmitted */
+#ifdef THREAD
+ if (block->wqueue[COND_FOR_SAVED].last_thread)
+ wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]);
+#endif
+ }
+ link_to_file_list(pagecache, block, file,
+ (my_bool)(block->hash_link ? 1 : 0));
+ PCBLOCK_INFO(block);
+ block->status= error? PCBLOCK_ERROR : 0;
+#ifndef DBUG_OFF
+ block->type= PAGECACHE_EMPTY_PAGE;
+#endif
+ block->hash_link= hash_link;
+ page_status= PAGE_TO_BE_READ;
+ DBUG_PRINT("info", ("page to be read set for page 0x%lx",
+ (ulong)block));
+
+ KEYCACHE_DBUG_ASSERT(block->hash_link->block == block);
+ KEYCACHE_DBUG_ASSERT(hash_link->block->hash_link == hash_link);
+ }
+ else
+ {
+ /* This is for secondary requests for a new page only */
+ KEYCACHE_DBUG_PRINT("find_block",
+ ("block->hash_link: %p hash_link: %p "
+ "block->status: %u", block->hash_link,
+ hash_link, block->status ));
+ page_status= (((block->hash_link == hash_link) &&
+ (block->status & PCBLOCK_READ)) ?
+ PAGE_READ : PAGE_WAIT_TO_BE_READ);
+ }
+ }
+ pagecache->global_cache_read++;
+ }
+ else
+ {
+ if (reg_req)
+ reg_requests(pagecache, block, 1);
+ KEYCACHE_DBUG_PRINT("find_block",
+ ("block->hash_link: %p hash_link: %p "
+ "block->status: %u", block->hash_link,
+ hash_link, block->status ));
+ page_status= (((block->hash_link == hash_link) &&
+ (block->status & PCBLOCK_READ)) ?
+ PAGE_READ : PAGE_WAIT_TO_BE_READ);
+ }
+ }
+
+ KEYCACHE_DBUG_ASSERT(page_status != -1);
+ *page_st= page_status;
+ DBUG_PRINT("info",
+ ("block: 0x%lx fd: %u pos: %lu block->status: %u page_status: %u",
+ (ulong) block, (uint) file->file,
+ (ulong) pageno, block->status, (uint) page_status));
+ KEYCACHE_DBUG_PRINT("find_block",
+ ("block: 0x%lx fd: %d pos: %lu block->status: %u page_status: %d",
+ (ulong) block,
+ file->file, (ulong) pageno, block->status,
+ page_status));
+
+#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
+ DBUG_EXECUTE("check_pagecache",
+ test_key_cache(pagecache, "end of find_block",0););
+#endif
+ KEYCACHE_THREAD_TRACE("find_block:end");
+ DBUG_RETURN(block);
+}
+
+
+static void add_pin(PAGECACHE_BLOCK_LINK *block)
+{
+ DBUG_ENTER("add_pin");
+ DBUG_PRINT("enter", ("block: 0x%lx pins: %u",
+ (ulong) block,
+ block->pins));
+ PCBLOCK_INFO(block);
+ block->pins++;
+#ifndef DBUG_OFF
+ {
+ PAGECACHE_PIN_INFO *info=
+ (PAGECACHE_PIN_INFO *)my_malloc(sizeof(PAGECACHE_PIN_INFO), MYF(0));
+ info->thread= my_thread_var;
+ info_link(&block->pin_list, info);
+ }
+#endif
+ DBUG_VOID_RETURN;
+}
+
+static void remove_pin(PAGECACHE_BLOCK_LINK *block)
+{
+ DBUG_ENTER("remove_pin");
+ DBUG_PRINT("enter", ("block: 0x%lx pins: %u",
+ (ulong) block,
+ block->pins));
+ PCBLOCK_INFO(block);
+ DBUG_ASSERT(block->pins > 0);
+ block->pins--;
+#ifndef DBUG_OFF
+ {
+ PAGECACHE_PIN_INFO *info= info_find(block->pin_list, my_thread_var);
+ DBUG_ASSERT(info != 0);
+ info_unlink(info);
+ my_free((uchar*) info, MYF(0));
+ }
+#endif
+ DBUG_VOID_RETURN;
+}
+#ifndef DBUG_OFF
+static void info_add_lock(PAGECACHE_BLOCK_LINK *block, my_bool wl)
+{
+ PAGECACHE_LOCK_INFO *info=
+ (PAGECACHE_LOCK_INFO *)my_malloc(sizeof(PAGECACHE_LOCK_INFO), MYF(0));
+ info->thread= my_thread_var;
+ info->write_lock= wl;
+ info_link((PAGECACHE_PIN_INFO **)&block->lock_list,
+ (PAGECACHE_PIN_INFO *)info);
+}
+static void info_remove_lock(PAGECACHE_BLOCK_LINK *block)
+{
+ PAGECACHE_LOCK_INFO *info=
+ (PAGECACHE_LOCK_INFO *)info_find((PAGECACHE_PIN_INFO *)block->lock_list,
+ my_thread_var);
+ DBUG_ASSERT(info != 0);
+ info_unlink((PAGECACHE_PIN_INFO *)info);
+ my_free((uchar*)info, MYF(0));
+}
+static void info_change_lock(PAGECACHE_BLOCK_LINK *block, my_bool wl)
+{
+ PAGECACHE_LOCK_INFO *info=
+ (PAGECACHE_LOCK_INFO *)info_find((PAGECACHE_PIN_INFO *)block->lock_list,
+ my_thread_var);
+ DBUG_ASSERT(info != 0);
+ DBUG_ASSERT(info->write_lock != wl);
+ info->write_lock= wl;
+}
+#else
+#define info_add_lock(B,W)
+#define info_remove_lock(B)
+#define info_change_lock(B,W)
+#endif
+
+/*
+ Put on the block write lock
+
+ SYNOPSIS
+ get_wrlock()
+ pagecache pointer to a page cache data structure
+ block the block to work with
+ user_file Unique handler per handler file. Used to check if
+ we request many write locks withing the same
+ statement
+
+ RETURN
+ 0 - OK
+ 1 - Can't lock this block, need retry
+*/
+
+static my_bool get_wrlock(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block,
+ PAGECACHE_FILE *user_file)
+{
+ PAGECACHE_FILE file= block->hash_link->file;
+ pgcache_page_no_t pageno= block->hash_link->pageno;
+ DBUG_ENTER("get_wrlock");
+ DBUG_PRINT("info", ("the block 0x%lx "
+ "files %d(%d) pages %d(%d)",
+ (ulong)block,
+ file.file, block->hash_link->file.file,
+ pageno, block->hash_link->pageno));
+ PCBLOCK_INFO(block);
+ while (block->wlocks && block->write_locker != user_file)
+ {
+ /* Lock failed we will wait */
+#ifdef THREAD
+ struct st_my_thread_var *thread= my_thread_var;
+ DBUG_PRINT("info", ("fail to lock, waiting... 0x%lx", (ulong)block));
+ wqueue_add_to_queue(&block->wqueue[COND_FOR_WRLOCK], thread);
+ dec_counter_for_resize_op(pagecache);
+ do
+ {
+ KEYCACHE_DBUG_PRINT("get_wrlock: wait",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend,
+ &pagecache->cache_lock);
+ }
+ while(thread->next);
+#else
+ DBUG_ASSERT(0);
+#endif
+ PCBLOCK_INFO(block);
+ if ((block->status & (PCBLOCK_REASSIGNED | PCBLOCK_IN_SWITCH)) ||
+ file.file != block->hash_link->file.file ||
+ pageno != block->hash_link->pageno)
+ {
+ DBUG_PRINT("info", ("the block 0x%lx changed => need retry"
+ "status %x files %d != %d or pages %d !=%d",
+ (ulong)block, block->status,
+ file.file, block->hash_link->file.file,
+ pageno, block->hash_link->pageno));
+ DBUG_RETURN(1);
+ }
+ }
+ /* we are doing it by global cache mutex protection, so it is OK */
+ block->wlocks++;
+ block->write_locker= user_file;
+ DBUG_PRINT("info", ("WR lock set, block 0x%lx", (ulong)block));
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Remove write lock from the block
+
+ SYNOPSIS
+ release_wrlock()
+ pagecache pointer to a page cache data structure
+ block the block to work with
+
+ RETURN
+ 0 - OK
+*/
+
+static void release_wrlock(PAGECACHE_BLOCK_LINK *block)
+{
+ DBUG_ENTER("release_wrlock");
+ PCBLOCK_INFO(block);
+ DBUG_ASSERT(block->wlocks > 0);
+ DBUG_ASSERT(block->pins > 0);
+ block->wlocks--;
+ if (block->wlocks > 0)
+ DBUG_VOID_RETURN; /* Multiple write locked */
+ DBUG_PRINT("info", ("WR lock reset, block 0x%lx", (ulong)block));
+#ifdef THREAD
+ /* release all threads waiting for write lock */
+ if (block->wqueue[COND_FOR_WRLOCK].last_thread)
+ wqueue_release_queue(&block->wqueue[COND_FOR_WRLOCK]);
+#endif
+ PCBLOCK_INFO(block);
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Try to lock/unlock and pin/unpin the block
+
+ SYNOPSIS
+ make_lock_and_pin()
+ pagecache pointer to a page cache data structure
+ block the block to work with
+ lock lock change mode
+ pin pinchange mode
+ file File handler requesting pin
+
+ RETURN
+ 0 - OK
+ 1 - Try to lock the block failed
+*/
+
+static my_bool make_lock_and_pin(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block,
+ enum pagecache_page_lock lock,
+ enum pagecache_page_pin pin,
+ PAGECACHE_FILE *file)
+{
+ DBUG_ENTER("make_lock_and_pin");
+
+ DBUG_PRINT("enter", ("block: 0x%lx", (ulong)block));
+#ifndef DBUG_OFF
+ if (block)
+ {
+ DBUG_PRINT("enter", ("block: 0x%lx (%u) wrlocks: %u pins: %u lock: %s pin: %s",
+ (ulong)block, PCBLOCK_NUMBER(pagecache, block),
+ block->wlocks,
+ block->pins,
+ page_cache_page_lock_str[lock],
+ page_cache_page_pin_str[pin]));
+ PCBLOCK_INFO(block);
+ }
+#endif
+
+ switch (lock) {
+ case PAGECACHE_LOCK_WRITE: /* free -> write */
+ /* Writelock and pin the buffer */
+ if (get_wrlock(pagecache, block, file))
+ {
+ /* can't lock => need retry */
+ goto retry;
+ }
+
+ /* The cache is locked so nothing afraid of */
+ add_pin(block);
+ info_add_lock(block, 1);
+ break;
+ case PAGECACHE_LOCK_WRITE_TO_READ: /* write -> read */
+ case PAGECACHE_LOCK_WRITE_UNLOCK: /* write -> free */
+ /*
+ Removes write lock and puts read lock (which is nothing in our
+ implementation)
+ */
+ release_wrlock(block);
+ /* fall through */
+ case PAGECACHE_LOCK_READ_UNLOCK: /* read -> free */
+ case PAGECACHE_LOCK_LEFT_READLOCKED: /* read -> read */
+ if (pin == PAGECACHE_UNPIN)
+ {
+ remove_pin(block);
+ }
+ if (lock == PAGECACHE_LOCK_WRITE_TO_READ)
+ {
+ info_change_lock(block, 0);
+ }
+ else if (lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
+ lock == PAGECACHE_LOCK_READ_UNLOCK)
+ {
+ info_remove_lock(block);
+ }
+ break;
+ case PAGECACHE_LOCK_READ: /* free -> read */
+ if (pin == PAGECACHE_PIN)
+ {
+ /* The cache is locked so nothing afraid off */
+ add_pin(block);
+ }
+ info_add_lock(block, 0);
+ break;
+ case PAGECACHE_LOCK_LEFT_UNLOCKED: /* free -> free */
+ case PAGECACHE_LOCK_LEFT_WRITELOCKED: /* write -> write */
+ break; /* do nothing */
+ default:
+ DBUG_ASSERT(0); /* Never should happened */
+ }
+
+#ifndef DBUG_OFF
+ if (block)
+ PCBLOCK_INFO(block);
+#endif
+ DBUG_RETURN(0);
+retry:
+ DBUG_PRINT("INFO", ("Retry block 0x%lx", (ulong)block));
+ PCBLOCK_INFO(block);
+ DBUG_ASSERT(block->hash_link->requests > 0);
+ block->hash_link->requests--;
+ DBUG_ASSERT(block->requests > 0);
+ unreg_request(pagecache, block, 1);
+ PCBLOCK_INFO(block);
+ DBUG_RETURN(1);
+
+}
+
+
+/*
+ Read into a key cache block buffer from disk.
+
+ SYNOPSIS
+
+ read_block()
+ pagecache pointer to a page cache data structure
+ block block to which buffer the data is to be read
+ primary <-> the current thread will read the data
+ validator validator of read from the disk data
+ validator_data pointer to the data need by the validator
+
+ RETURN VALUE
+ None
+
+ NOTES.
+ The function either reads a page data from file to the block buffer,
+ or waits until another thread reads it. What page to read is determined
+ by a block parameter - reference to a hash link for this page.
+ If an error occurs THE PCBLOCK_ERROR bit is set in the block status.
+*/
+
+static void read_block(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block,
+ my_bool primary,
+ pagecache_disk_read_validator validator,
+ uchar* validator_data)
+{
+ uint got_length;
+
+ /* On entry cache_lock is locked */
+
+ DBUG_ENTER("read_block");
+ if (primary)
+ {
+ /*
+ This code is executed only by threads
+ that submitted primary requests
+ */
+
+ DBUG_PRINT("read_block",
+ ("page to be read by primary request"));
+
+ /* Page is not in buffer yet, is to be read from disk */
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ /*
+ Here other threads may step in and register as secondary readers.
+ They will register in block->wqueue[COND_FOR_REQUESTED].
+ */
+ got_length= pagecache_fread(pagecache, &block->hash_link->file,
+ block->buffer,
+ block->hash_link->pageno, MYF(0));
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ if (got_length < pagecache->block_size)
+ block->status|= PCBLOCK_ERROR;
+ else
+ block->status= PCBLOCK_READ;
+
+ if (validator != NULL &&
+ (*validator)(block->buffer, validator_data))
+ block->status|= PCBLOCK_ERROR;
+
+ DBUG_PRINT("read_block",
+ ("primary request: new page in cache"));
+ /* Signal that all pending requests for this page now can be processed */
+#ifdef THREAD
+ if (block->wqueue[COND_FOR_REQUESTED].last_thread)
+ wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]);
+#endif
+ }
+ else
+ {
+ /*
+ This code is executed only by threads
+ that submitted secondary requests
+ */
+ DBUG_PRINT("read_block",
+ ("secondary request waiting for new page to be read"));
+ {
+#ifdef THREAD
+ struct st_my_thread_var *thread= my_thread_var;
+ /* Put the request into a queue and wait until it can be processed */
+ wqueue_add_to_queue(&block->wqueue[COND_FOR_REQUESTED], thread);
+ do
+ {
+ DBUG_PRINT("read_block: wait",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend,
+ &pagecache->cache_lock);
+ }
+ while (thread->next);
+#else
+ KEYCACHE_DBUG_ASSERT(0);
+ /* No parallel requests in single-threaded case */
+#endif
+ }
+ DBUG_PRINT("read_block",
+ ("secondary request: new page in cache"));
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief Set LSN on the page to the given one if the given LSN is bigger
+
+ @param pagecache pointer to a page cache data structure
+ @param lsn LSN to set
+ @param block block to check and set
+*/
+
+static void check_and_set_lsn(PAGECACHE *pagecache,
+ LSN lsn, PAGECACHE_BLOCK_LINK *block)
+{
+ LSN old;
+ DBUG_ENTER("check_and_set_lsn");
+ DBUG_ASSERT(block->type == PAGECACHE_LSN_PAGE);
+ old= lsn_korr(block->buffer + PAGE_LSN_OFFSET);
+ DBUG_PRINT("info", ("old lsn: (%lu, 0x%lx) new lsn: (%lu, 0x%lx)",
+ LSN_IN_PARTS(old), LSN_IN_PARTS(lsn)));
+ if (cmp_translog_addr(lsn, old) > 0)
+ {
+
+ DBUG_ASSERT(block->type != PAGECACHE_READ_UNKNOWN_PAGE);
+ lsn_store(block->buffer + PAGE_LSN_OFFSET, lsn);
+ /* we stored LSN in page so we dirtied it */
+ if (!(block->status & PCBLOCK_CHANGED))
+ link_to_changed_list(pagecache, block);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Unlock/unpin page and put LSN stamp if it need
+
+ SYNOPSIS
+ pagecache_unlock()
+ pagecache pointer to a page cache data structure
+ file handler for the file for the block of data to be read
+ pageno number of the block of data in the file
+ lock lock change
+ pin pin page
+ first_REDO_LSN_for_page do not set it if it is zero
+ lsn if it is not LSN_IMPOSSIBLE (0) and it
+ is bigger then LSN on the page it will be written on
+ the page
+
+ NOTE
+ Pininig uses requests registration mechanism it works following way:
+ | beginnig | ending |
+ | of func. | of func. |
+ ----------------------------+-------------+---------------+
+ PAGECACHE_PIN_LEFT_PINNED | - | - |
+ PAGECACHE_PIN_LEFT_UNPINNED | reg request | unreg request |
+ PAGECACHE_PIN | reg request | - |
+ PAGECACHE_UNPIN | - | unreg request |
+
+
+*/
+
+void pagecache_unlock(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ enum pagecache_page_lock lock,
+ enum pagecache_page_pin pin,
+ LSN first_REDO_LSN_for_page,
+ LSN lsn)
+{
+ PAGECACHE_BLOCK_LINK *block;
+ int page_st;
+ DBUG_ENTER("pagecache_unlock");
+ DBUG_PRINT("enter", ("fd: %u page: %lu %s %s",
+ (uint) file->file, (ulong) pageno,
+ page_cache_page_lock_str[lock],
+ page_cache_page_pin_str[pin]));
+ /* we do not allow any lock/pin increasing here */
+ DBUG_ASSERT(pin != PAGECACHE_PIN);
+ DBUG_ASSERT(lock != PAGECACHE_LOCK_READ);
+ DBUG_ASSERT(lock != PAGECACHE_LOCK_WRITE);
+
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ /*
+ As soon as we keep lock cache can be used, and we have lock because want
+ to unlock.
+ */
+ DBUG_ASSERT(pagecache->can_be_used);
+
+ inc_counter_for_resize_op(pagecache);
+ /* See NOTE for pagecache_unlock about registering requests */
+ block= find_block(pagecache, file, pageno, 0, 0,
+ test(pin == PAGECACHE_PIN_LEFT_UNPINNED), &page_st);
+ PCBLOCK_INFO(block);
+ DBUG_ASSERT(block != 0 && page_st == PAGE_READ);
+ if (first_REDO_LSN_for_page)
+ {
+ DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK);
+ DBUG_ASSERT(pin == PAGECACHE_UNPIN);
+ if (block->rec_lsn == LSN_MAX)
+ block->rec_lsn= first_REDO_LSN_for_page;
+ else
+ DBUG_ASSERT(cmp_translog_addr(block->rec_lsn,
+ first_REDO_LSN_for_page) <= 0);
+
+ }
+ if (lsn != LSN_IMPOSSIBLE)
+ check_and_set_lsn(pagecache, lsn, block);
+
+ if (make_lock_and_pin(pagecache, block, lock, pin, file))
+ {
+ DBUG_ASSERT(0); /* should not happend */
+ }
+
+ remove_reader(block);
+ /*
+ Link the block into the LRU chain if it's the last submitted request
+ for the block and block will not be pinned.
+ See NOTE for pagecache_unlock about registering requests.
+ */
+ if (pin != PAGECACHE_PIN_LEFT_PINNED)
+ unreg_request(pagecache, block, 1);
+
+ dec_counter_for_resize_op(pagecache);
+
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Unpin page
+
+ SYNOPSIS
+ pagecache_unpin()
+ pagecache pointer to a page cache data structure
+ file handler for the file for the block of data to be read
+ pageno number of the block of data in the file
+ lsn if it is not LSN_IMPOSSIBLE (0) and it
+ is bigger then LSN on the page it will be written on
+ the page
+*/
+
+void pagecache_unpin(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ LSN lsn)
+{
+ PAGECACHE_BLOCK_LINK *block;
+ int page_st;
+ DBUG_ENTER("pagecache_unpin");
+ DBUG_PRINT("enter", ("fd: %u page: %lu",
+ (uint) file->file, (ulong) pageno));
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ /*
+ As soon as we keep lock cache can be used, and we have lock bacause want
+ aunlock.
+ */
+ DBUG_ASSERT(pagecache->can_be_used);
+
+ inc_counter_for_resize_op(pagecache);
+ /* See NOTE for pagecache_unlock about registering requests */
+ block= find_block(pagecache, file, pageno, 0, 0, 0, &page_st);
+ DBUG_ASSERT(block != 0);
+ DBUG_ASSERT(page_st == PAGE_READ);
+
+ if (lsn != LSN_IMPOSSIBLE)
+ check_and_set_lsn(pagecache, lsn, block);
+
+ /*
+ we can just unpin only with keeping read lock because:
+ a) we can't pin without any lock
+ b) we can't unpin keeping write lock
+ */
+ if (make_lock_and_pin(pagecache, block,
+ PAGECACHE_LOCK_LEFT_READLOCKED,
+ PAGECACHE_UNPIN, file))
+ DBUG_ASSERT(0); /* should not happend */
+
+ remove_reader(block);
+ /*
+ Link the block into the LRU chain if it's the last submitted request
+ for the block and block will not be pinned.
+ See NOTE for pagecache_unlock about registering requests
+ */
+ unreg_request(pagecache, block, 1);
+
+ dec_counter_for_resize_op(pagecache);
+
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Unlock/unpin page and put LSN stamp if it need
+ (uses direct block/page pointer)
+
+ SYNOPSIS
+ pagecache_unlock_by_link()
+ pagecache pointer to a page cache data structure
+ link direct link to page (returned by read or write)
+ lock lock change
+ pin pin page
+ first_REDO_LSN_for_page do not set it if it is LSN_IMPOSSIBLE (0)
+ lsn if it is not LSN_IMPOSSIBLE and it is bigger then
+ LSN on the page it will be written on the page
+*/
+
+void pagecache_unlock_by_link(PAGECACHE *pagecache,
+ PAGECACHE_PAGE_LINK *link,
+ enum pagecache_page_lock lock,
+ enum pagecache_page_pin pin,
+ LSN first_REDO_LSN_for_page,
+ LSN lsn)
+{
+ PAGECACHE_BLOCK_LINK *block= (PAGECACHE_BLOCK_LINK *)link;
+ DBUG_ENTER("pagecache_unlock_by_link");
+ DBUG_PRINT("enter", ("block: 0x%lx fd: %u page: %lu %s %s",
+ (ulong) block,
+ (uint) block->hash_link->file.file,
+ (ulong) block->hash_link->pageno,
+ page_cache_page_lock_str[lock],
+ page_cache_page_pin_str[pin]));
+ /*
+ We do not allow any lock/pin increasing here and page can't be
+ unpinned because we use direct link.
+ */
+ DBUG_ASSERT(pin != PAGECACHE_PIN);
+ DBUG_ASSERT(pin != PAGECACHE_PIN_LEFT_UNPINNED);
+ DBUG_ASSERT(lock != PAGECACHE_LOCK_READ);
+ DBUG_ASSERT(lock != PAGECACHE_LOCK_WRITE);
+ if (pin == PAGECACHE_PIN_LEFT_UNPINNED &&
+ lock == PAGECACHE_LOCK_READ_UNLOCK)
+ {
+ /* block do not need here so we do not provide it */
+ if (make_lock_and_pin(pagecache, 0, lock, pin, 0))
+ DBUG_ASSERT(0); /* should not happend */
+ DBUG_VOID_RETURN;
+ }
+
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ /*
+ As soon as we keep lock cache can be used, and we have lock because want
+ unlock.
+ */
+ DBUG_ASSERT(pagecache->can_be_used);
+
+ inc_counter_for_resize_op(pagecache);
+ if (first_REDO_LSN_for_page != LSN_IMPOSSIBLE)
+ {
+ /*
+ LOCK_READ_UNLOCK is ok here as the page may have first locked
+ with WRITE lock that was temporarly converted to READ lock before
+ it's unpinned
+ */
+ DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
+ lock == PAGECACHE_LOCK_READ_UNLOCK);
+ DBUG_ASSERT(pin == PAGECACHE_UNPIN);
+ if (block->rec_lsn == LSN_MAX)
+ block->rec_lsn= first_REDO_LSN_for_page;
+ else
+ DBUG_ASSERT(cmp_translog_addr(block->rec_lsn,
+ first_REDO_LSN_for_page) <= 0);
+ }
+ if (lsn != LSN_IMPOSSIBLE)
+ check_and_set_lsn(pagecache, lsn, block);
+
+ if (make_lock_and_pin(pagecache, block, lock, pin, 0))
+ DBUG_ASSERT(0); /* should not happend */
+
+ /*
+ Link the block into the LRU chain if it's the last submitted request
+ for the block and block will not be pinned.
+ See NOTE for pagecache_unlock about registering requests.
+ */
+ if (pin != PAGECACHE_PIN_LEFT_PINNED)
+ unreg_request(pagecache, block, 1);
+
+ dec_counter_for_resize_op(pagecache);
+
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Unpin page
+ (uses direct block/page pointer)
+
+ SYNOPSIS
+ pagecache_unpin_by_link()
+ pagecache pointer to a page cache data structure
+ link direct link to page (returned by read or write)
+ lsn if it is not LSN_IMPOSSIBLE (0) and it
+ is bigger then LSN on the page it will be written on
+ the page
+*/
+
+void pagecache_unpin_by_link(PAGECACHE *pagecache,
+ PAGECACHE_PAGE_LINK *link,
+ LSN lsn)
+{
+ PAGECACHE_BLOCK_LINK *block= (PAGECACHE_BLOCK_LINK *)link;
+ DBUG_ENTER("pagecache_unpin_by_link");
+ DBUG_PRINT("enter", ("block: 0x%lx fd: %u page: %lu",
+ (ulong) block,
+ (uint) block->hash_link->file.file,
+ (ulong) block->hash_link->pageno));
+
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ /*
+ As soon as we keep lock cache can be used, and we have lock because want
+ unlock.
+ */
+ DBUG_ASSERT(pagecache->can_be_used);
+
+ inc_counter_for_resize_op(pagecache);
+
+ if (lsn != LSN_IMPOSSIBLE)
+ check_and_set_lsn(pagecache, lsn, block);
+
+ /*
+ We can just unpin only with keeping read lock because:
+ a) we can't pin without any lock
+ b) we can't unpin keeping write lock
+ */
+ if (make_lock_and_pin(pagecache, block,
+ PAGECACHE_LOCK_LEFT_READLOCKED,
+ PAGECACHE_UNPIN, 0))
+ DBUG_ASSERT(0); /* should not happend */
+
+ /*
+ Link the block into the LRU chain if it's the last submitted request
+ for the block and block will not be pinned.
+ See NOTE for pagecache_unlock about registering requests.
+ */
+ unreg_request(pagecache, block, 1);
+
+ dec_counter_for_resize_op(pagecache);
+
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Read a block of data from a cached file into a buffer;
+
+ SYNOPSIS
+ pagecache_valid_read()
+ pagecache pointer to a page cache data structure
+ file handler for the file for the block of data to be read
+ pageno number of the block of data in the file
+ level determines the weight of the data
+ buff buffer to where the data must be placed
+ type type of the page
+ lock lock change
+ link link to the page if we pin it
+ validator validator of read from the disk data
+ validator_data pointer to the data need by the validator
+
+ RETURN VALUE
+ Returns address from where the data is placed if successful, 0 - otherwise.
+
+ Pin will be chosen according to lock parameter (see lock_to_pin)
+*/
+static enum pagecache_page_pin lock_to_pin[]=
+{
+ PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_LEFT_UNLOCKED*/,
+ PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_LEFT_READLOCKED*/,
+ PAGECACHE_PIN_LEFT_PINNED /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/,
+ PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_READ*/,
+ PAGECACHE_PIN /*PAGECACHE_LOCK_WRITE*/,
+ PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_READ_UNLOCK*/,
+ PAGECACHE_UNPIN /*PAGECACHE_LOCK_WRITE_UNLOCK*/,
+ PAGECACHE_UNPIN /*PAGECACHE_LOCK_WRITE_TO_READ*/
+};
+
+uchar *pagecache_valid_read(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ uint level,
+ uchar *buff,
+ enum pagecache_page_type type,
+ enum pagecache_page_lock lock,
+ PAGECACHE_PAGE_LINK *link,
+ pagecache_disk_read_validator validator,
+ uchar* validator_data)
+{
+ int error= 0;
+ enum pagecache_page_pin pin= lock_to_pin[lock];
+ PAGECACHE_PAGE_LINK fake_link;
+ DBUG_ENTER("pagecache_valid_read");
+ DBUG_PRINT("enter", ("fd: %u page: %lu level: %u t:%s %s %s",
+ (uint) file->file, (ulong) pageno, level,
+ page_cache_page_type_str[type],
+ page_cache_page_lock_str[lock],
+ page_cache_page_pin_str[pin]));
+
+ if (!link)
+ link= &fake_link;
+ else
+ *link= 0;
+
+restart:
+
+ if (pagecache->can_be_used)
+ {
+ /* Key cache is used */
+ PAGECACHE_BLOCK_LINK *block;
+ uint status;
+ int page_st;
+
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ if (!pagecache->can_be_used)
+ {
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ goto no_key_cache;
+ }
+
+ inc_counter_for_resize_op(pagecache);
+ pagecache->global_cache_r_requests++;
+ /* See NOTE for pagecache_unlock about registering requests. */
+ block= find_block(pagecache, file, pageno, level,
+ test(lock == PAGECACHE_LOCK_WRITE),
+ test((pin == PAGECACHE_PIN_LEFT_UNPINNED) ||
+ (pin == PAGECACHE_PIN)),
+ &page_st);
+ DBUG_ASSERT(block->type == PAGECACHE_EMPTY_PAGE ||
+ block->type == type ||
+ type == PAGECACHE_LSN_PAGE ||
+ type == PAGECACHE_READ_UNKNOWN_PAGE ||
+ block->type == PAGECACHE_READ_UNKNOWN_PAGE);
+ if (type != PAGECACHE_READ_UNKNOWN_PAGE ||
+ block->type == PAGECACHE_EMPTY_PAGE)
+ block->type= type;
+ if (((block->status & PCBLOCK_ERROR) == 0) && (page_st != PAGE_READ))
+ {
+ DBUG_PRINT("info", ("read block 0x%lx", (ulong)block));
+ /* The requested page is to be read into the block buffer */
+ read_block(pagecache, block,
+ (my_bool)(page_st == PAGE_TO_BE_READ),
+ validator, validator_data);
+ DBUG_PRINT("info", ("read is done"));
+ }
+ if (make_lock_and_pin(pagecache, block, lock, pin, file))
+ {
+ /*
+ We failed to write lock the block, cache is unlocked,
+ we will try to get the block again.
+ */
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ DBUG_PRINT("info", ("restarting..."));
+ goto restart;
+ }
+
+ if (! ((status= block->status) & PCBLOCK_ERROR))
+ {
+#if !defined(SERIALIZED_READ_FROM_CACHE)
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+#endif
+
+ DBUG_ASSERT((pagecache->block_size & 511) == 0);
+ /* Copy data from the cache buffer */
+ bmove512(buff, block->buffer, pagecache->block_size);
+
+#if !defined(SERIALIZED_READ_FROM_CACHE)
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+#endif
+ }
+
+ remove_reader(block);
+ /*
+ Link the block into the LRU chain if it's the last submitted request
+ for the block and block will not be pinned.
+ See NOTE for pagecache_unlock about registering requests.
+ */
+ if (pin == PAGECACHE_PIN_LEFT_UNPINNED || pin == PAGECACHE_UNPIN)
+ unreg_request(pagecache, block, 1);
+ else
+ *link= (PAGECACHE_PAGE_LINK)block;
+
+ dec_counter_for_resize_op(pagecache);
+
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+ if (status & PCBLOCK_ERROR)
+ DBUG_RETURN((uchar *) 0);
+
+ DBUG_RETURN(buff);
+ }
+
+no_key_cache: /* Key cache is not used */
+
+ /* We can't use mutex here as the key cache may not be initialized */
+ pagecache->global_cache_r_requests++;
+ pagecache->global_cache_read++;
+ if (pagecache_fread(pagecache, file, (uchar*) buff, pageno, MYF(MY_NABP)))
+ error= 1;
+ DBUG_RETURN(error ? (uchar*) 0 : buff);
+}
+
+
+/*
+ Delete page from the buffer
+
+ SYNOPSIS
+ pagecache_delete()
+ pagecache pointer to a page cache data structure
+ file handler for the file for the block of data to be read
+ pageno number of the block of data in the file
+ lock lock change
+ flush flush page if it is dirty
+
+ RETURN VALUE
+ 0 - deleted or was not present at all
+ 1 - error
+
+ NOTES.
+ lock can be only PAGECACHE_LOCK_LEFT_WRITELOCKED (page was write locked
+ before) or PAGECACHE_LOCK_WRITE (delete will write lock page before delete)
+*/
+my_bool pagecache_delete(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ enum pagecache_page_lock lock,
+ my_bool flush)
+{
+ int error= 0;
+ enum pagecache_page_pin pin= lock_to_pin[lock];
+ DBUG_ENTER("pagecache_delete");
+ DBUG_PRINT("enter", ("fd: %u page: %lu %s %s",
+ (uint) file->file, (ulong) pageno,
+ page_cache_page_lock_str[lock],
+ page_cache_page_pin_str[pin]));
+ DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE ||
+ lock == PAGECACHE_LOCK_LEFT_WRITELOCKED);
+ DBUG_ASSERT(pin == PAGECACHE_PIN ||
+ pin == PAGECACHE_PIN_LEFT_PINNED);
+
+restart:
+
+ if (pagecache->can_be_used)
+ {
+ /* Key cache is used */
+ reg1 PAGECACHE_BLOCK_LINK *block;
+ PAGECACHE_HASH_LINK **unused_start, *link;
+
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ if (!pagecache->can_be_used)
+ goto end;
+
+ inc_counter_for_resize_op(pagecache);
+ link= get_present_hash_link(pagecache, file, pageno, &unused_start);
+ if (!link)
+ {
+ DBUG_PRINT("info", ("There is no such page in the cache"));
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ DBUG_RETURN(0);
+ }
+ block= link->block;
+ /* See NOTE for pagecache_unlock about registering requests. */
+ if (pin == PAGECACHE_PIN)
+ reg_requests(pagecache, block, 1);
+ DBUG_ASSERT(block != 0);
+ if (make_lock_and_pin(pagecache, block, lock, pin, file))
+ {
+ /*
+ We failed to writelock the block, cache is unlocked, and last write
+ lock is released, we will try to get the block again.
+ */
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ DBUG_PRINT("info", ("restarting..."));
+ goto restart;
+ }
+
+ if (block->status & PCBLOCK_CHANGED)
+ {
+ if (flush)
+ {
+ /* The block contains a dirty page - push it out of the cache */
+
+ KEYCACHE_DBUG_PRINT("find_block", ("block is dirty"));
+
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ /*
+ The call is thread safe because only the current
+ thread might change the block->hash_link value
+ */
+ DBUG_ASSERT(block->pins == 1);
+ error= pagecache_fwrite(pagecache,
+ &block->hash_link->file,
+ block->buffer,
+ block->hash_link->pageno,
+ block->type,
+ MYF(MY_NABP | MY_WAIT_IF_FULL));
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ pagecache->global_cache_write++;
+
+ if (error)
+ {
+ block->status|= PCBLOCK_ERROR;
+ goto err;
+ }
+ }
+ pagecache->blocks_changed--;
+ pagecache->global_blocks_changed--;
+ /*
+ free_block() will change the status and rec_lsn of the block so no
+ need to change them here.
+ */
+ }
+ /* Cache is locked, so we can relese page before freeing it */
+ make_lock_and_pin(pagecache, block,
+ PAGECACHE_LOCK_WRITE_UNLOCK,
+ PAGECACHE_UNPIN, file);
+ DBUG_ASSERT(link->requests > 0);
+ link->requests--;
+ /* See NOTE for pagecache_unlock about registering requests. */
+ free_block(pagecache, block);
+
+err:
+ dec_counter_for_resize_op(pagecache);
+end:
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ }
+
+ DBUG_RETURN(error);
+}
+
+
+my_bool pagecache_delete_pages(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ uint page_count,
+ enum pagecache_page_lock lock,
+ my_bool flush)
+{
+ ulong page_end;
+ DBUG_ENTER("pagecache_delete_pages");
+ DBUG_ASSERT(page_count > 0);
+
+ page_end= pageno + page_count;
+ do
+ {
+ if (pagecache_delete(pagecache, file, pageno,
+ lock, flush))
+ DBUG_RETURN(1);
+ } while (++pageno != page_end);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Write a buffer into a cached file.
+
+ SYNOPSIS
+
+ pagecache_write_part()
+ pagecache pointer to a page cache data structure
+ file handler for the file to write data to
+ pageno number of the block of data in the file
+ level determines the weight of the data
+ buff buffer with the data
+ type type of the page
+ lock lock change
+ pin pin page
+ write_mode how to write page
+ link link to the page if we pin it
+
+ RETURN VALUE
+ 0 if a success, 1 - otherwise.
+*/
+
+/* description of how to change lock before and after write */
+struct write_lock_change
+{
+ int need_lock_change; /* need changing of lock at the end of write */
+ enum pagecache_page_lock new_lock; /* lock at the beginning */
+ enum pagecache_page_lock unlock_lock; /* lock at the end */
+};
+
+static struct write_lock_change write_lock_change_table[]=
+{
+ {1,
+ PAGECACHE_LOCK_WRITE,
+ PAGECACHE_LOCK_WRITE_UNLOCK} /*PAGECACHE_LOCK_LEFT_UNLOCKED*/,
+ {0, /*unsupported (we can't write having the block read locked) */
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_LOCK_LEFT_UNLOCKED} /*PAGECACHE_LOCK_LEFT_READLOCKED*/,
+ {0, PAGECACHE_LOCK_LEFT_WRITELOCKED, 0} /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/,
+ {1,
+ PAGECACHE_LOCK_WRITE,
+ PAGECACHE_LOCK_WRITE_TO_READ} /*PAGECACHE_LOCK_READ*/,
+ {0, PAGECACHE_LOCK_WRITE, 0} /*PAGECACHE_LOCK_WRITE*/,
+ {0, /*unsupported (we can't write having the block read locked) */
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_LOCK_LEFT_UNLOCKED} /*PAGECACHE_LOCK_READ_UNLOCK*/,
+ {1,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ PAGECACHE_LOCK_WRITE_UNLOCK } /*PAGECACHE_LOCK_WRITE_UNLOCK*/,
+ {1,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ PAGECACHE_LOCK_WRITE_TO_READ} /*PAGECACHE_LOCK_WRITE_TO_READ*/
+};
+
+/* description of how to change pin before and after write */
+struct write_pin_change
+{
+ enum pagecache_page_pin new_pin; /* pin status at the beginning */
+ enum pagecache_page_pin unlock_pin; /* pin status at the end */
+};
+
+static struct write_pin_change write_pin_change_table[]=
+{
+ {PAGECACHE_PIN_LEFT_PINNED,
+ PAGECACHE_PIN_LEFT_PINNED} /*PAGECACHE_PIN_LEFT_PINNED*/,
+ {PAGECACHE_PIN,
+ PAGECACHE_UNPIN} /*PAGECACHE_PIN_LEFT_UNPINNED*/,
+ {PAGECACHE_PIN,
+ PAGECACHE_PIN_LEFT_PINNED} /*PAGECACHE_PIN*/,
+ {PAGECACHE_PIN_LEFT_PINNED,
+ PAGECACHE_UNPIN} /*PAGECACHE_UNPIN*/
+};
+
+my_bool pagecache_write_part(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ uint level,
+ uchar *buff,
+ enum pagecache_page_type type,
+ enum pagecache_page_lock lock,
+ enum pagecache_page_pin pin,
+ enum pagecache_write_mode write_mode,
+ PAGECACHE_PAGE_LINK *link,
+ uint offset, uint size,
+ pagecache_disk_read_validator validator,
+ uchar* validator_data)
+{
+ PAGECACHE_BLOCK_LINK *block= NULL;
+ PAGECACHE_PAGE_LINK fake_link;
+ int error= 0;
+ int need_lock_change= write_lock_change_table[lock].need_lock_change;
+ DBUG_ENTER("pagecache_write_part");
+ DBUG_PRINT("enter", ("fd: %u page: %lu level: %u type: %s lock: %s "
+ "pin: %s mode: %s offset: %u size %u",
+ (uint) file->file, (ulong) pageno, level,
+ page_cache_page_type_str[type],
+ page_cache_page_lock_str[lock],
+ page_cache_page_pin_str[pin],
+ page_cache_page_write_mode_str[write_mode],
+ offset, size));
+ DBUG_ASSERT(type != PAGECACHE_READ_UNKNOWN_PAGE);
+ DBUG_ASSERT(lock != PAGECACHE_LOCK_LEFT_READLOCKED);
+ DBUG_ASSERT(lock != PAGECACHE_LOCK_READ_UNLOCK);
+ DBUG_ASSERT(offset + size <= pagecache->block_size);
+ if (!link)
+ link= &fake_link;
+ else
+ *link= 0;
+
+restart:
+
+#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
+ DBUG_EXECUTE("check_pagecache",
+ test_key_cache(pagecache, "start of key_cache_write", 1););
+#endif
+
+ if (pagecache->can_be_used)
+ {
+ /* Key cache is used */
+ int page_st;
+
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ if (!pagecache->can_be_used)
+ {
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ goto no_key_cache;
+ }
+
+ inc_counter_for_resize_op(pagecache);
+ pagecache->global_cache_w_requests++;
+ /* See NOTE for pagecache_unlock about registering requests. */
+ block= find_block(pagecache, file, pageno, level,
+ test(write_mode != PAGECACHE_WRITE_DONE &&
+ lock != PAGECACHE_LOCK_LEFT_WRITELOCKED &&
+ lock != PAGECACHE_LOCK_WRITE_UNLOCK &&
+ lock != PAGECACHE_LOCK_WRITE_TO_READ),
+ test((pin == PAGECACHE_PIN_LEFT_UNPINNED) ||
+ (pin == PAGECACHE_PIN)),
+ &page_st);
+ if (!block)
+ {
+ DBUG_ASSERT(write_mode != PAGECACHE_WRITE_DONE);
+ /* It happens only for requests submitted during resize operation */
+ dec_counter_for_resize_op(pagecache);
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ /* Write to the disk key cache is in resize at the moment*/
+ goto no_key_cache;
+ }
+
+ DBUG_ASSERT(block->type == PAGECACHE_EMPTY_PAGE ||
+ block->type == PAGECACHE_READ_UNKNOWN_PAGE ||
+ block->type == type ||
+ (block->type == PAGECACHE_PLAIN_PAGE &&
+ type == PAGECACHE_LSN_PAGE));
+ block->type= type;
+
+ if (make_lock_and_pin(pagecache, block,
+ write_lock_change_table[lock].new_lock,
+ (need_lock_change ?
+ write_pin_change_table[pin].new_pin :
+ pin), file))
+ {
+ /*
+ We failed to writelock the block, cache is unlocked, and last write
+ lock is released, we will try to get the block again.
+ */
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ DBUG_PRINT("info", ("restarting..."));
+ goto restart;
+ }
+
+ if (write_mode == PAGECACHE_WRITE_DONE)
+ {
+ if (!(block->status & PCBLOCK_ERROR))
+ {
+ /* Copy data from buff */
+ if (!(size & 511))
+ bmove512(block->buffer + offset, buff, size);
+ else
+ memcpy(block->buffer + offset, buff, size);
+ block->status= PCBLOCK_READ;
+ /*
+ The validator can change the page content (removing page
+ protection) so it have to be called
+ */
+ if (validator != NULL &&
+ (*validator)(block->buffer, validator_data))
+ block->status|= PCBLOCK_ERROR;
+ KEYCACHE_DBUG_PRINT("key_cache_insert",
+ ("Page injection"));
+#ifdef THREAD
+ /* Signal that all pending requests for this now can be processed. */
+ if (block->wqueue[COND_FOR_REQUESTED].last_thread)
+ wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]);
+#endif
+ }
+ }
+ else
+ {
+ DBUG_ASSERT(validator == 0 && validator_data == 0);
+ if (! (block->status & PCBLOCK_CHANGED))
+ link_to_changed_list(pagecache, block);
+
+ if (! (block->status & PCBLOCK_ERROR))
+ {
+ if (!(size & 511))
+ bmove512(block->buffer + offset, buff, size);
+ else
+ memcpy(block->buffer + offset, buff, size);
+ block->status|= PCBLOCK_READ;
+ }
+ }
+
+ if (need_lock_change)
+ {
+ /*
+ We don't set rec_lsn of the block; this is ok as for the
+ Maria-block-record's pages, we always keep pages pinned here.
+ */
+ if (make_lock_and_pin(pagecache, block,
+ write_lock_change_table[lock].unlock_lock,
+ write_pin_change_table[pin].unlock_pin, file))
+ DBUG_ASSERT(0);
+ }
+
+ /* Unregister the request */
+ DBUG_ASSERT(block->hash_link->requests > 0);
+ block->hash_link->requests--;
+ /* See NOTE for pagecache_unlock about registering requests. */
+ if (pin == PAGECACHE_PIN_LEFT_UNPINNED || pin == PAGECACHE_UNPIN)
+ unreg_request(pagecache, block, 1);
+ else
+ *link= (PAGECACHE_PAGE_LINK)block;
+
+ if (block->status & PCBLOCK_ERROR)
+ error= 1;
+
+ dec_counter_for_resize_op(pagecache);
+
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+ goto end;
+ }
+
+no_key_cache:
+ /* Key cache is not used */
+ if (write_mode == PAGECACHE_WRITE_DELAY)
+ {
+ pagecache->global_cache_w_requests++;
+ pagecache->global_cache_write++;
+ if (pagecache_fwrite(pagecache, file, (uchar*) buff, pageno, type,
+ MYF(MY_NABP | MY_WAIT_IF_FULL)))
+ error=1;
+ }
+
+end:
+#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
+ DBUG_EXECUTE("exec",
+ test_key_cache(pagecache, "end of key_cache_write", 1););
+#endif
+ if (block)
+ PCBLOCK_INFO(block);
+ else
+ DBUG_PRINT("info", ("No block"));
+ DBUG_RETURN(error);
+}
+
+
+/*
+ Free block: remove reference to it from hash table,
+ remove it from the chain file of dirty/clean blocks
+ and add it to the free list.
+*/
+
+static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block)
+{
+ KEYCACHE_THREAD_TRACE("free block");
+ KEYCACHE_DBUG_PRINT("free_block",
+ ("block: %u hash_link 0x%lx",
+ PCBLOCK_NUMBER(pagecache, block),
+ (long) block->hash_link));
+ if (block->hash_link)
+ {
+ /*
+ While waiting for readers to finish, new readers might request the
+ block. But since we set block->status|= PCBLOCK_REASSIGNED, they
+ will wait on block->wqueue[COND_FOR_SAVED]. They must be signalled
+ later.
+ */
+ block->status|= PCBLOCK_REASSIGNED;
+ wait_for_readers(pagecache, block);
+ unlink_hash(pagecache, block->hash_link);
+ }
+
+ unlink_changed(block);
+ DBUG_ASSERT(block->wlocks == 0);
+ DBUG_ASSERT(block->pins == 0);
+ block->status= 0;
+#ifndef DBUG_OFF
+ block->type= PAGECACHE_EMPTY_PAGE;
+#endif
+ block->rec_lsn= LSN_MAX;
+ KEYCACHE_THREAD_TRACE("free block");
+ KEYCACHE_DBUG_PRINT("free_block",
+ ("block is freed"));
+ unreg_request(pagecache, block, 0);
+ block->hash_link= NULL;
+
+ /* Remove the free block from the LRU ring. */
+ unlink_block(pagecache, block);
+ if (block->temperature == PCBLOCK_WARM)
+ pagecache->warm_blocks--;
+ block->temperature= PCBLOCK_COLD;
+ /* Insert the free block in the free list. */
+ block->next_used= pagecache->free_block_list;
+ pagecache->free_block_list= block;
+ /* Keep track of the number of currently unused blocks. */
+ pagecache->blocks_unused++;
+
+#ifdef THREAD
+ /* All pending requests for this page must be resubmitted. */
+ if (block->wqueue[COND_FOR_SAVED].last_thread)
+ wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]);
+#endif
+}
+
+
+static int cmp_sec_link(PAGECACHE_BLOCK_LINK **a, PAGECACHE_BLOCK_LINK **b)
+{
+ return (((*a)->hash_link->pageno < (*b)->hash_link->pageno) ? -1 :
+ ((*a)->hash_link->pageno > (*b)->hash_link->pageno) ? 1 : 0);
+}
+
+
+/*
+ Flush a portion of changed blocks to disk,
+ free used blocks if requested
+*/
+
+static int flush_cached_blocks(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ PAGECACHE_BLOCK_LINK **cache,
+ PAGECACHE_BLOCK_LINK **end,
+ enum flush_type type)
+{
+ int error;
+ int last_errno= 0;
+ uint count= (uint) (end-cache);
+ DBUG_ENTER("flush_cached_blocks");
+
+ /* Don't lock the cache during the flush */
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ /*
+ As all blocks referred in 'cache' are marked by PCBLOCK_IN_FLUSH
+ we are guarantied no thread will change them
+ */
+ qsort((uchar*) cache, count, sizeof(*cache), (qsort_cmp) cmp_sec_link);
+
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ for (; cache != end; cache++)
+ {
+ PAGECACHE_BLOCK_LINK *block= *cache;
+
+ if (block->pins)
+ {
+ KEYCACHE_DBUG_PRINT("flush_cached_blocks",
+ ("block: %u (0x%lx) pinned",
+ PCBLOCK_NUMBER(pagecache, block), (ulong)block));
+ DBUG_PRINT("info", ("block: %u (0x%lx) pinned",
+ PCBLOCK_NUMBER(pagecache, block), (ulong)block));
+ PCBLOCK_INFO(block);
+ last_errno= -1;
+ unreg_request(pagecache, block, 1);
+ continue;
+ }
+ /* if the block is not pinned then it is not write locked */
+ DBUG_ASSERT(block->wlocks == 0);
+ DBUG_ASSERT(block->pins == 0);
+ if (make_lock_and_pin(pagecache, block,
+ PAGECACHE_LOCK_WRITE, PAGECACHE_PIN, 0))
+ DBUG_ASSERT(0);
+
+ KEYCACHE_DBUG_PRINT("flush_cached_blocks",
+ ("block: %u (0x%lx) to be flushed",
+ PCBLOCK_NUMBER(pagecache, block), (ulong)block));
+ DBUG_PRINT("info", ("block: %u (0x%lx) to be flushed",
+ PCBLOCK_NUMBER(pagecache, block), (ulong)block));
+ PCBLOCK_INFO(block);
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ DBUG_PRINT("info", ("block: %u (0x%lx) pins: %u",
+ PCBLOCK_NUMBER(pagecache, block), (ulong)block,
+ block->pins));
+ DBUG_ASSERT(block->pins == 1);
+ error= pagecache_fwrite(pagecache, file,
+ block->buffer,
+ block->hash_link->pageno,
+ block->type,
+ MYF(MY_NABP | MY_WAIT_IF_FULL));
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+
+ make_lock_and_pin(pagecache, block,
+ PAGECACHE_LOCK_WRITE_UNLOCK,
+ PAGECACHE_UNPIN, 0);
+
+ pagecache->global_cache_write++;
+ if (error)
+ {
+ block->status|= PCBLOCK_ERROR;
+ if (!last_errno)
+ last_errno= errno ? errno : -1;
+ }
+#ifdef THREAD
+ /*
+ Let to proceed for possible waiting requests to write to the block page.
+ It might happen only during an operation to resize the key cache.
+ */
+ if (block->wqueue[COND_FOR_SAVED].last_thread)
+ wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]);
+#endif
+ /* type will never be FLUSH_IGNORE_CHANGED here */
+ if (! (type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE))
+ {
+ pagecache->blocks_changed--;
+ pagecache->global_blocks_changed--;
+ free_block(pagecache, block);
+ }
+ else
+ {
+ block->status&= ~PCBLOCK_IN_FLUSH;
+ link_to_file_list(pagecache, block, file, 1);
+ unreg_request(pagecache, block, 1);
+ }
+ }
+ DBUG_RETURN(last_errno);
+}
+
+
+/**
+ @brief flush all key blocks for a file to disk but don't do any mutex locks
+
+ @param pagecache pointer to a pagecache data structure
+ @param file handler for the file to flush to
+ @param flush_type type of the flush
+
+ @note
+ This function doesn't do any mutex locks because it needs to be called
+ both from flush_pagecache_blocks and flush_all_key_blocks (the later one
+ does the mutex lock in the resize_pagecache() function).
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+static int flush_pagecache_blocks_int(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ enum flush_type type)
+{
+ PAGECACHE_BLOCK_LINK *cache_buff[FLUSH_CACHE],**cache;
+ int last_errno= 0;
+ DBUG_ENTER("flush_pagecache_blocks_int");
+ DBUG_PRINT("enter",("file: %d blocks_used: %lu blocks_changed: %lu",
+ file->file, pagecache->blocks_used, pagecache->blocks_changed));
+
+#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
+ DBUG_EXECUTE("check_pagecache",
+ test_key_cache(pagecache,
+ "start of flush_pagecache_blocks", 0););
+#endif
+
+ cache= cache_buff;
+ if (pagecache->disk_blocks > 0 &&
+ (!my_disable_flush_pagecache_blocks || type != FLUSH_KEEP))
+ {
+ /* Key cache exists and flush is not disabled */
+ int error= 0;
+ uint count= 0;
+ PAGECACHE_BLOCK_LINK **pos, **end;
+ PAGECACHE_BLOCK_LINK *first_in_switch= NULL;
+ PAGECACHE_BLOCK_LINK *block, *next;
+#if defined(PAGECACHE_DEBUG)
+ uint cnt= 0;
+#endif
+ uint8 changed_blocks_is_incomplete_incremented= 0;
+
+ if (type != FLUSH_IGNORE_CHANGED)
+ {
+ /*
+ Count how many key blocks we have to cache to be able
+ to flush all dirty pages with minimum seek moves
+ */
+ for (block= pagecache->changed_blocks[FILE_HASH(*file)] ;
+ block;
+ block= block->next_changed)
+ {
+ if (block->hash_link->file.file == file->file)
+ {
+ count++;
+ KEYCACHE_DBUG_ASSERT(count<= pagecache->blocks_used);
+ }
+ }
+ /* Allocate a new buffer only if its bigger than the one we have */
+ if (count > FLUSH_CACHE &&
+ !(cache=
+ (PAGECACHE_BLOCK_LINK**)
+ my_malloc(sizeof(PAGECACHE_BLOCK_LINK*)*count, MYF(0))))
+ {
+ cache= cache_buff;
+ count= FLUSH_CACHE;
+ }
+ }
+
+ /* Retrieve the blocks and write them to a buffer to be flushed */
+restart:
+ end= (pos= cache)+count;
+ for (block= pagecache->changed_blocks[FILE_HASH(*file)] ;
+ block;
+ block= next)
+ {
+#if defined(PAGECACHE_DEBUG)
+ cnt++;
+ KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used);
+#endif
+ next= block->next_changed;
+ if (block->hash_link->file.file == file->file)
+ {
+ /*
+ Mark the block with BLOCK_IN_FLUSH in order not to let
+ other threads to use it for new pages and interfere with
+ our sequence of flushing dirty file pages
+ */
+ block->status|= PCBLOCK_IN_FLUSH;
+
+ if (! (block->status & PCBLOCK_IN_SWITCH))
+ {
+ /*
+ We care only for the blocks for which flushing was not
+ initiated by other threads as a result of page swapping
+ */
+ reg_requests(pagecache, block, 1);
+ if (type != FLUSH_IGNORE_CHANGED)
+ {
+ /* It's not a temporary file */
+ if (pos == end)
+ {
+ /*
+ This happens only if there is not enough
+ memory for the big block
+ */
+ if ((error= flush_cached_blocks(pagecache, file, cache,
+ end,type)))
+ last_errno=error;
+ DBUG_PRINT("info", ("restarting..."));
+ /*
+ Restart the scan as some other thread might have changed
+ the changed blocks chain: the blocks that were in switch
+ state before the flush started have to be excluded
+ */
+ goto restart;
+ }
+ *pos++= block;
+ }
+ else
+ {
+ /* It's a temporary file */
+ pagecache->blocks_changed--;
+ pagecache->global_blocks_changed--;
+ free_block(pagecache, block);
+ }
+ }
+ else
+ {
+ /* Link the block into a list of blocks 'in switch' */
+ unlink_changed(block);
+ link_changed(block, &first_in_switch);
+ /*
+ We have just removed a page from the list of dirty pages
+ ("changed_blocks") though it's still dirty (the flush by another
+ thread has not yet happened). Checkpoint will miss the page and so
+ must be blocked until that flush has happened.
+ Note that if there are two concurrent
+ flush_pagecache_blocks_int() on this file, then the first one may
+ move the block into its first_in_switch, and the second one would
+ just not see the block and wrongly consider its job done.
+ @todo RECOVERY Maria does protect such flushes with intern_lock,
+ but Checkpoint does not (Checkpoint makes sure that
+ changed_blocks_is_incomplete is 0 when it starts, but as
+ flush_cached_blocks() releases mutex, this may change...
+ */
+ /**
+ @todo RECOVERY: check all places where we remove a page from the
+ list of dirty pages
+ */
+ if (unlikely(!changed_blocks_is_incomplete_incremented))
+ {
+ changed_blocks_is_incomplete_incremented= 1;
+ changed_blocks_is_incomplete++;
+ }
+ }
+ }
+ }
+ if (pos != cache)
+ {
+ if ((error= flush_cached_blocks(pagecache, file, cache, pos, type)))
+ last_errno= error;
+ }
+ /* Wait until list of blocks in switch is empty */
+ while (first_in_switch)
+ {
+#if defined(PAGECACHE_DEBUG)
+ cnt= 0;
+#endif
+ block= first_in_switch;
+ {
+#ifdef THREAD
+ struct st_my_thread_var *thread= my_thread_var;
+ wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread);
+ do
+ {
+ KEYCACHE_DBUG_PRINT("flush_pagecache_blocks_int: wait",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend,
+ &pagecache->cache_lock);
+ }
+ while (thread->next);
+#else
+ KEYCACHE_DBUG_ASSERT(0);
+ /* No parallel requests in single-threaded case */
+#endif
+ }
+#if defined(PAGECACHE_DEBUG)
+ cnt++;
+ KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used);
+#endif
+ }
+ changed_blocks_is_incomplete-=
+ changed_blocks_is_incomplete_incremented;
+ /* The following happens very seldom */
+ if (! (type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE))
+ {
+#if defined(PAGECACHE_DEBUG)
+ cnt=0;
+#endif
+ for (block= pagecache->file_blocks[FILE_HASH(*file)] ;
+ block;
+ block= next)
+ {
+#if defined(PAGECACHE_DEBUG)
+ cnt++;
+ KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used);
+#endif
+ next= block->next_changed;
+ if (block->hash_link->file.file == file->file &&
+ (! (block->status & PCBLOCK_CHANGED)
+ || type == FLUSH_IGNORE_CHANGED))
+ {
+ reg_requests(pagecache, block, 1);
+ free_block(pagecache, block);
+ }
+ }
+ }
+ }
+
+#ifndef DBUG_OFF
+ DBUG_EXECUTE("check_pagecache",
+ test_key_cache(pagecache, "end of flush_pagecache_blocks", 0););
+#endif
+ if (cache != cache_buff)
+ my_free((uchar*) cache, MYF(0));
+ if (last_errno)
+ errno=last_errno; /* Return first error */
+ DBUG_RETURN(last_errno != 0);
+}
+
+
+/*
+ Flush all blocks for a file to disk
+
+ SYNOPSIS
+
+ flush_pagecache_blocks()
+ pagecache pointer to a page cache data structure
+ file handler for the file to flush to
+ flush_type type of the flush
+
+ RETURN
+ 0 OK
+ 1 error
+*/
+
+int flush_pagecache_blocks(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file, enum flush_type type)
+{
+ int res;
+ DBUG_ENTER("flush_pagecache_blocks");
+ DBUG_PRINT("enter", ("pagecache: 0x%lx", (long) pagecache));
+
+ if (pagecache->disk_blocks <= 0)
+ DBUG_RETURN(0);
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ inc_counter_for_resize_op(pagecache);
+ res= flush_pagecache_blocks_int(pagecache, file, type);
+ dec_counter_for_resize_op(pagecache);
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Reset the counters of a key cache.
+
+ SYNOPSIS
+ reset_pagecache_counters()
+ name the name of a key cache
+ pagecache pointer to the pagecache to be reset
+
+ DESCRIPTION
+ This procedure is used to reset the counters of all currently used key
+ caches, both the default one and the named ones.
+
+ RETURN
+ 0 on success (always because it can't fail)
+*/
+
+int reset_pagecache_counters(const char *name, PAGECACHE *pagecache)
+{
+ DBUG_ENTER("reset_pagecache_counters");
+ if (!pagecache->inited)
+ {
+ DBUG_PRINT("info", ("Key cache %s not initialized.", name));
+ DBUG_RETURN(0);
+ }
+ DBUG_PRINT("info", ("Resetting counters for key cache %s.", name));
+
+ pagecache->global_blocks_changed= 0; /* Key_blocks_not_flushed */
+ pagecache->global_cache_r_requests= 0; /* Key_read_requests */
+ pagecache->global_cache_read= 0; /* Key_reads */
+ pagecache->global_cache_w_requests= 0; /* Key_write_requests */
+ pagecache->global_cache_write= 0; /* Key_writes */
+ DBUG_RETURN(0);
+}
+
+
+/**
+ @brief Allocates a buffer and stores in it some info about all dirty pages
+
+ Does the allocation because the caller cannot know the size itself.
+ Memory freeing is to be done by the caller (if the "str" member of the
+ LEX_STRING is not NULL).
+ Ignores all pages of another type than PAGECACHE_LSN_PAGE, because they
+ are not interesting for a checkpoint record.
+ The caller has the intention of doing checkpoints.
+
+ @param pagecache pointer to the page cache
+ @param[out] str pointer to where the allocated buffer, and
+ its size, will be put
+ @param[out] min_rec_lsn pointer to where the minimum rec_lsn of all
+ relevant dirty pages will be put
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool pagecache_collect_changed_blocks_with_lsn(PAGECACHE *pagecache,
+ LEX_STRING *str,
+ LSN *min_rec_lsn)
+{
+ my_bool error= 0;
+ uint stored_list_size= 0;
+ uint file_hash;
+ char *ptr;
+ LSN minimum_rec_lsn= LSN_MAX;
+ DBUG_ENTER("pagecache_collect_changed_blocks_with_LSN");
+
+ DBUG_ASSERT(NULL == str->str);
+ /*
+ We lock the entire cache but will be quick, just reading/writing a few MBs
+ of memory at most.
+ */
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ while (changed_blocks_is_incomplete > 0)
+ {
+ /*
+ Some pages are more recent in memory than on disk (=dirty) and are not
+ in "changed_blocks" so we cannot know them. Wait.
+ */
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ sleep(1);
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ }
+
+ /* Count how many dirty pages are interesting */
+ for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
+ {
+ PAGECACHE_BLOCK_LINK *block;
+ for (block= pagecache->changed_blocks[file_hash] ;
+ block;
+ block= block->next_changed)
+ {
+ /*
+ Q: is there something subtle with block->hash_link: can it be NULL?
+ does it have to be == hash_link->block... ?
+ */
+ DBUG_ASSERT(block->hash_link != NULL);
+ DBUG_ASSERT(block->status & PCBLOCK_CHANGED);
+ if (block->type != PAGECACHE_LSN_PAGE)
+ continue; /* no need to store it */
+ stored_list_size++;
+ }
+ }
+
+ compile_time_assert(sizeof(pagecache->blocks == 4));
+ str->length= 4 + /* number of dirty pages */
+ (4 + /* file */
+ 4 + /* pageno */
+ LSN_STORE_SIZE /* rec_lsn */
+ ) * stored_list_size;
+ if (NULL == (str->str= my_malloc(str->length, MYF(MY_WME))))
+ goto err;
+ ptr= str->str;
+ int4store(ptr, stored_list_size);
+ ptr+= 4;
+ if (!stored_list_size)
+ goto end;
+ for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
+ {
+ PAGECACHE_BLOCK_LINK *block;
+ for (block= pagecache->changed_blocks[file_hash] ;
+ block;
+ block= block->next_changed)
+ {
+ if (block->type != PAGECACHE_LSN_PAGE)
+ continue; /* no need to store it in the checkpoint record */
+ compile_time_assert((4 == sizeof(block->hash_link->file.file)));
+ compile_time_assert((4 == sizeof(block->hash_link->pageno)));
+ int4store(ptr, block->hash_link->file.file);
+ ptr+= 4;
+ int4store(ptr, block->hash_link->pageno);
+ ptr+= 4;
+ lsn_store(ptr, block->rec_lsn);
+ ptr+= LSN_STORE_SIZE;
+ if (block->rec_lsn != LSN_MAX)
+ {
+ DBUG_ASSERT(LSN_VALID(block->rec_lsn));
+ if (cmp_translog_addr(block->rec_lsn, minimum_rec_lsn) < 0)
+ minimum_rec_lsn= block->rec_lsn;
+ } /* otherwise, some trn->rec_lsn should hold the correct info */
+ }
+ }
+end:
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ *min_rec_lsn= minimum_rec_lsn;
+ DBUG_RETURN(error);
+
+err:
+ error= 1;
+ goto end;
+}
+
+
+#ifndef DBUG_OFF
+/*
+ Test if disk-cache is ok
+*/
+static void test_key_cache(PAGECACHE *pagecache __attribute__((unused)),
+ const char *where __attribute__((unused)),
+ my_bool lock __attribute__((unused)))
+{
+ /* TODO */
+}
+#endif
+
+#if defined(PAGECACHE_TIMEOUT)
+
+#define KEYCACHE_DUMP_FILE "pagecache_dump.txt"
+#define MAX_QUEUE_LEN 100
+
+
+static void pagecache_dump(PAGECACHE *pagecache)
+{
+ FILE *pagecache_dump_file=fopen(KEYCACHE_DUMP_FILE, "w");
+ struct st_my_thread_var *last;
+ struct st_my_thread_var *thread;
+ PAGECACHE_BLOCK_LINK *block;
+ PAGECACHE_HASH_LINK *hash_link;
+ PAGECACHE_PAGE *page;
+ uint i;
+
+ fprintf(pagecache_dump_file, "thread:%u\n", thread->id);
+
+ i=0;
+ thread=last=waiting_for_hash_link.last_thread;
+ fprintf(pagecache_dump_file, "queue of threads waiting for hash link\n");
+ if (thread)
+ do
+ {
+ thread= thread->next;
+ page= (PAGECACHE_PAGE *) thread->opt_info;
+ fprintf(pagecache_dump_file,
+ "thread:%u, (file,pageno)=(%u,%lu)\n",
+ thread->id,(uint) page->file.file,(ulong) page->pageno);
+ if (++i == MAX_QUEUE_LEN)
+ break;
+ }
+ while (thread != last);
+
+ i=0;
+ thread=last=waiting_for_block.last_thread;
+ fprintf(pagecache_dump_file, "queue of threads waiting for block\n");
+ if (thread)
+ do
+ {
+ thread=thread->next;
+ hash_link= (PAGECACHE_HASH_LINK *) thread->opt_info;
+ fprintf(pagecache_dump_file,
+ "thread:%u hash_link:%u (file,pageno)=(%u,%lu)\n",
+ thread->id, (uint) PAGECACHE_HASH_LINK_NUMBER(pagecache, hash_link),
+ (uint) hash_link->file.file,(ulong) hash_link->pageno);
+ if (++i == MAX_QUEUE_LEN)
+ break;
+ }
+ while (thread != last);
+
+ for (i=0 ; i < pagecache->blocks_used ; i++)
+ {
+ int j;
+ block= &pagecache->block_root[i];
+ hash_link= block->hash_link;
+ fprintf(pagecache_dump_file,
+ "block:%u hash_link:%d status:%x #requests=%u waiting_for_readers:%d\n",
+ i, (int) (hash_link ?
+ PAGECACHE_HASH_LINK_NUMBER(pagecache, hash_link) :
+ -1),
+ block->status, block->requests, block->condvar ? 1 : 0);
+ for (j=0 ; j < COND_SIZE; j++)
+ {
+ PAGECACHE_WQUEUE *wqueue=&block->wqueue[j];
+ thread= last= wqueue->last_thread;
+ fprintf(pagecache_dump_file, "queue #%d\n", j);
+ if (thread)
+ {
+ do
+ {
+ thread=thread->next;
+ fprintf(pagecache_dump_file,
+ "thread:%u\n", thread->id);
+ if (++i == MAX_QUEUE_LEN)
+ break;
+ }
+ while (thread != last);
+ }
+ }
+ }
+ fprintf(pagecache_dump_file, "LRU chain:");
+ block= pagecache= used_last;
+ if (block)
+ {
+ do
+ {
+ block= block->next_used;
+ fprintf(pagecache_dump_file,
+ "block:%u, ", PCBLOCK_NUMBER(pagecache, block));
+ }
+ while (block != pagecache->used_last);
+ }
+ fprintf(pagecache_dump_file, "\n");
+
+ fclose(pagecache_dump_file);
+}
+
+#endif /* defined(PAGECACHE_TIMEOUT) */
+
+#if defined(PAGECACHE_TIMEOUT) && !defined(__WIN__)
+
+
+static int pagecache_pthread_cond_wait(pthread_cond_t *cond,
+ pthread_mutex_t *mutex)
+{
+ int rc;
+ struct timeval now; /* time when we started waiting */
+ struct timespec timeout; /* timeout value for the wait function */
+ struct timezone tz;
+#if defined(PAGECACHE_DEBUG)
+ int cnt=0;
+#endif
+
+ /* Get current time */
+ gettimeofday(&now, &tz);
+ /* Prepare timeout value */
+ timeout.tv_sec= now.tv_sec + PAGECACHE_TIMEOUT;
+ /*
+ timeval uses microseconds.
+ timespec uses nanoseconds.
+ 1 nanosecond = 1000 micro seconds
+ */
+ timeout.tv_nsec= now.tv_usec * 1000;
+ KEYCACHE_THREAD_TRACE_END("started waiting");
+#if defined(PAGECACHE_DEBUG)
+ cnt++;
+ if (cnt % 100 == 0)
+ fprintf(pagecache_debug_log, "waiting...\n");
+ fflush(pagecache_debug_log);
+#endif
+ rc= pthread_cond_timedwait(cond, mutex, &timeout);
+ KEYCACHE_THREAD_TRACE_BEGIN("finished waiting");
+ if (rc == ETIMEDOUT || rc == ETIME)
+ {
+#if defined(PAGECACHE_DEBUG)
+ fprintf(pagecache_debug_log,"aborted by pagecache timeout\n");
+ fclose(pagecache_debug_log);
+ abort();
+#endif
+ pagecache_dump();
+ }
+
+#if defined(PAGECACHE_DEBUG)
+ KEYCACHE_DBUG_ASSERT(rc != ETIMEDOUT);
+#else
+ assert(rc != ETIMEDOUT);
+#endif
+ return rc;
+}
+#else
+#if defined(PAGECACHE_DEBUG)
+static int pagecache_pthread_cond_wait(pthread_cond_t *cond,
+ pthread_mutex_t *mutex)
+{
+ int rc;
+ KEYCACHE_THREAD_TRACE_END("started waiting");
+ rc= pthread_cond_wait(cond, mutex);
+ KEYCACHE_THREAD_TRACE_BEGIN("finished waiting");
+ return rc;
+}
+#endif
+#endif /* defined(PAGECACHE_TIMEOUT) && !defined(__WIN__) */
+
+#if defined(PAGECACHE_DEBUG)
+static int ___pagecache_pthread_mutex_lock(pthread_mutex_t *mutex)
+{
+ int rc;
+ rc= pthread_mutex_lock(mutex);
+ KEYCACHE_THREAD_TRACE_BEGIN("");
+ return rc;
+}
+
+
+static void ___pagecache_pthread_mutex_unlock(pthread_mutex_t *mutex)
+{
+ KEYCACHE_THREAD_TRACE_END("");
+ pthread_mutex_unlock(mutex);
+}
+
+
+static int ___pagecache_pthread_cond_signal(pthread_cond_t *cond)
+{
+ int rc;
+ KEYCACHE_THREAD_TRACE("signal");
+ rc= pthread_cond_signal(cond);
+ return rc;
+}
+
+
+#if defined(PAGECACHE_DEBUG_LOG)
+
+
+static void pagecache_debug_print(const char * fmt, ...)
+{
+ va_list args;
+ va_start(args,fmt);
+ if (pagecache_debug_log)
+ {
+ VOID(vfprintf(pagecache_debug_log, fmt, args));
+ VOID(fputc('\n',pagecache_debug_log));
+ }
+ va_end(args);
+}
+#endif /* defined(PAGECACHE_DEBUG_LOG) */
+
+#if defined(PAGECACHE_DEBUG_LOG)
+
+
+void pagecache_debug_log_close(void)
+{
+ if (pagecache_debug_log)
+ fclose(pagecache_debug_log);
+}
+#endif /* defined(PAGECACHE_DEBUG_LOG) */
+
+#endif /* defined(PAGECACHE_DEBUG) */
diff --git a/storage/maria/ma_pagecache.h b/storage/maria/ma_pagecache.h
new file mode 100644
index 00000000000..0e2aff3644d
--- /dev/null
+++ b/storage/maria/ma_pagecache.h
@@ -0,0 +1,267 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Page cache variable structures */
+
+#ifndef _ma_pagecache_h
+#define _ma_pagecache_h
+C_MODE_START
+
+#include "ma_loghandler_lsn.h"
+#include <m_string.h>
+
+/* Type of the page */
+enum pagecache_page_type
+{
+ /*
+ Used only for control page type changing during debugging. This define
+ should only be using when using DBUG.
+ */
+ PAGECACHE_EMPTY_PAGE,
+ /* the page does not contain LSN */
+ PAGECACHE_PLAIN_PAGE,
+ /* the page contain LSN (maria tablespace page) */
+ PAGECACHE_LSN_PAGE,
+ /* Page type used when scanning file and we don't care about the type */
+ PAGECACHE_READ_UNKNOWN_PAGE
+};
+
+/*
+ This enum describe lock status changing. every type of page cache will
+ interpret WRITE/READ lock as it need.
+*/
+enum pagecache_page_lock
+{
+ PAGECACHE_LOCK_LEFT_UNLOCKED, /* free -> free */
+ PAGECACHE_LOCK_LEFT_READLOCKED, /* read -> read */
+ PAGECACHE_LOCK_LEFT_WRITELOCKED, /* write -> write */
+ PAGECACHE_LOCK_READ, /* free -> read */
+ PAGECACHE_LOCK_WRITE, /* free -> write */
+ PAGECACHE_LOCK_READ_UNLOCK, /* read -> free */
+ PAGECACHE_LOCK_WRITE_UNLOCK, /* write -> free */
+ PAGECACHE_LOCK_WRITE_TO_READ /* write -> read */
+};
+/*
+ This enum describe pin status changing
+*/
+enum pagecache_page_pin
+{
+ PAGECACHE_PIN_LEFT_PINNED, /* pinned -> pinned */
+ PAGECACHE_PIN_LEFT_UNPINNED, /* unpinned -> unpinned */
+ PAGECACHE_PIN, /* unpinned -> pinned */
+ PAGECACHE_UNPIN /* pinned -> unpinned */
+};
+/* How to write the page */
+enum pagecache_write_mode
+{
+ /* do not write immediately, i.e. it will be dirty page */
+ PAGECACHE_WRITE_DELAY,
+ /* page already is in the file. (key cache insert analogue) */
+ PAGECACHE_WRITE_DONE
+};
+
+typedef void *PAGECACHE_PAGE_LINK;
+
+/* file descriptor for Maria */
+typedef struct st_pagecache_file
+{
+ File file;
+} PAGECACHE_FILE;
+
+/* page number for maria */
+typedef uint32 pgcache_page_no_t;
+
+/* declare structures that is used by st_pagecache */
+
+struct st_pagecache_block_link;
+typedef struct st_pagecache_block_link PAGECACHE_BLOCK_LINK;
+struct st_pagecache_page;
+typedef struct st_pagecache_page PAGECACHE_PAGE;
+struct st_pagecache_hash_link;
+typedef struct st_pagecache_hash_link PAGECACHE_HASH_LINK;
+
+#include <wqueue.h>
+
+typedef my_bool (*pagecache_disk_read_validator)(uchar *page, uchar *data);
+
+#define PAGECACHE_CHANGED_BLOCKS_HASH 128 /* must be power of 2 */
+
+/*
+ The page cache structure
+ It also contains read-only statistics parameters.
+*/
+
+typedef struct st_pagecache
+{
+ my_bool inited;
+ my_bool resize_in_flush; /* true during flush of resize operation */
+ my_bool can_be_used; /* usage of cache for read/write is allowed */
+ uint shift; /* block size = 2 ^ shift */
+ size_t mem_size; /* specified size of the cache memory */
+ uint32 block_size; /* size of the page buffer of a cache block */
+ ulong min_warm_blocks; /* min number of warm blocks; */
+ ulong age_threshold; /* age threshold for hot blocks */
+ ulonglong time; /* total number of block link operations */
+ uint hash_entries; /* max number of entries in the hash table */
+ int hash_links; /* max number of hash links */
+ int hash_links_used; /* number of hash links taken from free links pool */
+ int disk_blocks; /* max number of blocks in the cache */
+ ulong blocks_used; /* maximum number of concurrently used blocks */
+ ulong blocks_unused; /* number of currently unused blocks */
+ ulong blocks_changed; /* number of currently dirty blocks */
+ ulong warm_blocks; /* number of blocks in warm sub-chain */
+ ulong cnt_for_resize_op; /* counter to block resize operation */
+ ulong blocks_available; /* number of blocks available in the LRU chain */
+ PAGECACHE_HASH_LINK **hash_root;/* arr. of entries into hash table buckets */
+ PAGECACHE_HASH_LINK *hash_link_root;/* memory for hash table links */
+ PAGECACHE_HASH_LINK *free_hash_list;/* list of free hash links */
+ PAGECACHE_BLOCK_LINK *free_block_list;/* list of free blocks */
+ PAGECACHE_BLOCK_LINK *block_root;/* memory for block links */
+ uchar HUGE_PTR *block_mem; /* memory for block buffers */
+ PAGECACHE_BLOCK_LINK *used_last;/* ptr to the last block of the LRU chain */
+ PAGECACHE_BLOCK_LINK *used_ins;/* ptr to the insertion block in LRU chain */
+ pthread_mutex_t cache_lock; /* to lock access to the cache structure */
+ WQUEUE resize_queue; /* threads waiting during resize operation */
+ WQUEUE waiting_for_hash_link;/* waiting for a free hash link */
+ WQUEUE waiting_for_block; /* requests waiting for a free block */
+ /* hash for dirty file bl.*/
+ PAGECACHE_BLOCK_LINK *changed_blocks[PAGECACHE_CHANGED_BLOCKS_HASH];
+ /* hash for other file bl.*/
+ PAGECACHE_BLOCK_LINK *file_blocks[PAGECACHE_CHANGED_BLOCKS_HASH];
+
+ /*
+ The following variables are and variables used to hold parameters for
+ initializing the key cache.
+ */
+
+ ulonglong param_buff_size; /* size the memory allocated for the cache */
+ ulong param_block_size; /* size of the blocks in the key cache */
+ ulong param_division_limit; /* min. percentage of warm blocks */
+ ulong param_age_threshold; /* determines when hot block is downgraded */
+
+ /* Statistics variables. These are reset in reset_pagecache_counters(). */
+ ulong global_blocks_changed; /* number of currently dirty blocks */
+ ulonglong global_cache_w_requests;/* number of write requests (write hits) */
+ ulonglong global_cache_write; /* number of writes from cache to files */
+ ulonglong global_cache_r_requests;/* number of read requests (read hits) */
+ ulonglong global_cache_read; /* number of reads from files to cache */
+
+ int blocks; /* max number of blocks in the cache */
+ my_bool in_init; /* Set to 1 in MySQL during init/resize */
+} PAGECACHE;
+
+/* The default key cache */
+extern PAGECACHE dflt_pagecache_var, *dflt_pagecache;
+
+extern int init_pagecache(PAGECACHE *pagecache, size_t use_mem,
+ uint division_limit, uint age_threshold,
+ uint block_size);
+extern int resize_pagecache(PAGECACHE *pagecache,
+ size_t use_mem, uint division_limit,
+ uint age_threshold);
+extern void change_pagecache_param(PAGECACHE *pagecache, uint division_limit,
+ uint age_threshold);
+
+#define pagecache_read(P,F,N,L,B,T,K,I) \
+ pagecache_valid_read(P,F,N,L,B,T,K,I,0,0)
+
+extern uchar *pagecache_valid_read(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ uint level,
+ uchar *buff,
+ enum pagecache_page_type type,
+ enum pagecache_page_lock lock,
+ PAGECACHE_PAGE_LINK *link,
+ pagecache_disk_read_validator validator,
+ uchar* validator_data);
+
+#define pagecache_write(P,F,N,L,B,T,O,I,M,K) \
+ pagecache_write_part(P,F,N,L,B,T,O,I,M,K,0,(P)->block_size,0,0)
+
+#define pagecache_inject(P,F,N,L,B,T,O,I,K,V,D) \
+ pagecache_write_part(P,F,N,L,B,T,O,I,PAGECACHE_WRITE_DONE, \
+ K,0,(P)->block_size,V,D)
+
+extern my_bool pagecache_write_part(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ uint level,
+ uchar *buff,
+ enum pagecache_page_type type,
+ enum pagecache_page_lock lock,
+ enum pagecache_page_pin pin,
+ enum pagecache_write_mode write_mode,
+ PAGECACHE_PAGE_LINK *link,
+ uint offset,
+ uint size,
+ pagecache_disk_read_validator validator,
+ uchar* validator_data);
+extern void pagecache_unlock(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ enum pagecache_page_lock lock,
+ enum pagecache_page_pin pin,
+ LSN first_REDO_LSN_for_page,
+ LSN lsn);
+extern void pagecache_unlock_by_link(PAGECACHE *pagecache,
+ PAGECACHE_PAGE_LINK *link,
+ enum pagecache_page_lock lock,
+ enum pagecache_page_pin pin,
+ LSN first_REDO_LSN_for_page,
+ LSN lsn);
+extern void pagecache_unpin(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ LSN lsn);
+extern void pagecache_unpin_by_link(PAGECACHE *pagecache,
+ PAGECACHE_PAGE_LINK *link,
+ LSN lsn);
+extern int flush_pagecache_blocks(PAGECACHE *keycache,
+ PAGECACHE_FILE *file,
+ enum flush_type type);
+extern my_bool pagecache_delete(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ enum pagecache_page_lock lock,
+ my_bool flush);
+extern my_bool pagecache_delete_pages(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ uint page_count,
+ enum pagecache_page_lock lock,
+ my_bool flush);
+extern void end_pagecache(PAGECACHE *keycache, my_bool cleanup);
+extern my_bool pagecache_collect_changed_blocks_with_lsn(PAGECACHE *pagecache,
+ LEX_STRING *str,
+ LSN *min_lsn);
+extern int reset_pagecache_counters(const char *name, PAGECACHE *pagecache);
+
+
+/* Functions to handle multiple key caches */
+extern my_bool multi_pagecache_init(void);
+extern void multi_pagecache_free(void);
+extern PAGECACHE *multi_pagecache_search(uchar *key, uint length,
+ PAGECACHE *def);
+extern my_bool multi_pagecache_set(const uchar *key, uint length,
+ PAGECACHE *pagecache);
+extern void multi_pagecache_change(PAGECACHE *old_data,
+ PAGECACHE *new_data);
+extern int reset_pagecache_counters(const char *name,
+ PAGECACHE *pagecache);
+
+C_MODE_END
+#endif /* _keycache_h */
diff --git a/storage/maria/ma_pagecaches.c b/storage/maria/ma_pagecaches.c
new file mode 100644
index 00000000000..a9460be10c5
--- /dev/null
+++ b/storage/maria/ma_pagecaches.c
@@ -0,0 +1,105 @@
+/* Copyright (C) 2003-2007 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Handling of multiple key caches
+
+ The idea is to have a thread safe hash on the table name,
+ with a default key cache value that is returned if the table name is not in
+ the cache.
+*/
+
+#include "maria_def.h"
+#include "ma_pagecache.h"
+#include <hash.h>
+#include <m_string.h>
+#include "../../mysys/my_safehash.h"
+
+/*****************************************************************************
+ Functions to handle the pagecache objects
+*****************************************************************************/
+
+/* Variable to store all key cache objects */
+static SAFE_HASH pagecache_hash;
+
+
+my_bool multi_pagecache_init(void)
+{
+ return safe_hash_init(&pagecache_hash, 16, (uchar*) maria_pagecache);
+}
+
+
+void multi_pagecache_free(void)
+{
+ safe_hash_free(&pagecache_hash);
+}
+
+/*
+ Get a key cache to be used for a specific table.
+
+ SYNOPSIS
+ multi_pagecache_search()
+ key key to find (usually table path)
+ uint length Length of key.
+ def Default value if no key cache
+
+ NOTES
+ This function is coded in such a way that we will return the
+ default key cache even if one never called multi_pagecache_init.
+ This will ensure that it works with old MyISAM clients.
+
+ RETURN
+ key cache to use
+*/
+
+PAGECACHE *multi_pagecache_search(uchar *key, uint length,
+ PAGECACHE *def)
+{
+ if (!pagecache_hash.hash.records)
+ return def;
+ return (PAGECACHE*) safe_hash_search(&pagecache_hash, key, length,
+ (void*) def);
+}
+
+
+/*
+ Assosiate a key cache with a key
+
+
+ SYONOPSIS
+ multi_pagecache_set()
+ key key (path to table etc..)
+ length Length of key
+ pagecache cache to assococite with the table
+
+ NOTES
+ This can be used both to insert a new entry and change an existing
+ entry
+*/
+
+
+my_bool multi_pagecache_set(const uchar *key, uint length,
+ PAGECACHE *pagecache)
+{
+ return safe_hash_set(&pagecache_hash, key, length, (uchar*) pagecache);
+}
+
+
+void multi_pagecache_change(PAGECACHE *old_data,
+ PAGECACHE *new_data)
+{
+ safe_hash_change(&pagecache_hash, (uchar*) old_data, (uchar*) new_data);
+}
diff --git a/storage/maria/ma_panic.c b/storage/maria/ma_panic.c
new file mode 100644
index 00000000000..0394f630343
--- /dev/null
+++ b/storage/maria/ma_panic.c
@@ -0,0 +1,134 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "ma_fulltext.h"
+
+/*
+ Stop usage of Maria
+
+ SYNOPSIS
+ maria_panic()
+ flag HA_PANIC_CLOSE: All maria files (tables and log) are closed.
+ maria_end() is called.
+ HA_PANIC_WRITE: All misam files are unlocked and
+ all changed data in single user maria is
+ written to file
+ HA_PANIC_READ All maria files that was locked when
+ maria_panic(HA_PANIC_WRITE) was done is
+ locked. A maria_readinfo() is done for
+ all single user files to get changes
+ in database
+
+ RETURN
+ 0 ok
+ # error number in case of error
+*/
+
+int maria_panic(enum ha_panic_function flag)
+{
+ int error=0;
+ LIST *list_element,*next_open;
+ MARIA_HA *info;
+ DBUG_ENTER("maria_panic");
+
+ if (!maria_inited)
+ DBUG_RETURN(0);
+ pthread_mutex_lock(&THR_LOCK_maria);
+ for (list_element=maria_open_list ; list_element ; list_element=next_open)
+ {
+ next_open=list_element->next; /* Save if close */
+ info=(MARIA_HA*) list_element->data;
+ switch (flag) {
+ case HA_PANIC_CLOSE:
+ /*
+ If bad luck (if some tables would be used now, which normally does not
+ happen in MySQL), as we release the mutex, the list may change and so
+ we may crash.
+ */
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ if (maria_close(info))
+ error=my_errno;
+ pthread_mutex_lock(&THR_LOCK_maria);
+ break;
+ case HA_PANIC_WRITE: /* Do this to free databases */
+#ifdef CANT_OPEN_FILES_TWICE
+ if (info->s->options & HA_OPTION_READ_ONLY_DATA)
+ break;
+#endif
+ if (flush_pagecache_blocks(info->s->pagecache, &info->s->kfile,
+ FLUSH_RELEASE))
+ error=my_errno;
+ if (info->opt_flag & WRITE_CACHE_USED)
+ if (flush_io_cache(&info->rec_cache))
+ error=my_errno;
+ if (info->opt_flag & READ_CACHE_USED)
+ {
+ if (flush_io_cache(&info->rec_cache))
+ error=my_errno;
+ reinit_io_cache(&info->rec_cache,READ_CACHE,0,
+ (pbool) (info->lock_type != F_UNLCK),1);
+ }
+ if (info->lock_type != F_UNLCK && ! info->was_locked)
+ {
+ info->was_locked=info->lock_type;
+ if (maria_lock_database(info,F_UNLCK))
+ error=my_errno;
+ }
+#ifdef CANT_OPEN_FILES_TWICE
+ if (info->s->kfile.file >= 0 && my_close(info->s->kfile.file, MYF(0)))
+ error = my_errno;
+ if (info->dfile.file >= 0 && my_close(info->dfile.file, MYF(0)))
+ error = my_errno;
+ info->s->kfile.file= info->dfile.file= -1;/* Files aren't open anymore */
+ break;
+#endif
+ case HA_PANIC_READ: /* Restore to before WRITE */
+#ifdef CANT_OPEN_FILES_TWICE
+ { /* Open closed files */
+ char name_buff[FN_REFLEN];
+ if (info->s->kfile.file < 0)
+ if ((info->s->kfile.file= my_open(fn_format(name_buff,
+ info->filename, "",
+ N_NAME_IEXT,4),
+ info->mode,
+ MYF(MY_WME))) < 0)
+ error = my_errno;
+ if (info->dfile.file < 0)
+ {
+ if ((info->dfile.file= my_open(fn_format(name_buff, info->filename,
+ "", N_NAME_DEXT, 4),
+ info->mode,
+ MYF(MY_WME))) < 0)
+ error = my_errno;
+ info->rec_cache.file= info->dfile.file;
+ }
+ }
+#endif
+ if (info->was_locked)
+ {
+ if (maria_lock_database(info, info->was_locked))
+ error=my_errno;
+ info->was_locked=0;
+ }
+ break;
+ }
+ }
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ if (flag == HA_PANIC_CLOSE)
+ maria_end();
+ if (!error)
+ DBUG_RETURN(0);
+ DBUG_RETURN(my_errno=error);
+} /* maria_panic */
diff --git a/storage/maria/ma_preload.c b/storage/maria/ma_preload.c
new file mode 100644
index 00000000000..138bb94f7d0
--- /dev/null
+++ b/storage/maria/ma_preload.c
@@ -0,0 +1,133 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Preload indexes into key cache
+*/
+
+#include "maria_def.h"
+
+
+/*
+ Preload pages of the index file for a table into the key cache
+
+ SYNOPSIS
+ maria_preload()
+ info open table
+ map map of indexes to preload into key cache
+ ignore_leaves only non-leaves pages are to be preloaded
+
+ RETURN VALUE
+ 0 if a success. error code - otherwise.
+
+ NOTES.
+ At present pages for all indexes are preloaded.
+ In future only pages for indexes specified in the key_map parameter
+ of the table will be preloaded.
+*/
+
+int maria_preload(MARIA_HA *info, ulonglong key_map, my_bool ignore_leaves)
+{
+ uint i;
+ ulong length, block_length= 0;
+ uchar *buff= NULL;
+ MARIA_SHARE* share= info->s;
+ uint keys= share->state.header.keys;
+ MARIA_KEYDEF *keyinfo= share->keyinfo;
+ my_off_t key_file_length= share->state.state.key_file_length;
+ my_off_t pos= share->base.keystart;
+ DBUG_ENTER("maria_preload");
+
+ if (!keys || !maria_is_any_key_active(key_map) || key_file_length == pos)
+ DBUG_RETURN(0);
+
+ block_length= keyinfo[0].block_length;
+
+ if (ignore_leaves)
+ {
+ /* Check whether all indexes use the same block size */
+ for (i= 1 ; i < keys ; i++)
+ {
+ if (keyinfo[i].block_length != block_length)
+ DBUG_RETURN(my_errno= HA_ERR_NON_UNIQUE_BLOCK_SIZE);
+ }
+ }
+ else
+ block_length= share->pagecache->block_size;
+
+ length= info->preload_buff_size/block_length * block_length;
+ set_if_bigger(length, block_length);
+
+ if (!(buff= (uchar *) my_malloc(length, MYF(MY_WME))))
+ DBUG_RETURN(my_errno= HA_ERR_OUT_OF_MEM);
+
+ if (flush_pagecache_blocks(share->pagecache, &share->kfile, FLUSH_RELEASE))
+ goto err;
+
+ do
+ {
+ /* Read the next block of index file into the preload buffer */
+ if ((my_off_t) length > (key_file_length-pos))
+ length= (ulong) (key_file_length-pos);
+ if (my_pread(share->kfile.file, (uchar*) buff, length, pos,
+ MYF(MY_FAE|MY_FNABP)))
+ goto err;
+
+ if (ignore_leaves)
+ {
+ uchar *end= buff+length;
+ do
+ {
+ if (_ma_test_if_nod(buff))
+ {
+ DBUG_ASSERT(share->pagecache->block_size == block_length);
+ if (pagecache_write(share->pagecache,
+ &share->kfile, pos / block_length,
+ DFLT_INIT_HITS,
+ (uchar*) buff,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DONE, 0))
+ goto err;
+ }
+ pos+= block_length;
+ }
+ while ((buff+= block_length) != end);
+ buff= end-length;
+ }
+ else
+ {
+ if (pagecache_write(share->pagecache,
+ &share->kfile, pos / block_length,
+ DFLT_INIT_HITS,
+ (uchar*) buff,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DONE, 0))
+ goto err;
+ pos+= length;
+ }
+ }
+ while (pos != key_file_length);
+
+ my_free((char*) buff, MYF(0));
+ DBUG_RETURN(0);
+
+err:
+ my_free((char*) buff, MYF(MY_ALLOW_ZERO_PTR));
+ DBUG_RETURN(my_errno= errno);
+}
diff --git a/storage/maria/ma_range.c b/storage/maria/ma_range.c
new file mode 100644
index 00000000000..02616d8ac5c
--- /dev/null
+++ b/storage/maria/ma_range.c
@@ -0,0 +1,295 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Gives a approximated number of how many records there is between two keys.
+ Used when optimizing querries.
+ */
+
+#include "maria_def.h"
+#include "ma_rt_index.h"
+
+static ha_rows _ma_record_pos(MARIA_HA *,const uchar *, key_part_map,
+ enum ha_rkey_function);
+static double _ma_search_pos(MARIA_HA *, MARIA_KEYDEF *, uchar *,
+ uint, uint, my_off_t);
+static uint _ma_keynr(MARIA_HA *, MARIA_KEYDEF *, uchar *, uchar *, uint *);
+
+
+/**
+ @brief Estimate how many records there is in a given range
+
+ @param info MARIA handler
+ @param inx Index to use
+ @param min_key Min key. Is = 0 if no min range
+ @param max_key Max key. Is = 0 if no max range
+
+ @note
+ We should ONLY return 0 if there is no rows in range
+
+ @return Estimated number of rows or error
+ @retval HA_POS_ERROR error (or we can't estimate number of rows)
+ @retval number Estimated number of rows
+*/
+
+ha_rows maria_records_in_range(MARIA_HA *info, int inx, key_range *min_key,
+ key_range *max_key)
+{
+ ha_rows start_pos,end_pos,res;
+ DBUG_ENTER("maria_records_in_range");
+
+ if ((inx = _ma_check_index(info,inx)) < 0)
+ DBUG_RETURN(HA_POS_ERROR);
+
+ if (fast_ma_readinfo(info))
+ DBUG_RETURN(HA_POS_ERROR);
+ info->update&= (HA_STATE_CHANGED+HA_STATE_ROW_CHANGED);
+ if (info->s->concurrent_insert)
+ rw_rdlock(&info->s->key_root_lock[inx]);
+
+ switch(info->s->keyinfo[inx].key_alg){
+#ifdef HAVE_RTREE_KEYS
+ case HA_KEY_ALG_RTREE:
+ {
+ uchar *key_buff;
+ uint start_key_len;
+
+ /*
+ The problem is that the optimizer doesn't support
+ RTree keys properly at the moment.
+ Hope this will be fixed some day.
+ But now NULL in the min_key means that we
+ didn't make the task for the RTree key
+ and expect BTree functionality from it.
+ As it's not able to handle such request
+ we return the error.
+ */
+ if (!min_key)
+ {
+ res= HA_POS_ERROR;
+ break;
+ }
+ key_buff= info->lastkey+info->s->base.max_key_length;
+ start_key_len= _ma_pack_key(info,inx, key_buff,
+ min_key->key, min_key->keypart_map,
+ (HA_KEYSEG**) 0);
+ res= maria_rtree_estimate(info, inx, key_buff, start_key_len,
+ maria_read_vec[min_key->flag]);
+ res= res ? res : 1; /* Don't return 0 */
+ break;
+ }
+#endif
+ case HA_KEY_ALG_BTREE:
+ default:
+ start_pos= (min_key ?
+ _ma_record_pos(info, min_key->key, min_key->keypart_map,
+ min_key->flag) :
+ (ha_rows) 0);
+ end_pos= (max_key ?
+ _ma_record_pos(info, max_key->key, max_key->keypart_map,
+ max_key->flag) :
+ info->state->records + (ha_rows) 1);
+ res= (end_pos < start_pos ? (ha_rows) 0 :
+ (end_pos == start_pos ? (ha_rows) 1 : end_pos-start_pos));
+ if (start_pos == HA_POS_ERROR || end_pos == HA_POS_ERROR)
+ res=HA_POS_ERROR;
+ }
+
+ if (info->s->concurrent_insert)
+ rw_unlock(&info->s->key_root_lock[inx]);
+ fast_ma_writeinfo(info);
+
+ /**
+ @todo LOCK
+ If res==0 (no rows), if we need to guarantee repeatability of the search,
+ we will need to set a next-key lock in this statement.
+ Also SELECT COUNT(*)...
+ */
+
+ DBUG_PRINT("info",("records: %ld",(ulong) (res)));
+ DBUG_RETURN(res);
+}
+
+
+ /* Find relative position (in records) for key in index-tree */
+
+static ha_rows _ma_record_pos(MARIA_HA *info, const uchar *key,
+ key_part_map keypart_map,
+ enum ha_rkey_function search_flag)
+{
+ uint inx=(uint) info->lastinx, nextflag, key_len;
+ MARIA_KEYDEF *keyinfo=info->s->keyinfo+inx;
+ uchar *key_buff;
+ double pos;
+ DBUG_ENTER("_ma_record_pos");
+ DBUG_PRINT("enter",("search_flag: %d",search_flag));
+ DBUG_ASSERT(keypart_map);
+
+ key_buff=info->lastkey+info->s->base.max_key_length;
+ key_len= _ma_pack_key(info, inx, key_buff, key, keypart_map,
+ (HA_KEYSEG**) 0);
+ DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, keyinfo->seg,
+ key_buff, key_len););
+ nextflag=maria_read_vec[search_flag];
+ if (!(nextflag & (SEARCH_FIND | SEARCH_NO_FIND | SEARCH_LAST)))
+ key_len=USE_WHOLE_KEY;
+
+ /*
+ my_handler.c:mi_compare_text() has a flag 'skip_end_space'.
+ This is set in my_handler.c:ha_key_cmp() in dependence on the
+ compare flags 'nextflag' and the column type.
+
+ TEXT columns are of type HA_KEYTYPE_VARTEXT. In this case the
+ condition is skip_end_space= ((nextflag & (SEARCH_FIND |
+ SEARCH_UPDATE)) == SEARCH_FIND).
+
+ SEARCH_FIND is used for an exact key search. The combination
+ SEARCH_FIND | SEARCH_UPDATE is used in write/update/delete
+ operations with a comment like "Not real duplicates", whatever this
+ means. From the condition above we can see that 'skip_end_space' is
+ always false for these operations. The result is that trailing space
+ counts in key comparison and hence, emtpy strings ('', string length
+ zero, but not NULL) compare less that strings starting with control
+ characters and these in turn compare less than strings starting with
+ blanks.
+
+ When estimating the number of records in a key range, we request an
+ exact search for the minimum key. This translates into a plain
+ SEARCH_FIND flag. Using this alone would lead to a 'skip_end_space'
+ compare. Empty strings would be expected above control characters.
+ Their keys would not be found because they are located below control
+ characters.
+
+ This is the reason that we add the SEARCH_UPDATE flag here. It makes
+ the key estimation compare in the same way like key write operations
+ do. Olny so we will find the keys where they have been inserted.
+
+ Adding the flag unconditionally does not hurt as it is used in the
+ above mentioned condition only. So it can safely be used together
+ with other flags.
+ */
+ pos= _ma_search_pos(info,keyinfo, key_buff, key_len,
+ nextflag | SEARCH_SAVE_BUFF | SEARCH_UPDATE,
+ info->s->state.key_root[inx]);
+ if (pos >= 0.0)
+ {
+ DBUG_PRINT("exit",("pos: %ld",(ulong) (pos*info->state->records)));
+ DBUG_RETURN((ulong) (pos*info->state->records+0.5));
+ }
+ DBUG_RETURN(HA_POS_ERROR);
+}
+
+
+ /* This is a modified version of _ma_search */
+ /* Returns offset for key in indextable (decimal 0.0 <= x <= 1.0) */
+
+static double _ma_search_pos(register MARIA_HA *info,
+ register MARIA_KEYDEF *keyinfo,
+ uchar *key, uint key_len, uint nextflag,
+ register my_off_t pos)
+{
+ int flag;
+ uint nod_flag,keynr,max_keynr;
+ my_bool after_key;
+ uchar *keypos, *buff;
+ double offset;
+ DBUG_ENTER("_ma_search_pos");
+ LINT_INIT(max_keynr);
+
+ if (pos == HA_OFFSET_ERROR)
+ DBUG_RETURN(0.5);
+
+ if (!(buff= _ma_fetch_keypage(info,keyinfo,pos,DFLT_INIT_HITS,info->buff,1)))
+ goto err;
+ flag=(*keyinfo->bin_search)(info, keyinfo, buff, key, key_len, nextflag,
+ &keypos,info->lastkey, &after_key);
+ nod_flag=_ma_test_if_nod(buff);
+ keynr= _ma_keynr(info,keyinfo,buff,keypos,&max_keynr);
+
+ if (flag)
+ {
+ if (flag == MARIA_FOUND_WRONG_KEY)
+ DBUG_RETURN(-1); /* error */
+ /*
+ Didn't found match. keypos points at next (bigger) key
+ Try to find a smaller, better matching key.
+ Matches keynr + [0-1]
+ */
+ if (flag > 0 && ! nod_flag)
+ offset= 1.0;
+ else if ((offset= _ma_search_pos(info,keyinfo,key,key_len,nextflag,
+ _ma_kpos(nod_flag,keypos))) < 0)
+ DBUG_RETURN(offset);
+ }
+ else
+ {
+ /*
+ Found match. Keypos points at the start of the found key
+ Matches keynr+1
+ */
+ offset=1.0; /* Matches keynr+1 */
+ if ((nextflag & SEARCH_FIND) && nod_flag &&
+ ((keyinfo->flag & (HA_NOSAME | HA_NULL_PART)) != HA_NOSAME ||
+ key_len != USE_WHOLE_KEY))
+ {
+ /*
+ There may be identical keys in the tree. Try to match on of those.
+ Matches keynr + [0-1]
+ */
+ if ((offset= _ma_search_pos(info,keyinfo,key,key_len,SEARCH_FIND,
+ _ma_kpos(nod_flag,keypos))) < 0)
+ DBUG_RETURN(offset); /* Read error */
+ }
+ }
+ DBUG_PRINT("info",("keynr: %d offset: %g max_keynr: %d nod: %d flag: %d",
+ keynr,offset,max_keynr,nod_flag,flag));
+ DBUG_RETURN((keynr+offset)/(max_keynr+1));
+err:
+ DBUG_PRINT("exit",("Error: %d",my_errno));
+ DBUG_RETURN (-1.0);
+}
+
+
+ /* Get keynummer of current key and max number of keys in nod */
+
+static uint _ma_keynr(MARIA_HA *info, register MARIA_KEYDEF *keyinfo,
+ uchar *page, uchar *keypos, uint *ret_max_key)
+{
+ uint nod_flag,keynr,max_key;
+ uchar t_buff[HA_MAX_KEY_BUFF],*end;
+
+ end= page+maria_data_on_page(page);
+ nod_flag=_ma_test_if_nod(page);
+ page+=2+nod_flag;
+
+ if (!(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)))
+ {
+ *ret_max_key= (uint) (end-page)/(keyinfo->keylength+nod_flag);
+ return (uint) (keypos-page)/(keyinfo->keylength+nod_flag);
+ }
+
+ max_key=keynr=0;
+ t_buff[0]=0; /* Safety */
+ while (page < end)
+ {
+ if (!(*keyinfo->get_key)(keyinfo,nod_flag,&page,t_buff))
+ return 0; /* Error */
+ max_key++;
+ if (page == keypos)
+ keynr=max_key;
+ }
+ *ret_max_key=max_key;
+ return(keynr);
+}
diff --git a/storage/maria/ma_recovery.c b/storage/maria/ma_recovery.c
new file mode 100644
index 00000000000..e740e334b5f
--- /dev/null
+++ b/storage/maria/ma_recovery.c
@@ -0,0 +1,2249 @@
+/* Copyright (C) 2006, 2007 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ WL#3072 Maria recovery
+ First version written by Guilhem Bichot on 2006-04-27.
+*/
+
+/* Here is the implementation of this module */
+
+#include "maria_def.h"
+#include "ma_recovery.h"
+#include "ma_blockrec.h"
+#include "trnman.h"
+
+struct st_trn_for_recovery /* used only in the REDO phase */
+{
+ LSN group_start_lsn, undo_lsn, first_undo_lsn;
+ TrID long_trid;
+};
+struct st_dirty_page /* used only in the REDO phase */
+{
+ uint64 file_and_page_id;
+ LSN rec_lsn;
+};
+struct st_table_for_recovery /* used in the REDO and UNDO phase */
+{
+ MARIA_HA *info;
+ File org_kfile, org_dfile; /**< OS descriptors when Checkpoint saw table */
+};
+/* Variables used by all functions of this module. Ok as single-threaded */
+static struct st_trn_for_recovery *all_active_trans;
+static struct st_table_for_recovery *all_tables;
+static HASH all_dirty_pages;
+static struct st_dirty_page *dirty_pages_pool;
+static LSN current_group_end_lsn,
+ checkpoint_start= LSN_IMPOSSIBLE;
+static TrID max_long_trid= 0; /**< max long trid seen by REDO phase */
+static FILE *tracef; /**< trace file for debugging */
+static my_bool skip_DDLs; /**< if REDO phase should skip DDL records */
+
+#define prototype_redo_exec_hook(R) \
+ static int exec_REDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec)
+
+#define prototype_redo_exec_hook_dummy(R) \
+ static int exec_REDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec \
+ __attribute ((unused)))
+
+#define prototype_undo_exec_hook(R) \
+ static int exec_UNDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec, TRN *trn)
+
+prototype_redo_exec_hook(LONG_TRANSACTION_ID);
+prototype_redo_exec_hook_dummy(CHECKPOINT);
+prototype_redo_exec_hook(REDO_CREATE_TABLE);
+prototype_redo_exec_hook(REDO_RENAME_TABLE);
+prototype_redo_exec_hook(REDO_REPAIR_TABLE);
+prototype_redo_exec_hook(REDO_DROP_TABLE);
+prototype_redo_exec_hook(FILE_ID);
+prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD);
+prototype_redo_exec_hook(REDO_INSERT_ROW_TAIL);
+prototype_redo_exec_hook(REDO_PURGE_ROW_HEAD);
+prototype_redo_exec_hook(REDO_PURGE_ROW_TAIL);
+prototype_redo_exec_hook(REDO_PURGE_BLOCKS);
+prototype_redo_exec_hook(REDO_DELETE_ALL);
+prototype_redo_exec_hook(UNDO_ROW_INSERT);
+prototype_redo_exec_hook(UNDO_ROW_DELETE);
+prototype_redo_exec_hook(UNDO_ROW_UPDATE);
+prototype_redo_exec_hook(COMMIT);
+prototype_redo_exec_hook(CLR_END);
+prototype_undo_exec_hook(UNDO_ROW_INSERT);
+prototype_undo_exec_hook(UNDO_ROW_DELETE);
+prototype_undo_exec_hook(UNDO_ROW_UPDATE);
+
+static int run_redo_phase(LSN lsn, my_bool apply);
+static uint end_of_redo_phase(my_bool prepare_for_undo_phase);
+static int run_undo_phase(uint unfinished);
+static void display_record_position(const LOG_DESC *log_desc,
+ const TRANSLOG_HEADER_BUFFER *rec,
+ uint number);
+static int display_and_apply_record(const LOG_DESC *log_desc,
+ const TRANSLOG_HEADER_BUFFER *rec);
+static MARIA_HA *get_MARIA_HA_from_REDO_record(const
+ TRANSLOG_HEADER_BUFFER *rec);
+static MARIA_HA *get_MARIA_HA_from_UNDO_record(const
+ TRANSLOG_HEADER_BUFFER *rec);
+static void prepare_table_for_close(MARIA_HA *info, TRANSLOG_ADDRESS horizon);
+static LSN parse_checkpoint_record(LSN lsn);
+static void new_transaction(uint16 sid, TrID long_id, LSN undo_lsn,
+ LSN first_undo_lsn);
+static int new_table(uint16 sid, const char *name,
+ File org_kfile, File org_dfile,
+ LSN lsn_of_file_id);
+static int new_page(File fileid, pgcache_page_no_t pageid, LSN rec_lsn,
+ struct st_dirty_page *dirty_page);
+static int close_all_tables(void);
+static void print_redo_phase_progress(TRANSLOG_ADDRESS addr);
+
+/** @brief global [out] buffer for translog_read_record(); never shrinks */
+static LEX_STRING log_record_buffer;
+static void enlarge_buffer(const TRANSLOG_HEADER_BUFFER *rec)
+{
+ if (log_record_buffer.length < rec->record_length)
+ {
+ log_record_buffer.length= rec->record_length;
+ log_record_buffer.str= my_realloc(log_record_buffer.str,
+ rec->record_length,
+ MYF(MY_WME | MY_ALLOW_ZERO_PTR));
+ }
+}
+static my_bool redo_phase_message_printed;
+/** @brief Prints to a trace file if it is not NULL */
+void tprint(FILE *trace_file, const char *format, ...)
+ ATTRIBUTE_FORMAT(printf, 2, 3);
+void tprint(FILE *trace_file, const char *format, ...)
+{
+ va_list args;
+ va_start(args, format);
+ if (trace_file != NULL)
+ vfprintf(trace_file, format, args);
+ va_end(args);
+}
+
+#define ALERT_USER() DBUG_ASSERT(0)
+
+
+/**
+ @brief Recovers from the last checkpoint.
+
+ Runs the REDO phase using special structures, then sets up the playground
+ of runtime: recreates transactions inside trnman, open tables with their
+ two-byte-id mapping; takes a checkpoint and runs the UNDO phase. Closes all
+ tables.
+
+ @return Operation status
+ @retval 0 OK
+ @retval !=0 Error
+*/
+
+int maria_recover(void)
+{
+ int res= 1;
+ FILE *trace_file;
+ DBUG_ENTER("maria_recover");
+
+ DBUG_ASSERT(!maria_in_recovery);
+ maria_in_recovery= TRUE;
+
+#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
+ trace_file= fopen("maria_recovery.trace", "w");
+#else
+ trace_file= NULL; /* no trace file for being fast */
+#endif
+ tprint(trace_file, "TRACE of the last MARIA recovery from mysqld\n");
+ DBUG_ASSERT(maria_pagecache->inited);
+ res= maria_apply_log(LSN_IMPOSSIBLE, TRUE, trace_file, TRUE, TRUE);
+ if (!res)
+ tprint(trace_file, "SUCCESS\n");
+ if (trace_file)
+ fclose(trace_file);
+ maria_in_recovery= FALSE;
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Displays and/or applies the log
+
+ @param from_lsn LSN from which log reading/applying should start;
+ LSN_IMPOSSIBLE means "use last checkpoint"
+ @param apply if log records should be applied or not
+ @param trace_file trace file where progress/debug messages will go
+ @param skip_DDLs Should DDL records (CREATE/RENAME/DROP/REPAIR)
+ be skipped by the REDO phase or not
+
+ @todo This trace_file thing is primitive; soon we will make it similar to
+ ma_check_print_warning() etc, and a successful recovery does not need to
+ create a trace file. But for debugging now it is useful.
+
+ @return Operation status
+ @retval 0 OK
+ @retval !=0 Error
+*/
+
+int maria_apply_log(LSN from_lsn, my_bool apply, FILE *trace_file,
+ my_bool should_run_undo_phase, my_bool skip_DDLs_arg)
+{
+ int error= 0;
+ uint unfinished_trans;
+ DBUG_ENTER("maria_apply_log");
+
+ DBUG_ASSERT(apply || !should_run_undo_phase);
+ DBUG_ASSERT(!maria_multi_threaded);
+ all_active_trans= (struct st_trn_for_recovery *)
+ my_malloc((SHORT_TRID_MAX + 1) * sizeof(struct st_trn_for_recovery),
+ MYF(MY_ZEROFILL));
+ all_tables= (struct st_table_for_recovery *)
+ my_malloc((SHARE_ID_MAX + 1) * sizeof(struct st_table_for_recovery),
+ MYF(MY_ZEROFILL));
+ if (!all_active_trans || !all_tables)
+ goto err;
+
+ redo_phase_message_printed= FALSE;
+ tracef= trace_file;
+ if (!(skip_DDLs= skip_DDLs_arg))
+ {
+ /*
+ Example of what can go wrong when replaying DDLs:
+ CREATE TABLE t (logged); INSERT INTO t VALUES(1) (logged);
+ ALTER TABLE t ... which does
+ CREATE a temporary table #sql... (logged)
+ INSERT data from t into #sql... (not logged)
+ RENAME #sql TO t (logged)
+ Removing tables by hand and replaying the log will leave in the
+ end an empty table "t": missing records. If after the RENAME an INSERT
+ into t was done, that row had number 1 in its page, executing the
+ REDO_INSERT_ROW_HEAD on the recreated empty t will fail (assertion
+ failure in _ma_apply_redo_insert_row_head_or_tail(): new data page is
+ created whereas rownr is not 0).
+ Another issue is that replaying of DDLs is not correct enough to work if
+ there was a crash during a DDL (see comment in execution of
+ REDO_RENAME_TABLE ).
+ */
+ tprint(tracef, "WARNING: MySQL server currently disables log records"
+ " about insertion of data by ALTER TABLE"
+ " (copy_data_between_tables()), applying of log records may"
+ " well not work. Additionally, applying of DDL records will"
+ " cause damage if there are tables left by a crash of a DDL.\n");
+ }
+
+ if (from_lsn == LSN_IMPOSSIBLE)
+ {
+ if (last_checkpoint_lsn == LSN_IMPOSSIBLE)
+ {
+ from_lsn= translog_first_theoretical_lsn();
+ /*
+ as far as we have not yet any checkpoint then the very first
+ log file should be present.
+ */
+ if (unlikely((from_lsn == LSN_IMPOSSIBLE) ||
+ (from_lsn == LSN_ERROR)))
+ goto err;
+ }
+ else
+ {
+ from_lsn= parse_checkpoint_record(last_checkpoint_lsn);
+ if (from_lsn == LSN_IMPOSSIBLE)
+ goto err;
+ from_lsn= translog_next_LSN(from_lsn, LSN_IMPOSSIBLE);
+ if (from_lsn == LSN_ERROR)
+ goto err;
+ /*
+ from_lsn LSN_IMPOSSIBLE will be correctly processed
+ by run_redo_phase()
+ */
+ }
+ }
+
+ if (run_redo_phase(from_lsn, apply))
+ goto err;
+
+ unfinished_trans= end_of_redo_phase(should_run_undo_phase);
+ if (unfinished_trans == (uint)-1)
+ goto err;
+ if (should_run_undo_phase)
+ {
+ if (run_undo_phase(unfinished_trans))
+ return 1;
+ }
+ else if (unfinished_trans > 0)
+ tprint(tracef, "WARNING: %u unfinished transactions; some tables may be"
+ " left inconsistent!\n", unfinished_trans);
+
+ /*
+ we don't use maria_panic() because it would maria_end(), and Recovery does
+ not want that (we want to keep some modules initialized for runtime).
+ */
+ if (close_all_tables())
+ goto err;
+
+ /* If inside ha_maria, a checkpoint will soon be taken and save our work */
+ goto end;
+err:
+ error= 1;
+ tprint(tracef, "Recovery of tables with transaction logs FAILED\n");
+end:
+ hash_free(&all_dirty_pages);
+ bzero(&all_dirty_pages, sizeof(all_dirty_pages));
+ my_free(dirty_pages_pool, MYF(MY_ALLOW_ZERO_PTR));
+ dirty_pages_pool= NULL;
+ my_free(all_tables, MYF(MY_ALLOW_ZERO_PTR));
+ all_tables= NULL;
+ my_free(all_active_trans, MYF(MY_ALLOW_ZERO_PTR));
+ all_active_trans= NULL;
+ my_free(log_record_buffer.str, MYF(MY_ALLOW_ZERO_PTR));
+ log_record_buffer.str= NULL;
+ log_record_buffer.length= 0;
+ if (tracef != stdout && redo_phase_message_printed)
+ {
+ /** @todo RECOVERY BUG all prints to stderr should go to error log */
+ fprintf(stderr, "\n");
+ }
+ /* we don't cleanly close tables if we hit some error (may corrupt them) */
+ DBUG_RETURN(error);
+}
+
+
+/* very basic info about the record's header */
+static void display_record_position(const LOG_DESC *log_desc,
+ const TRANSLOG_HEADER_BUFFER *rec,
+ uint number)
+{
+ /*
+ if number==0, we're going over records which we had already seen and which
+ form a group, so we indent below the group's end record
+ */
+ tprint(tracef, "%sRec#%u LSN (%lu,0x%lx) short_trid %u %s(num_type:%u) len %lu\n",
+ number ? "" : " ", number, LSN_IN_PARTS(rec->lsn),
+ rec->short_trid, log_desc->name, rec->type,
+ (ulong)rec->record_length);
+}
+
+
+static int display_and_apply_record(const LOG_DESC *log_desc,
+ const TRANSLOG_HEADER_BUFFER *rec)
+{
+ int error;
+ if (log_desc->record_execute_in_redo_phase == NULL)
+ {
+ /* die on all not-yet-handled records :) */
+ DBUG_ASSERT("one more hook" == "to write");
+ return 1;
+ }
+ if ((error= (*log_desc->record_execute_in_redo_phase)(rec)))
+ tprint(tracef, "Got error when executing redo on record\n");
+ return error;
+}
+
+
+prototype_redo_exec_hook(LONG_TRANSACTION_ID)
+{
+ uint16 sid= rec->short_trid;
+ TrID long_trid= all_active_trans[sid].long_trid;
+ /* abort group of this trn (must be of before a crash) */
+ LSN gslsn= all_active_trans[sid].group_start_lsn;
+ if (gslsn != LSN_IMPOSSIBLE)
+ {
+ tprint(tracef, "Group at LSN (%lu,0x%lx) short_trid %u aborted\n",
+ LSN_IN_PARTS(gslsn), sid);
+ all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE;
+ }
+ if (long_trid != 0)
+ {
+ LSN ulsn= all_active_trans[sid].undo_lsn;
+ if (ulsn != LSN_IMPOSSIBLE)
+ {
+ char llbuf[22];
+ llstr(long_trid, llbuf);
+ tprint(tracef, "Found an old transaction long_trid %s short_trid %u"
+ " with same short id as this new transaction, and has neither"
+ " committed nor rollback (undo_lsn: (%lu,0x%lx))\n", llbuf,
+ sid, LSN_IN_PARTS(ulsn));
+ goto err;
+ }
+ }
+ long_trid= uint6korr(rec->header);
+ new_transaction(sid, long_trid, LSN_IMPOSSIBLE, LSN_IMPOSSIBLE);
+ goto end;
+err:
+ ALERT_USER();
+ return 1;
+end:
+ return 0;
+}
+
+
+static void new_transaction(uint16 sid, TrID long_id, LSN undo_lsn,
+ LSN first_undo_lsn)
+{
+ char llbuf[22];
+ all_active_trans[sid].long_trid= long_id;
+ llstr(long_id, llbuf);
+ tprint(tracef, "Transaction long_trid %s short_trid %u starts\n",
+ llbuf, sid);
+ all_active_trans[sid].undo_lsn= undo_lsn;
+ all_active_trans[sid].first_undo_lsn= first_undo_lsn;
+ set_if_bigger(max_long_trid, long_id);
+}
+
+
+prototype_redo_exec_hook_dummy(CHECKPOINT)
+{
+ /* the only checkpoint we care about was found via control file, ignore */
+ return 0;
+}
+
+
+prototype_redo_exec_hook(REDO_CREATE_TABLE)
+{
+ File dfile= -1, kfile= -1;
+ char *linkname_ptr, filename[FN_REFLEN];
+ char *name, *ptr;
+ myf create_flag;
+ uint flags;
+ int error= 1, create_mode= O_RDWR | O_TRUNC;
+ MARIA_HA *info= NULL;
+ if (skip_DDLs)
+ {
+ tprint(tracef, "we skip DDLs\n");
+ return 0;
+ }
+ enlarge_buffer(rec);
+ if (log_record_buffer.str == NULL ||
+ translog_read_record(rec->lsn, 0, rec->record_length,
+ log_record_buffer.str, NULL) !=
+ rec->record_length)
+ {
+ tprint(tracef, "Failed to read record\n");
+ goto end;
+ }
+ name= log_record_buffer.str;
+ tprint(tracef, "Table '%s'", name);
+ /* we try hard to get create_rename_lsn, to avoid mistakes if possible */
+ info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR);
+ if (info)
+ {
+ MARIA_SHARE *share= info->s;
+ /* check that we're not already using it */
+ if (share->reopen != 1)
+ {
+ tprint(tracef, ", is already open (reopen=%u)\n", share->reopen);
+ ALERT_USER();
+ goto end;
+ }
+ DBUG_ASSERT(share->now_transactional == share->base.born_transactional);
+ if (!share->base.born_transactional)
+ {
+ /*
+ could be that transactional table was later dropped, and a non-trans
+ one was renamed to its name, thus create_rename_lsn is 0 and should
+ not be trusted.
+ */
+ tprint(tracef, ", is not transactional, ignoring creation\n");
+ ALERT_USER();
+ error= 0;
+ goto end;
+ }
+ if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0)
+ {
+ tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than"
+ " record, ignoring creation",
+ LSN_IN_PARTS(share->state.create_rename_lsn));
+ error= 0;
+ goto end;
+ }
+ if (maria_is_crashed(info))
+ {
+ tprint(tracef, ", is crashed, can't recreate it");
+ ALERT_USER();
+ goto end;
+ }
+ maria_close(info);
+ info= NULL;
+ }
+ else /* one or two files absent, or header corrupted... */
+ tprint(tracef, "can't be opened, probably does not exist");
+ /* if does not exist, or is older, overwrite it */
+ /** @todo symlinks */
+ ptr= name + strlen(name) + 1;
+ if ((flags= ptr[0] ? HA_DONT_TOUCH_DATA : 0))
+ tprint(tracef, ", we will only touch index file");
+ fn_format(filename, name, "", MARIA_NAME_IEXT,
+ (MY_UNPACK_FILENAME |
+ (flags & HA_DONT_TOUCH_DATA) ? MY_RETURN_REAL_PATH : 0) |
+ MY_APPEND_EXT);
+ linkname_ptr= NULL;
+ create_flag= MY_DELETE_OLD;
+ tprint(tracef, ", creating as '%s'", filename);
+ if ((kfile= my_create_with_symlink(linkname_ptr, filename, 0, create_mode,
+ MYF(MY_WME|create_flag))) < 0)
+ {
+ tprint(tracef, "Failed to create index file\n");
+ goto end;
+ }
+ ptr++;
+ uint kfile_size_before_extension= uint2korr(ptr);
+ ptr+= 2;
+ uint keystart= uint2korr(ptr);
+ ptr+= 2;
+ /* set create_rename_lsn (for maria_read_log to be idempotent) */
+ lsn_store(ptr + sizeof(info->s->state.header) + 2, rec->lsn);
+ /* we also set is_of_horizon, like maria_create() does */
+ lsn_store(ptr + sizeof(info->s->state.header) + 2 + LSN_STORE_SIZE,
+ rec->lsn);
+ if (my_pwrite(kfile, ptr,
+ kfile_size_before_extension, 0, MYF(MY_NABP|MY_WME)) ||
+ my_chsize(kfile, keystart, 0, MYF(MY_WME)))
+ {
+ tprint(tracef, "Failed to write to index file\n");
+ goto end;
+ }
+ if (!(flags & HA_DONT_TOUCH_DATA))
+ {
+ fn_format(filename,name,"", MARIA_NAME_DEXT,
+ MY_UNPACK_FILENAME | MY_APPEND_EXT);
+ linkname_ptr= NULL;
+ create_flag=MY_DELETE_OLD;
+ if (((dfile=
+ my_create_with_symlink(linkname_ptr, filename, 0, create_mode,
+ MYF(MY_WME | create_flag))) < 0) ||
+ my_close(dfile, MYF(MY_WME)))
+ {
+ tprint(tracef, "Failed to create data file\n");
+ goto end;
+ }
+ /*
+ we now have an empty data file. To be able to
+ _ma_initialize_data_file() we need some pieces of the share to be
+ correctly filled. So we just open the table (fortunately, an empty
+ data file does not preclude this).
+ */
+ if (((info= maria_open(name, O_RDONLY, 0)) == NULL) ||
+ _ma_initialize_data_file(info->s, info->dfile.file))
+ {
+ tprint(tracef, "Failed to open new table or write to data file\n");
+ goto end;
+ }
+ }
+ error= 0;
+end:
+ tprint(tracef, "\n");
+ if (kfile >= 0)
+ error|= my_close(kfile, MYF(MY_WME));
+ if (info != NULL)
+ error|= maria_close(info);
+ return error;
+}
+
+
+prototype_redo_exec_hook(REDO_RENAME_TABLE)
+{
+ char *old_name, *new_name;
+ int error= 1;
+ MARIA_HA *info= NULL;
+ if (skip_DDLs)
+ {
+ tprint(tracef, "we skip DDLs\n");
+ return 0;
+ }
+ enlarge_buffer(rec);
+ if (log_record_buffer.str == NULL ||
+ translog_read_record(rec->lsn, 0, rec->record_length,
+ log_record_buffer.str, NULL) !=
+ rec->record_length)
+ {
+ tprint(tracef, "Failed to read record\n");
+ goto end;
+ }
+ old_name= log_record_buffer.str;
+ new_name= old_name + strlen(old_name) + 1;
+ tprint(tracef, "Table '%s' to rename to '%s'; old-name table ", old_name,
+ new_name);
+ /*
+ Here is why we skip CREATE/DROP/RENAME when doing a recovery from
+ ha_maria (whereas we do when called from maria_read_log). Consider:
+ CREATE TABLE t;
+ RENAME TABLE t to u;
+ DROP TABLE u;
+ RENAME TABLE v to u; # crash between index rename and data rename.
+ And do a Recovery (not removing tables beforehand).
+ Recovery replays CREATE, then RENAME: the maria_open("t") works,
+ maria_open("u") does not (no data file) so table "u" is considered
+ inexistent and so maria_rename() is done which overwrites u's index file,
+ which is lost. Ok, the data file (v.MAD) is still available, but only a
+ REPAIR USE_FRM can rebuild the index, which is unsafe and downtime.
+ So it is preferrable to not execute RENAME, and leave the "mess" of files,
+ rather than possibly destroy a file. DBA will manually rename files.
+ A safe recovery method would probably require checking the existence of
+ the index file and of the data file separately (not via maria_open()), and
+ maybe also to store a create_rename_lsn in the data file too
+ For now, all we risk is to leave the mess (half-renamed files) left by the
+ crash. We however sync files and directories at each file rename. The SQL
+ layer is anyway not crash-safe for DDLs (except the repartioning-related
+ ones).
+ We replay DDLs in maria_read_log to be able to recreate tables from
+ scratch. It means that "maria_read_log -a" should not be used on a
+ database which just crashed during a DDL. And also ALTER TABLE does not
+ log insertions of records into the temporary table, so replaying may
+ fail (see comment and warning in maria_apply_log()).
+ */
+ info= maria_open(old_name, O_RDONLY, HA_OPEN_FOR_REPAIR);
+ if (info)
+ {
+ MARIA_SHARE *share= info->s;
+ /*
+ We may have open instances on this table. But it does not matter, the
+ maria_extra() below will take care of them.
+ */
+ if (!share->base.born_transactional)
+ {
+ tprint(tracef, ", is not transactional, ignoring renaming\n");
+ ALERT_USER();
+ error= 0;
+ goto end;
+ }
+ if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0)
+ {
+ tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than"
+ " record, ignoring renaming",
+ LSN_IN_PARTS(share->state.create_rename_lsn));
+ error= 0;
+ goto end;
+ }
+ if (maria_is_crashed(info))
+ {
+ tprint(tracef, ", is crashed, can't rename it");
+ ALERT_USER();
+ goto end;
+ }
+ /*
+ This maria_extra() call serves to signal that old open instances of
+ this table should not be used anymore, and (only on Windows) to close
+ open files so they can be renamed
+ */
+ if (maria_extra(info, HA_EXTRA_PREPARE_FOR_RENAME, NULL) ||
+ maria_close(info))
+ goto end;
+ info= NULL;
+ tprint(tracef, ", is ok for renaming; new-name table ");
+ }
+ else /* one or two files absent, or header corrupted... */
+ {
+ tprint(tracef, ", can't be opened, probably does not exist");
+ error= 0;
+ goto end;
+ }
+ /*
+ We must also check the create_rename_lsn of the 'new_name' table if it
+ exists: otherwise we may, with our rename which overwrites, destroy
+ another table. For example:
+ CREATE TABLE t;
+ RENAME t to u;
+ DROP TABLE u;
+ RENAME v to u; # v is an old table, its creation/insertions not in log
+ And start executing the log (without removing tables beforehand): creates
+ t, renames it to u (if not testing create_rename_lsn) thus overwriting
+ old-named v, drops u, and we are stuck, we have lost data.
+ */
+ info= maria_open(new_name, O_RDONLY, HA_OPEN_FOR_REPAIR);
+ if (info)
+ {
+ MARIA_SHARE *share= info->s;
+ /* We should not have open instances on this table. */
+ if (share->reopen != 1)
+ {
+ tprint(tracef, ", is already open (reopen=%u)\n", share->reopen);
+ ALERT_USER();
+ goto end;
+ }
+ if (!share->base.born_transactional)
+ {
+ tprint(tracef, ", is not transactional, ignoring renaming\n");
+ ALERT_USER();
+ goto drop;
+ }
+ if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0)
+ {
+ tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than"
+ " record, ignoring renaming",
+ LSN_IN_PARTS(share->state.create_rename_lsn));
+ /*
+ We have to drop the old_name table. Consider:
+ CREATE TABLE t;
+ CREATE TABLE v;
+ RENAME TABLE t to u;
+ DROP TABLE u;
+ RENAME TABLE v to u;
+ and apply the log without removing tables beforehand. t will be
+ created, v too; in REDO_RENAME u will be more recent, but we still
+ have to drop t otherwise it stays.
+ */
+ goto drop;
+ }
+ if (maria_is_crashed(info))
+ {
+ tprint(tracef, ", is crashed, can't rename it");
+ ALERT_USER();
+ goto end;
+ }
+ if (maria_close(info))
+ goto end;
+ info= NULL;
+ /* abnormal situation */
+ tprint(tracef, ", exists but is older than record, can't rename it");
+ goto end;
+ }
+ else /* one or two files absent, or header corrupted... */
+ tprint(tracef, ", can't be opened, probably does not exist");
+ tprint(tracef, ", renaming '%s'", old_name);
+ if (maria_rename(old_name, new_name))
+ {
+ tprint(tracef, "Failed to rename table\n");
+ goto end;
+ }
+ info= maria_open(new_name, O_RDONLY, 0);
+ if (info == NULL)
+ {
+ tprint(tracef, "Failed to open renamed table\n");
+ goto end;
+ }
+ if (_ma_update_create_rename_lsn(info->s, rec->lsn, TRUE))
+ goto end;
+ if (maria_close(info))
+ goto end;
+ info= NULL;
+ error= 0;
+ goto end;
+drop:
+ tprint(tracef, ", only dropping '%s'", old_name);
+ if (maria_delete_table(old_name))
+ {
+ tprint(tracef, "Failed to drop table\n");
+ goto end;
+ }
+ error= 0;
+ goto end;
+end:
+ tprint(tracef, "\n");
+ if (info != NULL)
+ error|= maria_close(info);
+ return error;
+}
+
+
+/*
+ The record may come from REPAIR, ALTER TABLE ENABLE KEYS, OPTIMIZE.
+*/
+prototype_redo_exec_hook(REDO_REPAIR_TABLE)
+{
+ int error= 1;
+ MARIA_HA *info;
+ if (skip_DDLs)
+ {
+ /*
+ REPAIR is not exactly a DDL, but it manipulates files without logging
+ insertions into them.
+ */
+ tprint(tracef, "we skip DDLs\n");
+ return 0;
+ }
+ if ((info= get_MARIA_HA_from_REDO_record(rec)) == NULL)
+ return 0;
+ /*
+ Otherwise, the mapping is newer than the table, and our record is newer
+ than the mapping, so we can repair.
+ */
+ tprint(tracef, " repairing...\n");
+ /**
+ @todo RECOVERY BUG fix this:
+ the maria_chk_init() call causes a heap of linker errors in ha_maria.cc!
+ */
+#if 0
+ HA_CHECK param;
+ maria_chk_init(&param);
+ param.isam_file_name= info->s->open_file_name;
+ param.testflag= uint4korr(rec->header);
+ if (maria_repair(&param, info, info->s->open_file_name,
+ param.testflag & T_QUICK))
+ goto end;
+ if (_ma_update_create_rename_lsn(info->s, rec->lsn, TRUE))
+ goto end;
+ error= 0;
+end:
+ return error;
+#else
+ DBUG_ASSERT("fix this table repairing" == NULL);
+ return error;
+#endif
+}
+
+
+prototype_redo_exec_hook(REDO_DROP_TABLE)
+{
+ char *name;
+ int error= 1;
+ MARIA_HA *info= NULL;
+ if (skip_DDLs)
+ {
+ tprint(tracef, "we skip DDLs\n");
+ return 0;
+ }
+ enlarge_buffer(rec);
+ if (log_record_buffer.str == NULL ||
+ translog_read_record(rec->lsn, 0, rec->record_length,
+ log_record_buffer.str, NULL) !=
+ rec->record_length)
+ {
+ tprint(tracef, "Failed to read record\n");
+ goto end;
+ }
+ name= log_record_buffer.str;
+ tprint(tracef, "Table '%s'", name);
+ info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR);
+ if (info)
+ {
+ MARIA_SHARE *share= info->s;
+ /*
+ We may have open instances on this table. But it does not matter, the
+ maria_extra() below will take care of them.
+ */
+ if (!share->base.born_transactional)
+ {
+ tprint(tracef, ", is not transactional, ignoring removal\n");
+ ALERT_USER();
+ error= 0;
+ goto end;
+ }
+ if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0)
+ {
+ tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than"
+ " record, ignoring removal",
+ LSN_IN_PARTS(share->state.create_rename_lsn));
+ error= 0;
+ goto end;
+ }
+ if (maria_is_crashed(info))
+ {
+ tprint(tracef, ", is crashed, can't drop it");
+ ALERT_USER();
+ goto end;
+ }
+ /*
+ This maria_extra() call serves to signal that old open instances of
+ this table should not be used anymore, and (only on Windows) to close
+ open files so they can be deleted
+ */
+ if (maria_extra(info, HA_EXTRA_PREPARE_FOR_DROP, NULL) ||
+ maria_close(info))
+ goto end;
+ info= NULL;
+ /* if it is older, or its header is corrupted, drop it */
+ tprint(tracef, ", dropping '%s'", name);
+ if (maria_delete_table(name))
+ {
+ tprint(tracef, "Failed to drop table\n");
+ goto end;
+ }
+ }
+ else /* one or two files absent, or header corrupted... */
+ tprint(tracef,", can't be opened, probably does not exist");
+ error= 0;
+end:
+ tprint(tracef, "\n");
+ if (info != NULL)
+ error|= maria_close(info);
+ return error;
+}
+
+
+prototype_redo_exec_hook(FILE_ID)
+{
+ uint16 sid;
+ int error= 1;
+ const char *name;
+ MARIA_HA *info;
+
+ if (cmp_translog_addr(rec->lsn, checkpoint_start) < 0)
+ {
+ /*
+ If that mapping was still true at checkpoint time, it was found in
+ checkpoint record, no need to recreate it. If that mapping had ended at
+ checkpoint time (table was closed or repaired), a flush and force
+ happened and so mapping is not needed.
+ */
+ tprint(tracef, "ignoring because before checkpoint\n");
+ return 0;
+ }
+
+ enlarge_buffer(rec);
+ if (log_record_buffer.str == NULL ||
+ translog_read_record(rec->lsn, 0, rec->record_length,
+ log_record_buffer.str, NULL) !=
+ rec->record_length)
+ {
+ tprint(tracef, "Failed to read record\n");
+ goto end;
+ }
+ sid= fileid_korr(log_record_buffer.str);
+ info= all_tables[sid].info;
+ if (info != NULL)
+ {
+ tprint(tracef, " Closing table '%s'\n", info->s->open_file_name);
+ prepare_table_for_close(info, rec->lsn);
+ if (maria_close(info))
+ {
+ tprint(tracef, "Failed to close table\n");
+ goto end;
+ }
+ all_tables[sid].info= NULL;
+ }
+ name= log_record_buffer.str + FILEID_STORE_SIZE;
+ if (new_table(sid, name, -1, -1, rec->lsn))
+ goto end;
+ error= 0;
+end:
+ return error;
+}
+
+
+static int new_table(uint16 sid, const char *name,
+ File org_kfile, File org_dfile,
+ LSN lsn_of_file_id)
+{
+ /*
+ -1 (skip table): close table and return 0;
+ 1 (error): close table and return 1;
+ 0 (success): leave table open and return 0.
+ */
+ int error= 1;
+
+ tprint(tracef, "Table '%s', id %u", name, sid);
+ MARIA_HA *info= maria_open(name, O_RDWR, HA_OPEN_FOR_REPAIR);
+ if (info == NULL)
+ {
+ tprint(tracef, ", is absent (must have been dropped later?)"
+ " or its header is so corrupted that we cannot open it;"
+ " we skip it\n");
+ error= 0;
+ goto end;
+ }
+ if (maria_is_crashed(info))
+ {
+ tprint(tracef, "Table is crashed, can't apply log records to it\n");
+ goto end;
+ }
+ MARIA_SHARE *share= info->s;
+ /* check that we're not already using it */
+ if (share->reopen != 1)
+ {
+ tprint(tracef, ", is already open (reopen=%u)\n", share->reopen);
+ ALERT_USER();
+ goto end;
+ }
+ DBUG_ASSERT(share->now_transactional == share->base.born_transactional);
+ if (!share->base.born_transactional)
+ {
+ tprint(tracef, ", is not transactional\n");
+ ALERT_USER();
+ error= -1;
+ goto end;
+ }
+ if (cmp_translog_addr(lsn_of_file_id, share->state.create_rename_lsn) <= 0)
+ {
+ tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than"
+ " LOGREC_FILE_ID's LSN (%lu,0x%lx), ignoring open request",
+ LSN_IN_PARTS(share->state.create_rename_lsn),
+ LSN_IN_PARTS(lsn_of_file_id));
+ error= -1;
+ goto end;
+ }
+ /* don't log any records for this work */
+ _ma_tmp_disable_logging_for_table(share);
+ /* execution of some REDO records relies on data_file_length */
+ my_off_t dfile_len= my_seek(info->dfile.file, 0, SEEK_END, MYF(MY_WME));
+ my_off_t kfile_len= my_seek(info->s->kfile.file, 0, SEEK_END, MYF(MY_WME));
+ if ((dfile_len == MY_FILEPOS_ERROR) ||
+ (kfile_len == MY_FILEPOS_ERROR))
+ {
+ tprint(tracef, ", length unknown\n");
+ goto end;
+ }
+ share->state.state.data_file_length= dfile_len;
+ share->state.state.key_file_length= kfile_len;
+ if ((dfile_len % share->block_size) > 0)
+ {
+ tprint(tracef, ", has too short last page\n");
+ /* Recovery will fix this, no error */
+ ALERT_USER();
+ }
+ /*
+ This LSN serves in this situation; assume log is:
+ FILE_ID(6->"t2") REDO_INSERT(6) FILE_ID(6->"t1") CHECKPOINT(6->"t1")
+ then crash, checkpoint record is parsed and opens "t1" with id 6; assume
+ REDO phase starts from the REDO_INSERT above: it will wrongly try to
+ update a page of "t1". With this LSN below, REDO_INSERT can realize the
+ mapping is newer than itself, and not execute.
+ Same example is possible with UNDO_INSERT (update of the state).
+ */
+ info->s->lsn_of_file_id= lsn_of_file_id;
+ all_tables[sid].info= info;
+ all_tables[sid].org_kfile= org_kfile;
+ all_tables[sid].org_dfile= org_dfile;
+ /*
+ We don't set info->s->id, it would be useless (no logging in REDO phase);
+ if you change that, know that some records in REDO phase call
+ _ma_update_create_rename_lsn() which resets info->s->id.
+ */
+ tprint(tracef, ", opened");
+ error= 0;
+end:
+ tprint(tracef, "\n");
+ if (error)
+ {
+ if (info != NULL)
+ maria_close(info);
+ if (error == -1)
+ error= 0;
+ }
+ return error;
+}
+
+
+prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD)
+{
+ int error= 1;
+ uchar *buff= NULL;
+ MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+ if (info == NULL)
+ {
+ /*
+ Table was skipped at open time (because later dropped/renamed, not
+ transactional, or create_rename_lsn newer than LOGREC_FILE_ID); it is
+ not an error.
+ */
+ return 0;
+ }
+ /*
+ If REDO's LSN is > page's LSN (read from disk), we are going to modify the
+ page and change its LSN. The normal runtime code stores the UNDO's LSN
+ into the page. Here storing the REDO's LSN (rec->lsn) would work
+ (we are not writing to the log here, so don't have to "flush up to UNDO's
+ LSN"). But in a test scenario where we do updates at runtime, then remove
+ tables, apply the log and check that this results in the same table as at
+ runtime, putting the same LSN as runtime had done will decrease
+ differences. So we use the UNDO's LSN which is current_group_end_lsn.
+ */
+ enlarge_buffer(rec);
+ if (log_record_buffer.str == NULL)
+ {
+ tprint(tracef, "Failed to read allocate buffer for record\n");
+ goto end;
+ }
+ if (translog_read_record(rec->lsn, 0, rec->record_length,
+ log_record_buffer.str, NULL) !=
+ rec->record_length)
+ {
+ tprint(tracef, "Failed to read record\n");
+ goto end;
+ }
+ buff= log_record_buffer.str;
+ if (_ma_apply_redo_insert_row_head_or_tail(info, current_group_end_lsn,
+ HEAD_PAGE,
+ buff + FILEID_STORE_SIZE,
+ buff +
+ FILEID_STORE_SIZE +
+ PAGE_STORE_SIZE +
+ DIRPOS_STORE_SIZE,
+ rec->record_length -
+ (FILEID_STORE_SIZE +
+ PAGE_STORE_SIZE +
+ DIRPOS_STORE_SIZE)))
+ goto end;
+ error= 0;
+end:
+ return error;
+}
+
+
+prototype_redo_exec_hook(REDO_INSERT_ROW_TAIL)
+{
+ int error= 1;
+ uchar *buff;
+ MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+ if (info == NULL)
+ return 0;
+ enlarge_buffer(rec);
+ if (log_record_buffer.str == NULL ||
+ translog_read_record(rec->lsn, 0, rec->record_length,
+ log_record_buffer.str, NULL) !=
+ rec->record_length)
+ {
+ tprint(tracef, "Failed to read record\n");
+ goto end;
+ }
+ buff= log_record_buffer.str;
+ if (_ma_apply_redo_insert_row_head_or_tail(info, current_group_end_lsn,
+ TAIL_PAGE,
+ buff + FILEID_STORE_SIZE,
+ buff +
+ FILEID_STORE_SIZE +
+ PAGE_STORE_SIZE +
+ DIRPOS_STORE_SIZE,
+ rec->record_length -
+ (FILEID_STORE_SIZE +
+ PAGE_STORE_SIZE +
+ DIRPOS_STORE_SIZE)))
+ goto end;
+ error= 0;
+
+end:
+ return error;
+}
+
+
+prototype_redo_exec_hook(REDO_PURGE_ROW_HEAD)
+{
+ int error= 1;
+ MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+ if (info == NULL)
+ return 0;
+ if (_ma_apply_redo_purge_row_head_or_tail(info, current_group_end_lsn,
+ HEAD_PAGE,
+ rec->header + FILEID_STORE_SIZE))
+ goto end;
+ error= 0;
+end:
+ return error;
+}
+
+
+prototype_redo_exec_hook(REDO_PURGE_ROW_TAIL)
+{
+ int error= 1;
+ MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+ if (info == NULL)
+ return 0;
+ if (_ma_apply_redo_purge_row_head_or_tail(info, current_group_end_lsn,
+ TAIL_PAGE,
+ rec->header + FILEID_STORE_SIZE))
+ goto end;
+ error= 0;
+end:
+ return error;
+}
+
+
+prototype_redo_exec_hook(REDO_PURGE_BLOCKS)
+{
+ int error= 1;
+ uchar *buff;
+ MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+ if (info == NULL)
+ return 0;
+ enlarge_buffer(rec);
+
+ if (log_record_buffer.str == NULL ||
+ translog_read_record(rec->lsn, 0, rec->record_length,
+ log_record_buffer.str, NULL) !=
+ rec->record_length)
+ {
+ tprint(tracef, "Failed to read record\n");
+ goto end;
+ }
+
+ buff= log_record_buffer.str;
+ if (_ma_apply_redo_purge_blocks(info, current_group_end_lsn,
+ buff + FILEID_STORE_SIZE))
+ goto end;
+ error= 0;
+end:
+ return error;
+}
+
+
+prototype_redo_exec_hook(REDO_DELETE_ALL)
+{
+ int error= 1;
+ MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+ if (info == NULL)
+ return 0;
+ tprint(tracef, " deleting all %lu rows\n",
+ (ulong)info->s->state.state.records);
+ if (maria_delete_all_rows(info))
+ goto end;
+ error= 0;
+end:
+ return error;
+}
+
+
+#define set_undo_lsn_for_active_trans(TRID, LSN) do { \
+ all_active_trans[TRID].undo_lsn= LSN; \
+ if (all_active_trans[TRID].first_undo_lsn == LSN_IMPOSSIBLE) \
+ all_active_trans[TRID].first_undo_lsn= LSN; } while (0)
+
+prototype_redo_exec_hook(UNDO_ROW_INSERT)
+{
+ MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+ if (info == NULL)
+ return 0;
+ set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn);
+ if (cmp_translog_addr(rec->lsn, info->s->state.is_of_horizon) >= 0)
+ {
+ tprint(tracef, " state older than record, updating rows' count\n");
+ info->s->state.state.records++;
+ /** @todo RECOVERY BUG Also update the table's checksum */
+ /**
+ @todo some bits below will rather be set when executing UNDOs related
+ to keys
+ */
+ info->s->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED |
+ STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES;
+ }
+ tprint(tracef, " rows' count %lu\n", (ulong)info->s->state.state.records);
+ return 0;
+}
+
+
+prototype_redo_exec_hook(UNDO_ROW_DELETE)
+{
+ MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+ if (info == NULL)
+ return 0;
+ set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn);
+ if (cmp_translog_addr(rec->lsn, info->s->state.is_of_horizon) >= 0)
+ {
+ tprint(tracef, " state older than record, updating rows' count\n");
+ info->s->state.state.records--;
+ info->s->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED |
+ STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES;
+ }
+ tprint(tracef, " rows' count %lu\n", (ulong)info->s->state.state.records);
+ return 0;
+}
+
+
+prototype_redo_exec_hook(UNDO_ROW_UPDATE)
+{
+ MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+ if (info == NULL)
+ return 0;
+ set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn);
+ if (cmp_translog_addr(rec->lsn, info->s->state.is_of_horizon) >= 0)
+ {
+ info->s->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED |
+ STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES;
+ }
+ return 0;
+}
+
+
+prototype_redo_exec_hook(COMMIT)
+{
+ uint16 sid= rec->short_trid;
+ TrID long_trid= all_active_trans[sid].long_trid;
+ LSN gslsn= all_active_trans[sid].group_start_lsn;
+ char llbuf[22];
+ if (long_trid == 0)
+ {
+ tprint(tracef, "We don't know about transaction with short_trid %u;"
+ "it probably committed long ago, forget it\n", sid);
+ return 0;
+ }
+ llstr(long_trid, llbuf);
+ tprint(tracef, "Transaction long_trid %s short_trid %u committed", llbuf, sid);
+ if (gslsn != LSN_IMPOSSIBLE)
+ {
+ /*
+ It's not an error, it may be that trn got a disk error when writing to a
+ table, so an unfinished group staid in the log.
+ */
+ tprint(tracef, ", with group at LSN (%lu,0x%lx) short_trid %u aborted\n",
+ LSN_IN_PARTS(gslsn), sid);
+ all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE;
+ }
+ else
+ tprint(tracef, "\n");
+ bzero(&all_active_trans[sid], sizeof(all_active_trans[sid]));
+#ifdef MARIA_VERSIONING
+ /*
+ if real recovery:
+ transaction was committed, move it to some separate list for later
+ purging (but don't purge now! purging may have been started before, we
+ may find REDO_PURGE records soon).
+ */
+#endif
+ return 0;
+}
+
+
+prototype_redo_exec_hook(CLR_END)
+{
+ MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+ if (info == NULL)
+ return 0;
+ LSN previous_undo_lsn= lsn_korr(rec->header);
+ enum translog_record_type undone_record_type=
+ (rec->header)[LSN_STORE_SIZE + FILEID_STORE_SIZE];
+ const LOG_DESC *log_desc= &log_record_type_descriptor[undone_record_type];
+
+ set_undo_lsn_for_active_trans(rec->short_trid, previous_undo_lsn);
+ tprint(tracef, " CLR_END was about %s, undo_lsn now LSN (%lu,0x%lx)\n",
+ log_desc->name, LSN_IN_PARTS(previous_undo_lsn));
+ if (cmp_translog_addr(rec->lsn, info->s->state.is_of_horizon) >= 0)
+ {
+ tprint(tracef, " state older than record, updating rows' count\n");
+ switch (undone_record_type) {
+ case LOGREC_UNDO_ROW_DELETE:
+ info->s->state.state.records++;
+ break;
+ case LOGREC_UNDO_ROW_INSERT:
+ info->s->state.state.records--;
+ break;
+ case LOGREC_UNDO_ROW_UPDATE:
+ break;
+ default:
+ DBUG_ASSERT(0);
+ }
+ info->s->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED |
+ STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES;
+ }
+ tprint(tracef, " rows' count %lu\n", (ulong)info->s->state.state.records);
+ return 0;
+}
+
+
+prototype_undo_exec_hook(UNDO_ROW_INSERT)
+{
+ my_bool error;
+ MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+ LSN previous_undo_lsn= lsn_korr(rec->header);
+
+ if (info == NULL)
+ {
+ /*
+ Unlike for REDOs, if the table was skipped it is abnormal; we have a
+ transaction to rollback which used this table, as it is not rolled back
+ it was supposed to hold this table and so the table should still be
+ there.
+ */
+ return 1;
+ }
+ info->s->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED |
+ STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES;
+
+ info->trn= trn;
+ error= _ma_apply_undo_row_insert(info, previous_undo_lsn,
+ rec->header + LSN_STORE_SIZE +
+ FILEID_STORE_SIZE);
+ info->trn= 0;
+ /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */
+ tprint(tracef, " rows' count %lu\n", (ulong)info->s->state.state.records);
+ tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n",
+ LSN_IN_PARTS(previous_undo_lsn));
+ return error;
+}
+
+
+prototype_undo_exec_hook(UNDO_ROW_DELETE)
+{
+ my_bool error;
+ MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+ LSN previous_undo_lsn= lsn_korr(rec->header);
+
+ if (info == NULL)
+ return 1;
+
+ info->s->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED |
+ STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES;
+
+ enlarge_buffer(rec);
+ if (log_record_buffer.str == NULL ||
+ translog_read_record(rec->lsn, 0, rec->record_length,
+ log_record_buffer.str, NULL) !=
+ rec->record_length)
+ {
+ tprint(tracef, "Failed to read record\n");
+ return 1;
+ }
+
+ info->trn= trn;
+ /*
+ For now we skip the page and directory entry. This is to be used
+ later when we mark rows as deleted.
+ */
+ error= _ma_apply_undo_row_delete(info, previous_undo_lsn,
+ log_record_buffer.str + LSN_STORE_SIZE +
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE +
+ DIRPOS_STORE_SIZE,
+ rec->record_length -
+ (LSN_STORE_SIZE + FILEID_STORE_SIZE +
+ PAGE_STORE_SIZE + DIRPOS_STORE_SIZE));
+ info->trn= 0;
+ tprint(tracef, " rows' count %lu\n undo_lsn now LSN (%lu,0x%lx)\n",
+ (ulong)info->s->state.state.records,
+ LSN_IN_PARTS(previous_undo_lsn));
+ return error;
+}
+
+
+prototype_undo_exec_hook(UNDO_ROW_UPDATE)
+{
+ my_bool error;
+ MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+ LSN previous_undo_lsn= lsn_korr(rec->header);
+
+ if (info == NULL)
+ return 1;
+
+ info->s->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED |
+ STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES;
+
+ enlarge_buffer(rec);
+ if (log_record_buffer.str == NULL ||
+ translog_read_record(rec->lsn, 0, rec->record_length,
+ log_record_buffer.str, NULL) !=
+ rec->record_length)
+ {
+ tprint(tracef, "Failed to read record\n");
+ return 1;
+ }
+
+ info->trn= trn;
+ error= _ma_apply_undo_row_update(info, previous_undo_lsn,
+ log_record_buffer.str + LSN_STORE_SIZE +
+ FILEID_STORE_SIZE,
+ rec->record_length -
+ (LSN_STORE_SIZE + FILEID_STORE_SIZE));
+ info->trn= 0;
+ tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n",
+ LSN_IN_PARTS(previous_undo_lsn));
+ return error;
+}
+
+
+static int run_redo_phase(LSN lsn, my_bool apply)
+{
+ /* install hooks for execution */
+#define install_redo_exec_hook(R) \
+ log_record_type_descriptor[LOGREC_ ## R].record_execute_in_redo_phase= \
+ exec_REDO_LOGREC_ ## R;
+#define install_undo_exec_hook(R) \
+ log_record_type_descriptor[LOGREC_ ## R].record_execute_in_undo_phase= \
+ exec_UNDO_LOGREC_ ## R;
+ install_redo_exec_hook(LONG_TRANSACTION_ID);
+ install_redo_exec_hook(CHECKPOINT);
+ install_redo_exec_hook(REDO_CREATE_TABLE);
+ install_redo_exec_hook(REDO_RENAME_TABLE);
+ install_redo_exec_hook(REDO_REPAIR_TABLE);
+ install_redo_exec_hook(REDO_DROP_TABLE);
+ install_redo_exec_hook(FILE_ID);
+ install_redo_exec_hook(REDO_INSERT_ROW_HEAD);
+ install_redo_exec_hook(REDO_INSERT_ROW_TAIL);
+ install_redo_exec_hook(REDO_PURGE_ROW_HEAD);
+ install_redo_exec_hook(REDO_PURGE_ROW_TAIL);
+ install_redo_exec_hook(REDO_PURGE_BLOCKS);
+ install_redo_exec_hook(REDO_DELETE_ALL);
+ install_redo_exec_hook(UNDO_ROW_INSERT);
+ install_redo_exec_hook(UNDO_ROW_DELETE);
+ install_redo_exec_hook(UNDO_ROW_UPDATE);
+ install_redo_exec_hook(COMMIT);
+ install_redo_exec_hook(CLR_END);
+ install_undo_exec_hook(UNDO_ROW_INSERT);
+ install_undo_exec_hook(UNDO_ROW_DELETE);
+ install_undo_exec_hook(UNDO_ROW_UPDATE);
+
+ current_group_end_lsn= LSN_IMPOSSIBLE;
+
+ TRANSLOG_HEADER_BUFFER rec;
+
+ if (unlikely(lsn == LSN_IMPOSSIBLE || lsn == translog_get_horizon()))
+ {
+ tprint(tracef, "checkpoint address refers to the log end log or "
+ "log is empty, nothing to do.\n");
+ return 0;
+ }
+
+ int len= translog_read_record_header(lsn, &rec);
+
+ /** @todo EOF should be detected */
+ if (len == RECHEADER_READ_ERROR)
+ {
+ tprint(tracef, "Failed to read header of the first record.\n");
+ return 1;
+ }
+ struct st_translog_scanner_data scanner;
+ if (translog_init_scanner(lsn, 1, &scanner))
+ {
+ tprint(tracef, "Scanner init failed\n");
+ return 1;
+ }
+ uint i;
+ for (i= 1;;i++)
+ {
+ uint16 sid= rec.short_trid;
+ const LOG_DESC *log_desc= &log_record_type_descriptor[rec.type];
+ display_record_position(log_desc, &rec, i);
+ /*
+ A complete group is a set of log records with an "end mark" record
+ (e.g. a set of REDOs for an operation, terminated by an UNDO for this
+ operation); if there is no "end mark" record the group is incomplete
+ and won't be executed.
+ */
+ if ((log_desc->record_in_group == LOGREC_IS_GROUP_ITSELF) ||
+ (log_desc->record_in_group == LOGREC_LAST_IN_GROUP))
+ {
+ if (all_active_trans[sid].group_start_lsn != LSN_IMPOSSIBLE)
+ {
+ if (log_desc->record_in_group == LOGREC_IS_GROUP_ITSELF)
+ {
+ /*
+ can happen if the transaction got a table write error, then
+ unlocked tables thus wrote a COMMIT record.
+ */
+ tprint(tracef, "\nDiscarding unfinished group before this record\n");
+ ALERT_USER();
+ all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE;
+ }
+ else
+ {
+ /*
+ There is a complete group for this transaction, containing more
+ than this event.
+ */
+ tprint(tracef, " ends a group:\n");
+ struct st_translog_scanner_data scanner2;
+ TRANSLOG_HEADER_BUFFER rec2;
+ len=
+ translog_read_record_header(all_active_trans[sid].group_start_lsn, &rec2);
+ if (len < 0) /* EOF or error */
+ {
+ tprint(tracef, "Cannot find record where it should be\n");
+ return 1;
+ }
+ if (translog_init_scanner(rec2.lsn, 1, &scanner2))
+ {
+ tprint(tracef, "Scanner2 init failed\n");
+ return 1;
+ }
+ current_group_end_lsn= rec.lsn;
+ do
+ {
+ if (rec2.short_trid == sid) /* it's in our group */
+ {
+ const LOG_DESC *log_desc2= &log_record_type_descriptor[rec2.type];
+ display_record_position(log_desc2, &rec2, 0);
+ if (apply && display_and_apply_record(log_desc2, &rec2))
+ return 1;
+ }
+ len= translog_read_next_record_header(&scanner2, &rec2);
+ if (len < 0) /* EOF or error */
+ {
+ tprint(tracef, "Cannot find record where it should be\n");
+ return 1;
+ }
+ }
+ while (rec2.lsn < rec.lsn);
+ translog_free_record_header(&rec2);
+ /* group finished */
+ all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE;
+ current_group_end_lsn= LSN_IMPOSSIBLE; /* for debugging */
+ display_record_position(log_desc, &rec, 0);
+ }
+ }
+ if (apply && display_and_apply_record(log_desc, &rec))
+ return 1;
+ }
+ else /* record does not end group */
+ {
+ /* just record the fact, can't know if can execute yet */
+ if (all_active_trans[sid].group_start_lsn == LSN_IMPOSSIBLE)
+ {
+ /* group not yet started */
+ all_active_trans[sid].group_start_lsn= rec.lsn;
+ }
+ }
+ len= translog_read_next_record_header(&scanner, &rec);
+ if (len < 0)
+ {
+ switch (len)
+ {
+ case RECHEADER_READ_EOF:
+ tprint(tracef, "EOF on the log\n");
+ break;
+ case RECHEADER_READ_ERROR:
+ tprint(tracef, "Error reading log\n");
+ return 1;
+ }
+ break;
+ }
+ }
+ translog_free_record_header(&rec);
+ return 0;
+}
+
+
+/**
+ @brief Informs about any aborted groups or unfinished transactions,
+ prepares for the UNDO phase if needed.
+
+ @param prepare_for_undo_phase
+
+ @note Observe that it may init trnman.
+*/
+static uint end_of_redo_phase(my_bool prepare_for_undo_phase)
+{
+ uint sid, unfinished= 0;
+ char llbuf[22];
+
+ hash_free(&all_dirty_pages);
+ /*
+ hash_free() can be called multiple times probably, but be safe it that
+ changes
+ */
+ bzero(&all_dirty_pages, sizeof(all_dirty_pages));
+ my_free(dirty_pages_pool, MYF(MY_ALLOW_ZERO_PTR));
+ dirty_pages_pool= NULL;
+
+ llstr(max_long_trid, llbuf);
+ tprint(tracef, "Maximum transaction long id seen: %s\n", llbuf);
+ if (prepare_for_undo_phase && trnman_init(max_long_trid))
+ return -1;
+
+ for (sid= 0; sid <= SHORT_TRID_MAX; sid++)
+ {
+ TrID long_trid= all_active_trans[sid].long_trid;
+ LSN gslsn= all_active_trans[sid].group_start_lsn;
+ TRN *trn;
+ if (gslsn != LSN_IMPOSSIBLE)
+ {
+ tprint(tracef, "Group at LSN (%lu,0x%lx) short_trid %u aborted\n",
+ LSN_IN_PARTS(gslsn), sid);
+ ALERT_USER();
+ }
+ if (all_active_trans[sid].undo_lsn != LSN_IMPOSSIBLE)
+ {
+ char llbuf[22];
+ llstr(long_trid, llbuf);
+ tprint(tracef, "Transaction long_trid %s short_trid %u unfinished\n",
+ llbuf, sid);
+ /* dummy_transaction_object serves only for DDLs */
+ DBUG_ASSERT(long_trid != 0);
+ if (prepare_for_undo_phase)
+ {
+ if ((trn= trnman_recreate_trn_from_recovery(sid, long_trid)) == NULL)
+ return -1;
+ trn->undo_lsn= all_active_trans[sid].undo_lsn;
+ trn->first_undo_lsn= all_active_trans[sid].first_undo_lsn |
+ TRANSACTION_LOGGED_LONG_ID; /* because trn is known in log */
+ }
+ /* otherwise we will just warn about it */
+ unfinished++;
+ }
+#ifdef MARIA_VERSIONING
+ /*
+ If real recovery: if transaction was committed, move it to some separate
+ list for soon purging.
+ */
+#endif
+ }
+
+ my_free(all_active_trans, MYF(MY_ALLOW_ZERO_PTR));
+ all_active_trans= NULL;
+
+ /*
+ The UNDO phase uses some normal run-time code of ROLLBACK: generates log
+ records, etc; prepare tables for that
+ */
+ LSN addr= translog_get_horizon();
+ for (sid= 0; sid <= SHARE_ID_MAX; sid++)
+ {
+ MARIA_HA *info= all_tables[sid].info;
+ if (info != NULL)
+ {
+ prepare_table_for_close(info, addr);
+ /*
+ But we don't close it; we leave it available for the UNDO phase;
+ it's likely that the UNDO phase will need it.
+ */
+ if (prepare_for_undo_phase)
+ translog_assign_id_to_share_from_recovery(info->s, sid);
+ }
+ }
+
+#if 0 /* will be enabled soon */
+ if (prepare_for_undo_phase)
+ {
+ /*
+ We take a checkpoint as it can save future recovery work if we crash
+ soon. But we don't flush pages, as UNDOs would change them again
+ probably.
+ */
+ if (ma_checkpoint_init(FALSE))
+ return -1;
+ int res= ma_checkpoint_execute(CHECKPOINT_INDIRECT, FALSE);
+ ma_checkpoint_end();
+ if (res)
+ unfinished= -1;
+ }
+#endif
+
+ return unfinished;
+}
+
+
+static int run_undo_phase(uint unfinished)
+{
+ if (unfinished > 0)
+ {
+ if (tracef != stdout)
+ {
+ /** @todo RECOVERY BUG all prints to stderr should go to error log */
+ fprintf(stderr, " 100%%; transactions to roll back:");
+ }
+ tprint(tracef, "%u transactions will be rolled back\n", unfinished);
+ for( ; ; )
+ {
+ if (tracef != stdout)
+ fprintf(stderr, " %u", unfinished);
+ if ((unfinished--) == 0)
+ break;
+ char llbuf[22];
+ TRN *trn= trnman_get_any_trn();
+ DBUG_ASSERT(trn != NULL);
+ llstr(trn->trid, llbuf);
+ tprint(tracef, "Rolling back transaction of long id %s\n", llbuf);
+
+ /* Execute all undo entries */
+ while (trn->undo_lsn)
+ {
+ TRANSLOG_HEADER_BUFFER rec;
+ LOG_DESC *log_desc;
+ if (translog_read_record_header(trn->undo_lsn, &rec) ==
+ RECHEADER_READ_ERROR)
+ return 1;
+ log_desc= &log_record_type_descriptor[rec.type];
+ display_record_position(log_desc, &rec, 0);
+ if (log_desc->record_execute_in_undo_phase(&rec, trn))
+ {
+ tprint(tracef, "Got error when executing undo\n");
+ return 1;
+ }
+ }
+
+ if (trnman_rollback_trn(trn))
+ return 1;
+ /* We could want to span a few threads (4?) instead of 1 */
+ /* In the future, we want to have this phase *online* */
+ }
+ }
+ return 0;
+}
+
+
+/**
+ @brief re-enables transactionality, updates is_of_horizon
+
+ @param info table
+ @param horizon address to set is_of_horizon
+*/
+
+static void prepare_table_for_close(MARIA_HA *info, TRANSLOG_ADDRESS horizon)
+{
+ MARIA_SHARE *share= info->s;
+ /*
+ In a fully-forward REDO phase (no checkpoint record),
+ state is now at least as new as the LSN of the current record. It may be
+ newer, in case we are seeing a LOGREC_FILE_ID which tells us to close a
+ table, but that table was later modified further in the log.
+ But if we parsed a checkpoint record, it may be this way in the log:
+ FILE_ID(6->t2)... FILE_ID(6->t1)... CHECKPOINT(6->t1)
+ Checkpoint parsing opened t1 with id 6; first FILE_ID above is going to
+ make t1 close; the first condition below is however false (when checkpoint
+ was taken it increased is_of_horizon) and so it works. For safety we
+ add the second condition.
+ */
+ if (cmp_translog_addr(share->state.is_of_horizon, horizon) < 0 &&
+ cmp_translog_addr(share->lsn_of_file_id, horizon) < 0)
+ share->state.is_of_horizon= horizon;
+ _ma_reenable_logging_for_table(share);
+}
+
+
+static MARIA_HA *get_MARIA_HA_from_REDO_record(const
+ TRANSLOG_HEADER_BUFFER *rec)
+{
+ uint16 sid;
+ pgcache_page_no_t page;
+ MARIA_HA *info;
+ char llbuf[22];
+
+ print_redo_phase_progress(rec->lsn);
+ sid= fileid_korr(rec->header);
+ page= page_korr(rec->header + FILEID_STORE_SIZE);
+ /**
+ @todo RECOVERY BUG
+ - for REDO_PURGE_BLOCKS, page is not at this pos
+ - for DELETE_ALL, record ends here! buffer overrun!
+ Solution: caller should pass a param enum { i_am_about_data_file,
+ i_am_about_index_file, none }.
+ */
+ llstr(page, llbuf);
+ tprint(tracef, " For page %s of table of short id %u", llbuf, sid);
+ info= all_tables[sid].info;
+ if (info == NULL)
+ {
+ tprint(tracef, ", table skipped, so skipping record\n");
+ return NULL;
+ }
+ tprint(tracef, ", '%s'", info->s->open_file_name);
+ if (cmp_translog_addr(rec->lsn, info->s->lsn_of_file_id) <= 0)
+ {
+ /*
+ This can happen only if processing a record before the checkpoint
+ record.
+ id->name mapping is newer than REDO record: for sure the table subject
+ of the REDO has been flushed and forced (id re-assignment implies this);
+ REDO can be ignored (and must be, as we don't know what this subject
+ table was).
+ */
+ DBUG_ASSERT(cmp_translog_addr(rec->lsn, checkpoint_start) < 0);
+ tprint(tracef, ", table's LOGREC_FILE_ID has LSN (%lu,0x%lx) more recent"
+ " than record, skipping record",
+ LSN_IN_PARTS(info->s->lsn_of_file_id));
+ return NULL;
+ }
+ /* detect if an open instance of a dropped table (internal bug) */
+ DBUG_ASSERT(info->s->last_version != 0);
+ if (cmp_translog_addr(rec->lsn, checkpoint_start) < 0)
+ {
+ /**
+ @todo RECOVERY BUG always assuming this is REDO for data file, but it
+ could soon be index file
+ */
+ uint64 file_and_page_id=
+ (((uint64)all_tables[sid].org_dfile) << 32) | page;
+ struct st_dirty_page *dirty_page= (struct st_dirty_page *)
+ hash_search(&all_dirty_pages,
+ (uchar *)&file_and_page_id, sizeof(file_and_page_id));
+ if ((dirty_page == NULL) ||
+ cmp_translog_addr(rec->lsn, dirty_page->rec_lsn) < 0)
+ {
+ tprint(tracef, ", ignoring because of dirty_pages list\n");
+ return NULL;
+ }
+ }
+
+ /*
+ So we are going to read the page, and if its LSN is older than the
+ record's we will modify the page
+ */
+ tprint(tracef, ", applying record\n");
+ _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); /* to flush state on close */
+ return info;
+}
+
+
+static MARIA_HA *get_MARIA_HA_from_UNDO_record(const
+ TRANSLOG_HEADER_BUFFER *rec)
+{
+ uint16 sid;
+ MARIA_HA *info;
+
+ sid= fileid_korr(rec->header + LSN_STORE_SIZE);
+ tprint(tracef, " For table of short id %u", sid);
+ info= all_tables[sid].info;
+ if (info == NULL)
+ {
+ tprint(tracef, ", table skipped, so skipping record\n");
+ return NULL;
+ }
+ tprint(tracef, ", '%s'", info->s->open_file_name);
+ if (cmp_translog_addr(rec->lsn, info->s->lsn_of_file_id) <= 0)
+ {
+ tprint(tracef, ", table's LOGREC_FILE_ID has LSN (%lu,0x%lx) more recent"
+ " than record, skipping record",
+ LSN_IN_PARTS(info->s->lsn_of_file_id));
+ return NULL;
+ }
+ DBUG_ASSERT(info->s->last_version != 0);
+ _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); /* to flush state on close */
+ tprint(tracef, ", applying record\n");
+ return info;
+}
+
+
+/**
+ @brief Parses checkpoint record.
+
+ Builds from it the dirty_pages list (a hash), opens tables and maps them to
+ their 2-byte IDs, recreates transactions (not real TRNs though).
+
+ @return From where in the log the REDO phase should start
+ @retval LSN_IMPOSSIBLE error
+ @retval other ok
+*/
+
+static LSN parse_checkpoint_record(LSN lsn)
+{
+ uint i;
+ TRANSLOG_HEADER_BUFFER rec;
+
+ tprint(tracef, "Loading data from checkpoint record at LSN (%lu,0x%lx)\n",
+ LSN_IN_PARTS(lsn));
+ int len= translog_read_record_header(lsn, &rec);
+
+ if (len == RECHEADER_READ_ERROR)
+ {
+ tprint(tracef, "Cannot find checkpoint record where it should be\n");
+ return LSN_IMPOSSIBLE;
+ }
+
+ enlarge_buffer(&rec);
+ if (log_record_buffer.str == NULL ||
+ translog_read_record(rec.lsn, 0, rec.record_length,
+ log_record_buffer.str, NULL) !=
+ rec.record_length)
+ {
+ tprint(tracef, "Failed to read record\n");
+ return LSN_IMPOSSIBLE;
+ }
+
+ char *ptr= log_record_buffer.str;
+ checkpoint_start= lsn_korr(ptr);
+ ptr+= LSN_STORE_SIZE;
+
+ /* transactions */
+ uint nb_active_transactions= uint2korr(ptr);
+ ptr+= 2;
+ tprint(tracef, "%u active transactions\n", nb_active_transactions);
+ LSN minimum_rec_lsn_of_active_transactions= lsn_korr(ptr);
+ ptr+= LSN_STORE_SIZE;
+
+ /*
+ how much brain juice and discussions there was to come to writing this
+ line
+ */
+ set_if_smaller(checkpoint_start, minimum_rec_lsn_of_active_transactions);
+
+ for (i= 0; i < nb_active_transactions; i++)
+ {
+ uint16 sid= uint2korr(ptr);
+ ptr+= 2;
+ TrID long_id= uint6korr(ptr);
+ ptr+= 6;
+ DBUG_ASSERT(sid > 0 && long_id > 0);
+ LSN undo_lsn= lsn_korr(ptr);
+ ptr+= LSN_STORE_SIZE;
+ LSN first_undo_lsn= lsn_korr(ptr);
+ ptr+= LSN_STORE_SIZE;
+ new_transaction(sid, long_id, undo_lsn, first_undo_lsn);
+ }
+ uint nb_committed_transactions= uint4korr(ptr);
+ ptr+= 4;
+ tprint(tracef, "%lu committed transactions\n",
+ (ulong)nb_committed_transactions);
+ /* no purging => committed transactions are not important */
+ ptr+= (6 + LSN_STORE_SIZE) * nb_committed_transactions;
+
+ /* tables */
+ uint nb_tables= uint4korr(ptr);
+ ptr+= 4;
+ tprint(tracef, "%u open tables\n", nb_tables);
+ for (i= 0; i< nb_tables; i++)
+ {
+ char name[FN_REFLEN];
+ uint16 sid= uint2korr(ptr);
+ ptr+= 2;
+ DBUG_ASSERT(sid > 0);
+ File kfile= uint4korr(ptr);
+ ptr+= 4;
+ File dfile= uint4korr(ptr);
+ ptr+= 4;
+ LSN first_log_write_lsn= lsn_korr(ptr);
+ ptr+= LSN_STORE_SIZE;
+ uint name_len= strlen(ptr) + 1;
+ ptr+= name_len;
+ strnmov(name, ptr, sizeof(name));
+ if (new_table(sid, name, kfile, dfile, first_log_write_lsn))
+ return LSN_IMPOSSIBLE;
+ }
+
+ /* dirty pages */
+ uint nb_dirty_pages= uint4korr(ptr);
+ ptr+= 4;
+ tprint(tracef, "%u dirty pages\n", nb_dirty_pages);
+ if (hash_init(&all_dirty_pages, &my_charset_bin, nb_dirty_pages,
+ offsetof(struct st_dirty_page, file_and_page_id),
+ sizeof(((struct st_dirty_page *)NULL)->file_and_page_id),
+ NULL, NULL, 0))
+ return LSN_IMPOSSIBLE;
+ dirty_pages_pool=
+ (struct st_dirty_page *)my_malloc(nb_dirty_pages *
+ sizeof(struct st_dirty_page),
+ MYF(MY_WME));
+ if (unlikely(dirty_pages_pool == NULL))
+ return LSN_IMPOSSIBLE;
+ struct st_dirty_page *next_dirty_page_in_pool= dirty_pages_pool;
+ LSN minimum_rec_lsn_of_dirty_pages= LSN_MAX;
+ for (i= 0; i < nb_dirty_pages ; i++)
+ {
+ File fileid= uint4korr(ptr);
+ ptr+= 4;
+ pgcache_page_no_t pageid= uint4korr(ptr);
+ ptr+= 4;
+ LSN rec_lsn= lsn_korr(ptr);
+ ptr+= LSN_STORE_SIZE;
+ if (new_page(fileid, pageid, rec_lsn, next_dirty_page_in_pool++))
+ return LSN_IMPOSSIBLE;
+ set_if_smaller(minimum_rec_lsn_of_dirty_pages, rec_lsn);
+ }
+ /* after that, there will be no insert/delete into the hash */
+ /*
+ sanity check on record (did we screw up with all those "ptr+=", did the
+ checkpoint write code and checkpoint read code go out of sync?).
+ */
+ if (ptr != (log_record_buffer.str + log_record_buffer.length))
+ {
+ tprint(tracef, "checkpoint record corrupted\n");
+ return LSN_IMPOSSIBLE;
+ }
+ set_if_smaller(checkpoint_start, minimum_rec_lsn_of_dirty_pages);
+
+ return checkpoint_start;
+}
+
+static int new_page(File fileid, pgcache_page_no_t pageid, LSN rec_lsn,
+ struct st_dirty_page *dirty_page)
+{
+ /* serves as hash key */
+ dirty_page->file_and_page_id= (((uint64)fileid) << 32) | pageid;
+ dirty_page->rec_lsn= rec_lsn;
+ return my_hash_insert(&all_dirty_pages, (uchar *)dirty_page);
+}
+
+
+static int close_all_tables(void)
+{
+ int error= 0;
+ LIST *list_element, *next_open;
+ MARIA_HA *info;
+ pthread_mutex_lock(&THR_LOCK_maria);
+ if (maria_open_list == NULL)
+ goto end;
+ tprint(tracef, "Closing all tables\n");
+ if (tracef != stdout && redo_phase_message_printed)
+ {
+ /** @todo RECOVERY BUG all prints to stderr should go to error log */
+ fprintf(stderr, "; flushing tables");
+ }
+
+ /*
+ Since the end of end_of_redo_phase(), we may have written new records
+ (if UNDO phase ran) and thus the state is newer than at
+ end_of_redo_phase(), we need to bump is_of_horizon again.
+ */
+ TRANSLOG_ADDRESS addr= translog_get_horizon();
+ for (list_element= maria_open_list ; list_element ; list_element= next_open)
+ {
+ next_open= list_element->next;
+ info= (MARIA_HA*)list_element->data;
+ pthread_mutex_unlock(&THR_LOCK_maria); /* ok, UNDO phase not online yet */
+ prepare_table_for_close(info, addr);
+ error|= maria_close(info);
+ pthread_mutex_lock(&THR_LOCK_maria);
+ }
+end:
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ return error;
+}
+
+static void print_redo_phase_progress(TRANSLOG_ADDRESS addr)
+{
+ static int end_logno= FILENO_IMPOSSIBLE, end_offset, percentage_printed= 0;
+ static ulonglong initial_remainder= -1;
+ if (tracef == stdout)
+ return;
+ if (!redo_phase_message_printed)
+ {
+ /** @todo RECOVERY BUG all prints to stderr should go to error log */
+ fprintf(stderr, "Maria engine: starting recovery; recovered pages: 0%%");
+ redo_phase_message_printed= TRUE;
+ }
+ if (end_logno == FILENO_IMPOSSIBLE)
+ {
+ LSN end_addr= translog_get_horizon();
+ end_logno= LSN_FILE_NO(end_addr);
+ end_offset= LSN_OFFSET(end_addr);
+ }
+ int cur_logno= LSN_FILE_NO(addr);
+ int cur_offset= LSN_OFFSET(addr);
+ ulonglong remainder;
+ remainder= (cur_logno == end_logno) ? (end_offset - cur_offset) :
+ (TRANSLOG_FILE_SIZE - cur_offset +
+ max(end_logno - cur_logno - 1, 0) * TRANSLOG_FILE_SIZE + end_offset);
+ if (initial_remainder == (ulonglong)(-1))
+ initial_remainder= remainder;
+ int percentage_done=
+ (initial_remainder - remainder) * ULL(100) / initial_remainder;
+ if ((percentage_done - percentage_printed) >= 10)
+ {
+ percentage_printed= percentage_done;
+ fprintf(stderr, " %d%%", percentage_done);
+ }
+}
+
+#ifdef MARIA_EXTERNAL_LOCKING
+#error Maria's Checkpoint and Recovery are really not ready for it
+#endif
+
+/*
+Recovery of the state : how it works
+=====================================
+
+Here we ignore Checkpoints for a start.
+
+The state (MARIA_HA::MARIA_SHARE::MARIA_STATE_INFO) is updated in
+memory frequently (at least at every row write/update/delete) but goes
+to disk at few moments: maria_close() when closing the last open
+instance, and a few rare places like CHECK/REPAIR/ALTER
+(non-transactional tables also do it at maria_lock_database() but we
+needn't cover them here).
+
+In case of crash, state on disk is likely to be older than what it was
+in memory, the REDO phase needs to recreate the state as it was in
+memory at the time of crash. When we say Recovery here we will always
+mean "REDO phase".
+
+For example MARIA_STATUS_INFO::records (count of records). It is updated at
+the end of every row write/update/delete/delete_all. When Recovery sees the
+sign of such row operation (UNDO or REDO), it may need to update the records'
+count if that count does not reflect that operation (is older). How to know
+the age of the state compared to the log record: every time the state
+goes to disk at runtime, its member "is_of_horizon" is updated to the
+current end-of-log horizon. So Recovery just needs to compare is_of_horizon
+and the record's LSN to know if it should modify "records".
+
+Other operations like ALTER TABLE DISABLE KEYS update the state but
+don't write log records, thus the REDO phase cannot repeat their
+effect on the state in case of crash. But we make them sync the state
+as soon as they have finished. This reduces the window for a problem.
+
+It looks like only one thread at a time updates the state in memory or
+on disk. However there is not 100% certainty when it comes to
+HA_EXTRA_(FORCE_REOPEN|PREPARE_FOR_RENAME): can they read the state
+from memory while some other thread is updating "records" in memory?
+If yes, they may write a corrupted state to disk.
+We assume that no for now: ASK_MONTY.
+
+With checkpoints
+================
+
+Checkpoint module needs to read the state in memory and write it to
+disk. This may happen while some other thread is modifying the state
+in memory or on disk. Checkpoint thus may be reading changing data, it
+needs a mutex to not have it corrupted, and concurrent modifiers of
+the state need that mutex too for the same reason.
+"records" is modified for every row write/update/delete, we don't want
+to add a mutex lock/unlock there. So we re-use the mutex lock/unlock
+which is already present in these moments, namely the log's mutex which is
+taken when UNDO_ROW_INSERT|UPDATE|DELETE is written: we update "records" in
+under-log-mutex hooks when writing these records (thus "records" is
+not updated at the end of maria_write/update/delete() anymore).
+Thus Checkpoint takes the log's lock and can read "records" from
+memory an write it to disk and release log's lock.
+We however want to avoid having the disk write under the log's
+lock. So it has to be under another mutex, natural choice is
+intern_lock (as Checkpoint needs it anyway to read MARIA_SHARE::kfile,
+and as maria_close() takes it too). All state writes to disk are
+changed to be protected with intern_lock.
+So Checkpoint takes intern_lock, log's lock, reads "records" from
+memory, releases log's lock, updates is_of_horizon and writes "records" to
+disk, release intern_lock.
+In practice, not only "records" needs to be written but the full
+state. So, Checkpoint reads the full state from memory. Some other
+thread may at this moment be modifying in memory some pieces of the
+state which are not protected by the lock's log (see ma_extra.c
+HA_EXTRA_NO_KEYS), and Checkpoint would be reading a corrupted state
+from memory; to guard against that we extend the intern_lock-zone to
+changes done to the state in memory by HA_EXTRA_NO_KEYS et al, and
+also any change made in memory to create_rename_lsn/state_is_of_horizon.
+Last, we don't want in Checkpoint to do
+ log lock; read state from memory; release log lock;
+for each table, it may hold the log's lock too much in total.
+So, we instead do
+ log lock; read N states from memory; release log lock;
+Thus, the sequence above happens outside of any intern_lock.
+But this re-introduces the problem that some other thread may be changing the
+state in memory and on disk under intern_lock, without log's lock, like
+HA_EXTRA_NO_KEYS, while we read the N states. However, when Checkpoint later
+comes to handling the table under intern_lock, which is serialized with
+HA_EXTRA_NO_KEYS, it can see that is_of_horizon is higher then when the state
+was read from memory under log's lock, and thus can decide to not flush the
+obsolete state it has, knowing that the other thread flushed a more recent
+state already. If on the other hand is_of_horizon is not higher, the read
+state is current and can be flushed. So we have a per-table sequence:
+ lock intern_lock; test if is_of_horizon is higher than when we read the state
+ under log's lock; if no then flush the read state to disk.
+*/
+
+/* some comments and pseudo-code which we keep for later */
+#if 0
+ /*
+ MikaelR suggests: support checkpoints during REDO phase too: do checkpoint
+ after a certain amount of log records have been executed. This helps
+ against repeated crashes. Those checkpoints could not be user-requested
+ (as engine is not communicating during the REDO phase), so they would be
+ automatic: this changes the original assumption that we don't write to the
+ log while in the REDO phase, but why not. How often should we checkpoint?
+ */
+
+ /*
+ We want to have two steps:
+ engine->recover_with_max_memory();
+ next_engine->recover_with_max_memory();
+ engine->init_with_normal_memory();
+ next_engine->init_with_normal_memory();
+ So: in recover_with_max_memory() allocate a giant page cache, do REDO
+ phase, then all page cache is flushed and emptied and freed (only retain
+ small structures like TM): take full checkpoint, which is useful if
+ next engine crashes in its recovery the next second.
+ Destroy all shares (maria_close()), then at init_with_normal_memory() we
+ do this:
+ */
+
+ /**** UNDO PHASE *****/
+
+ /*
+ Launch one or more threads to do the background rollback. Don't wait for
+ them to complete their rollback (background rollback; for debugging, we
+ can have an option which waits). Set a counter (total_of_rollback_threads)
+ to the number of threads to lauch.
+
+ Note that InnoDB's rollback-in-background works as long as InnoDB is the
+ last engine to recover, otherwise MySQL will refuse new connections until
+ the last engine has recovered so it's not "background" from the user's
+ point of view. InnoDB is near top of sys_table_types so all others
+ (e.g. BDB) recover after it... So it's really "online rollback" only if
+ InnoDB is the only engine.
+ */
+
+ /* wake up delete/update handler */
+ /* tell the TM that it can now accept new transactions */
+
+ /*
+ mark that checkpoint requests are now allowed.
+ */
+#endif
diff --git a/storage/maria/ma_recovery.h b/storage/maria/ma_recovery.h
new file mode 100644
index 00000000000..e3864d6022b
--- /dev/null
+++ b/storage/maria/ma_recovery.h
@@ -0,0 +1,30 @@
+/* Copyright (C) 2006,2007 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ WL#3072 Maria recovery
+ First version written by Guilhem Bichot on 2006-04-27.
+ Does not compile yet.
+*/
+
+/* This is the interface of this module. */
+
+/* Performs recovery of the engine at start */
+
+C_MODE_START
+int maria_recover(void);
+int maria_apply_log(LSN lsn, my_bool apply, FILE *trace_file,
+ my_bool execute_undo_phase, my_bool skip_DDLs);
+C_MODE_END
diff --git a/storage/maria/ma_rename.c b/storage/maria/ma_rename.c
new file mode 100644
index 00000000000..44cd60711da
--- /dev/null
+++ b/storage/maria/ma_rename.c
@@ -0,0 +1,139 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Rename a table
+*/
+
+#include "ma_fulltext.h"
+#include "trnman_public.h"
+
+/**
+ @brief renames a table
+
+ @param old_name current name of table
+ @param new_name table should be renamed to this name
+
+ @return Operation status
+ @retval 0 OK
+ @retval !=0 Error
+*/
+
+int maria_rename(const char *old_name, const char *new_name)
+{
+ char from[FN_REFLEN],to[FN_REFLEN];
+ int data_file_rename_error;
+#ifdef USE_RAID
+ uint raid_type=0,raid_chunks=0;
+#endif
+ MARIA_HA *info;
+ MARIA_SHARE *share;
+ myf sync_dir;
+ DBUG_ENTER("maria_rename");
+
+#ifdef EXTRA_DEBUG
+ _ma_check_table_is_closed(old_name,"rename old_table");
+ _ma_check_table_is_closed(new_name,"rename new table2");
+#endif
+ /** @todo LOCK take X-lock on table */
+ if (!(info= maria_open(old_name, O_RDWR, HA_OPEN_FOR_REPAIR)))
+ DBUG_RETURN(my_errno);
+ share= info->s;
+#ifdef USE_RAID
+ raid_type = share->base.raid_type;
+ raid_chunks = share->base.raid_chunks;
+#endif
+
+ /*
+ the renaming of an internal table to the final table (like in ALTER TABLE)
+ is the moment when this table receives its correct create_rename_lsn and
+ this is important; make sure transactionality has been re-enabled.
+ */
+ DBUG_ASSERT(share->now_transactional == share->base.born_transactional);
+ sync_dir= (share->now_transactional && !share->temporary &&
+ !maria_in_recovery) ? MY_SYNC_DIR : 0;
+ if (sync_dir)
+ {
+ LSN lsn;
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+ uint old_name_len= strlen(old_name)+1, new_name_len= strlen(new_name)+1;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char *)old_name;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= old_name_len;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (char *)new_name;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= new_name_len;
+ /*
+ For this record to be of any use for Recovery, we need the upper
+ MySQL layer to be crash-safe, which it is not now (that would require
+ work using the ddl_log of sql/sql_table.cc); when it is, we should
+ reconsider the moment of writing this log record (before or after op,
+ under THR_LOCK_maria or not...), how to use it in Recovery.
+ For now it can serve to apply logs to a backup so we sync it.
+ */
+ if (unlikely(translog_write_record(&lsn, LOGREC_REDO_RENAME_TABLE,
+ &dummy_transaction_object, NULL,
+ old_name_len + new_name_len,
+ sizeof(log_array)/sizeof(log_array[0]),
+ log_array, NULL) ||
+ translog_flush(lsn)))
+ {
+ maria_close(info);
+ DBUG_RETURN(1);
+ }
+ /*
+ store LSN into file, needed for Recovery to not be confused if a
+ RENAME happened (applying REDOs to the wrong table).
+ */
+ if (_ma_update_create_rename_lsn(share, lsn, TRUE))
+ {
+ maria_close(info);
+ DBUG_RETURN(1);
+ }
+ }
+
+ maria_close(info);
+#ifdef USE_RAID
+#ifdef EXTRA_DEBUG
+ _ma_check_table_is_closed(old_name,"rename raidcheck");
+#endif
+#endif /* USE_RAID */
+
+ fn_format(from,old_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
+ fn_format(to,new_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
+ if (my_rename_with_symlink(from, to, MYF(MY_WME | sync_dir)))
+ DBUG_RETURN(my_errno);
+ fn_format(from,old_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
+ fn_format(to,new_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
+#ifdef USE_RAID
+ if (raid_type)
+ data_file_rename_error= my_raid_rename(from, to, raid_chunks,
+ MYF(MY_WME | sync_dir));
+ else
+#endif
+ data_file_rename_error=
+ my_rename_with_symlink(from, to, MYF(MY_WME | sync_dir));
+ if (data_file_rename_error)
+ {
+ /*
+ now we have a renamed index file and a non-renamed data file, try to
+ undo the rename of the index file.
+ */
+ data_file_rename_error= my_errno;
+ fn_format(from, old_name, "", MARIA_NAME_IEXT, MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT));
+ fn_format(to, new_name, "", MARIA_NAME_IEXT, MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT));
+ my_rename_with_symlink(to, from, MYF(MY_WME | sync_dir));
+ }
+ DBUG_RETURN(data_file_rename_error);
+
+}
diff --git a/storage/maria/ma_rfirst.c b/storage/maria/ma_rfirst.c
new file mode 100644
index 00000000000..226aaa551f0
--- /dev/null
+++ b/storage/maria/ma_rfirst.c
@@ -0,0 +1,26 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+
+ /* Read first row through a specfic key */
+
+int maria_rfirst(MARIA_HA *info, uchar *buf, int inx)
+{
+ DBUG_ENTER("maria_rfirst");
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ info->update|= HA_STATE_PREV_FOUND;
+ DBUG_RETURN(maria_rnext(info,buf,inx));
+} /* maria_rfirst */
diff --git a/storage/maria/ma_rkey.c b/storage/maria/ma_rkey.c
new file mode 100644
index 00000000000..c9653d30110
--- /dev/null
+++ b/storage/maria/ma_rkey.c
@@ -0,0 +1,178 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Read record based on a key */
+
+#include "maria_def.h"
+#include "ma_rt_index.h"
+
+ /* Read a record using key */
+ /* Ordinary search_flag is 0 ; Give error if no record with key */
+
+int maria_rkey(MARIA_HA *info, uchar *buf, int inx, const uchar *key,
+ key_part_map keypart_map, enum ha_rkey_function search_flag)
+{
+ uchar *key_buff;
+ MARIA_SHARE *share=info->s;
+ MARIA_KEYDEF *keyinfo;
+ HA_KEYSEG *last_used_keyseg;
+ uint pack_key_length, use_key_length, nextflag;
+ DBUG_ENTER("maria_rkey");
+ DBUG_PRINT("enter", ("base: 0x%lx buf: 0x%lx inx: %d search_flag: %d",
+ (long) info, (long) buf, inx, search_flag));
+
+ if ((inx = _ma_check_index(info,inx)) < 0)
+ DBUG_RETURN(my_errno);
+
+ info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+ info->last_key_func= search_flag;
+ keyinfo= share->keyinfo + inx;
+
+ if (info->once_flags & USE_PACKED_KEYS)
+ {
+ info->once_flags&= ~USE_PACKED_KEYS; /* Reset flag */
+ /*
+ key is already packed!; This happens when we are using a MERGE TABLE
+ */
+ key_buff= info->lastkey+info->s->base.max_key_length;
+ pack_key_length= keypart_map;
+ bmove(key_buff, key, pack_key_length);
+ last_used_keyseg= info->s->keyinfo[inx].seg + info->last_used_keyseg;
+ }
+ else
+ {
+ DBUG_ASSERT(keypart_map);
+ /* Save the packed key for later use in the second buffer of lastkey. */
+ key_buff=info->lastkey+info->s->base.max_key_length;
+ pack_key_length= _ma_pack_key(info,(uint) inx, key_buff, key,
+ keypart_map, &last_used_keyseg);
+ /* Save packed_key_length for use by the MERGE engine. */
+ info->pack_key_length= pack_key_length;
+ info->last_used_keyseg= (uint16) (last_used_keyseg -
+ info->s->keyinfo[inx].seg);
+ DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, keyinfo->seg,
+ key_buff, pack_key_length););
+ }
+
+ if (fast_ma_readinfo(info))
+ goto err;
+ if (share->concurrent_insert)
+ rw_rdlock(&share->key_root_lock[inx]);
+
+ nextflag=maria_read_vec[search_flag];
+ use_key_length=pack_key_length;
+ if (!(nextflag & (SEARCH_FIND | SEARCH_NO_FIND | SEARCH_LAST)))
+ use_key_length=USE_WHOLE_KEY;
+
+ switch (info->s->keyinfo[inx].key_alg) {
+#ifdef HAVE_RTREE_KEYS
+ case HA_KEY_ALG_RTREE:
+ if (maria_rtree_find_first(info,inx,key_buff,use_key_length,nextflag) < 0)
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ my_errno= HA_ERR_CRASHED;
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ }
+ break;
+#endif
+ case HA_KEY_ALG_BTREE:
+ default:
+ if (!_ma_search(info, keyinfo, key_buff, use_key_length,
+ maria_read_vec[search_flag],
+ info->s->state.key_root[inx]) &&
+ share->concurrent_insert)
+ {
+ /*
+ If we searching for a partial key (or using >, >=, < or <=) and
+ the data is outside of the data file, we need to continue searching
+ for the first key inside the data file
+ */
+ if (info->cur_row.lastpos >= info->state->data_file_length &&
+ (search_flag != HA_READ_KEY_EXACT ||
+ last_used_keyseg != keyinfo->seg + keyinfo->keysegs))
+ {
+ do
+ {
+ uint not_used[2];
+ /*
+ Skip rows that are inserted by other threads since we got a lock
+ Note that this can only happen if we are not searching after an
+ full length exact key, because the keys are sorted
+ according to position
+ */
+ if (_ma_search_next(info, keyinfo, info->lastkey,
+ info->lastkey_length,
+ maria_readnext_vec[search_flag],
+ info->s->state.key_root[inx]))
+ break;
+ /*
+ Check that the found key does still match the search.
+ _ma_search_next() delivers the next key regardless of its
+ value.
+ */
+ if (search_flag == HA_READ_KEY_EXACT &&
+ ha_key_cmp(keyinfo->seg, (uchar*) key_buff,
+ (uchar*) info->lastkey, use_key_length,
+ SEARCH_FIND, not_used))
+ {
+ my_errno= HA_ERR_KEY_NOT_FOUND;
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ break;
+ }
+ } while (info->cur_row.lastpos >= info->state->data_file_length);
+ }
+ }
+ }
+ if (share->concurrent_insert)
+ rw_unlock(&share->key_root_lock[inx]);
+
+ if (info->cur_row.lastpos == HA_OFFSET_ERROR)
+ {
+ fast_ma_writeinfo(info);
+ goto err;
+ }
+
+ /* Calculate length of the found key; Used by maria_rnext_same */
+ if ((keyinfo->flag & HA_VAR_LENGTH_KEY) && last_used_keyseg)
+ info->last_rkey_length= _ma_keylength_part(keyinfo, info->lastkey,
+ last_used_keyseg);
+ else
+ info->last_rkey_length= pack_key_length;
+
+ /* Check if we don't want to have record back, only error message */
+ if (!buf)
+ {
+ fast_ma_writeinfo(info);
+ DBUG_RETURN(0);
+ }
+ if (!(*info->read_record)(info, buf, info->cur_row.lastpos))
+ {
+ info->update|= HA_STATE_AKTIV; /* Record is read */
+ DBUG_RETURN(0);
+ }
+
+ info->cur_row.lastpos= HA_OFFSET_ERROR; /* Didn't find row */
+
+err:
+ /* Store last used key as a base for read next */
+ memcpy(info->lastkey,key_buff,pack_key_length);
+ info->last_rkey_length= pack_key_length;
+ bzero((char*) info->lastkey+pack_key_length,info->s->base.rec_reflength);
+ info->lastkey_length=pack_key_length+info->s->base.rec_reflength;
+
+ if (search_flag == HA_READ_AFTER_KEY)
+ info->update|=HA_STATE_NEXT_FOUND; /* Previous gives last row */
+ DBUG_RETURN(my_errno);
+} /* _ma_rkey */
diff --git a/storage/maria/ma_rlast.c b/storage/maria/ma_rlast.c
new file mode 100644
index 00000000000..a9a470d37d9
--- /dev/null
+++ b/storage/maria/ma_rlast.c
@@ -0,0 +1,26 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+
+ /* Read last row with the same key as the previous read. */
+
+int maria_rlast(MARIA_HA *info, uchar *buf, int inx)
+{
+ DBUG_ENTER("maria_rlast");
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ info->update|= HA_STATE_NEXT_FOUND;
+ DBUG_RETURN(maria_rprev(info,buf,inx));
+} /* maria_rlast */
diff --git a/storage/maria/ma_rnext.c b/storage/maria/ma_rnext.c
new file mode 100644
index 00000000000..fcc0f1f6a90
--- /dev/null
+++ b/storage/maria/ma_rnext.c
@@ -0,0 +1,122 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+
+#include "ma_rt_index.h"
+
+ /*
+ Read next row with the same key as previous read
+ One may have done a write, update or delete of the previous row.
+ NOTE! Even if one changes the previous row, the next read is done
+ based on the position of the last used key!
+ */
+
+int maria_rnext(MARIA_HA *info, uchar *buf, int inx)
+{
+ int error,changed;
+ uint flag;
+ DBUG_ENTER("maria_rnext");
+
+ if ((inx = _ma_check_index(info,inx)) < 0)
+ DBUG_RETURN(my_errno);
+ flag=SEARCH_BIGGER; /* Read next */
+ if (info->cur_row.lastpos == HA_OFFSET_ERROR &&
+ info->update & HA_STATE_PREV_FOUND)
+ flag=0; /* Read first */
+
+ if (fast_ma_readinfo(info))
+ DBUG_RETURN(my_errno);
+ if (info->s->concurrent_insert)
+ rw_rdlock(&info->s->key_root_lock[inx]);
+ changed= _ma_test_if_changed(info);
+ if (!flag)
+ {
+ switch(info->s->keyinfo[inx].key_alg){
+#ifdef HAVE_RTREE_KEYS
+ case HA_KEY_ALG_RTREE:
+ error=maria_rtree_get_first(info,inx,info->lastkey_length);
+ break;
+#endif
+ case HA_KEY_ALG_BTREE:
+ default:
+ error= _ma_search_first(info,info->s->keyinfo+inx,
+ info->s->state.key_root[inx]);
+ break;
+ }
+ }
+ else
+ {
+ switch (info->s->keyinfo[inx].key_alg) {
+#ifdef HAVE_RTREE_KEYS
+ case HA_KEY_ALG_RTREE:
+ /*
+ Note that rtree doesn't support that the table
+ may be changed since last call, so we do need
+ to skip rows inserted by other threads like in btree
+ */
+ error= maria_rtree_get_next(info,inx,info->lastkey_length);
+ break;
+#endif
+ case HA_KEY_ALG_BTREE:
+ default:
+ if (!changed)
+ error= _ma_search_next(info,info->s->keyinfo+inx,info->lastkey,
+ info->lastkey_length,flag,
+ info->s->state.key_root[inx]);
+ else
+ error= _ma_search(info,info->s->keyinfo+inx,info->lastkey,
+ USE_WHOLE_KEY,flag, info->s->state.key_root[inx]);
+ }
+ }
+
+ if (info->s->concurrent_insert)
+ {
+ if (!error)
+ {
+ while (info->cur_row.lastpos >= info->state->data_file_length)
+ {
+ /* Skip rows inserted by other threads since we got a lock */
+ if ((error= _ma_search_next(info,info->s->keyinfo+inx,
+ info->lastkey,
+ info->lastkey_length,
+ SEARCH_BIGGER,
+ info->s->state.key_root[inx])))
+ break;
+ }
+ }
+ rw_unlock(&info->s->key_root_lock[inx]);
+ }
+ /* Don't clear if database-changed */
+ info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+ info->update|= HA_STATE_NEXT_FOUND;
+
+ if (error)
+ {
+ if (my_errno == HA_ERR_KEY_NOT_FOUND)
+ my_errno=HA_ERR_END_OF_FILE;
+ }
+ else if (!buf)
+ {
+ DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR ? my_errno : 0);
+ }
+ else if (!(*info->read_record)(info, buf, info->cur_row.lastpos))
+ {
+ info->update|= HA_STATE_AKTIV; /* Record is read */
+ DBUG_RETURN(0);
+ }
+ DBUG_PRINT("error",("Got error: %d, errno: %d",error, my_errno));
+ DBUG_RETURN(my_errno);
+} /* maria_rnext */
diff --git a/storage/maria/ma_rnext_same.c b/storage/maria/ma_rnext_same.c
new file mode 100644
index 00000000000..6782cf5b8cf
--- /dev/null
+++ b/storage/maria/ma_rnext_same.c
@@ -0,0 +1,107 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+#include "ma_rt_index.h"
+
+/*
+ Read next row with the same key as previous read, but abort if
+ the key changes.
+ One may have done a write, update or delete of the previous row.
+
+ NOTE! Even if one changes the previous row, the next read is done
+ based on the position of the last used key!
+*/
+
+int maria_rnext_same(MARIA_HA *info, uchar *buf)
+{
+ int error;
+ uint inx,not_used[2];
+ MARIA_KEYDEF *keyinfo;
+ DBUG_ENTER("maria_rnext_same");
+
+ if ((int) (inx= info->lastinx) < 0 ||
+ info->cur_row.lastpos == HA_OFFSET_ERROR)
+ DBUG_RETURN(my_errno=HA_ERR_WRONG_INDEX);
+ keyinfo= info->s->keyinfo+inx;
+ if (fast_ma_readinfo(info))
+ DBUG_RETURN(my_errno);
+
+ if (info->s->concurrent_insert)
+ rw_rdlock(&info->s->key_root_lock[inx]);
+
+ switch (keyinfo->key_alg)
+ {
+#ifdef HAVE_RTREE_KEYS
+ case HA_KEY_ALG_RTREE:
+ if ((error=maria_rtree_find_next(info,inx,
+ maria_read_vec[info->last_key_func])))
+ {
+ error=1;
+ my_errno=HA_ERR_END_OF_FILE;
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ break;
+ }
+ break;
+#endif
+ case HA_KEY_ALG_BTREE:
+ default:
+ if (!(info->update & HA_STATE_RNEXT_SAME))
+ {
+ /* First rnext_same; Store old key */
+ memcpy(info->lastkey2,info->lastkey,info->last_rkey_length);
+ }
+ for (;;)
+ {
+ if ((error= _ma_search_next(info,keyinfo,info->lastkey,
+ info->lastkey_length,SEARCH_BIGGER,
+ info->s->state.key_root[inx])))
+ break;
+ if (ha_key_cmp(keyinfo->seg, (uchar*) info->lastkey,
+ (uchar*) info->lastkey2,
+ info->last_rkey_length, SEARCH_FIND, not_used))
+ {
+ error=1;
+ my_errno=HA_ERR_END_OF_FILE;
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ break;
+ }
+ /* Skip rows that are inserted by other threads since we got a lock */
+ if (info->cur_row.lastpos < info->state->data_file_length)
+ break;
+ }
+ }
+ if (info->s->concurrent_insert)
+ rw_unlock(&info->s->key_root_lock[inx]);
+ /* Don't clear if database-changed */
+ info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+ info->update|= HA_STATE_NEXT_FOUND | HA_STATE_RNEXT_SAME;
+
+ if (error)
+ {
+ if (my_errno == HA_ERR_KEY_NOT_FOUND)
+ my_errno=HA_ERR_END_OF_FILE;
+ }
+ else if (!buf)
+ {
+ DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR ? my_errno : 0);
+ }
+ else if (!(*info->read_record)(info, buf, info->cur_row.lastpos))
+ {
+ info->update|= HA_STATE_AKTIV; /* Record is read */
+ DBUG_RETURN(0);
+ }
+ DBUG_RETURN(my_errno);
+} /* maria_rnext_same */
diff --git a/storage/maria/ma_rprev.c b/storage/maria/ma_rprev.c
new file mode 100644
index 00000000000..753ff604975
--- /dev/null
+++ b/storage/maria/ma_rprev.c
@@ -0,0 +1,88 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+
+ /*
+ Read previous row with the same key as previous read
+ One may have done a write, update or delete of the previous row.
+ NOTE! Even if one changes the previous row, the next read is done
+ based on the position of the last used key!
+ */
+
+int maria_rprev(MARIA_HA *info, uchar *buf, int inx)
+{
+ int error,changed;
+ register uint flag;
+ MARIA_SHARE *share=info->s;
+ DBUG_ENTER("maria_rprev");
+
+ if ((inx = _ma_check_index(info,inx)) < 0)
+ DBUG_RETURN(my_errno);
+ flag=SEARCH_SMALLER; /* Read previous */
+ if (info->cur_row.lastpos == HA_OFFSET_ERROR &&
+ info->update & HA_STATE_NEXT_FOUND)
+ flag=0; /* Read last */
+
+ if (fast_ma_readinfo(info))
+ DBUG_RETURN(my_errno);
+ changed= _ma_test_if_changed(info);
+ if (share->concurrent_insert)
+ rw_rdlock(&share->key_root_lock[inx]);
+ if (!flag)
+ error= _ma_search_last(info, share->keyinfo+inx,
+ share->state.key_root[inx]);
+ else if (!changed)
+ error= _ma_search_next(info,share->keyinfo+inx,info->lastkey,
+ info->lastkey_length,flag,
+ share->state.key_root[inx]);
+ else
+ error= _ma_search(info,share->keyinfo+inx,info->lastkey,
+ USE_WHOLE_KEY, flag, share->state.key_root[inx]);
+
+ if (share->concurrent_insert)
+ {
+ if (!error)
+ {
+ while (info->cur_row.lastpos >= info->state->data_file_length)
+ {
+ /* Skip rows that are inserted by other threads since we got a lock */
+ if ((error= _ma_search_next(info,share->keyinfo+inx,info->lastkey,
+ info->lastkey_length,
+ SEARCH_SMALLER,
+ share->state.key_root[inx])))
+ break;
+ }
+ }
+ rw_unlock(&share->key_root_lock[inx]);
+ }
+ info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+ info->update|= HA_STATE_PREV_FOUND;
+ if (error)
+ {
+ if (my_errno == HA_ERR_KEY_NOT_FOUND)
+ my_errno=HA_ERR_END_OF_FILE;
+ }
+ else if (!buf)
+ {
+ DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR ? my_errno : 0);
+ }
+ else if (!(*info->read_record)(info, buf, info->cur_row.lastpos))
+ {
+ info->update|= HA_STATE_AKTIV; /* Record is read */
+ DBUG_RETURN(0);
+ }
+ DBUG_RETURN(my_errno);
+} /* maria_rprev */
diff --git a/storage/maria/ma_rrnd.c b/storage/maria/ma_rrnd.c
new file mode 100644
index 00000000000..24c4bfdd467
--- /dev/null
+++ b/storage/maria/ma_rrnd.c
@@ -0,0 +1,44 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Read a record with random-access. The position to the record must
+ get by MARIA_HA. The next record can be read with pos= MARIA_POS_ERROR */
+
+
+#include "maria_def.h"
+
+/*
+ Read a row based on position.
+
+ RETURN
+ 0 Ok.
+ HA_ERR_RECORD_DELETED Record is deleted.
+ HA_ERR_END_OF_FILE EOF.
+*/
+
+int maria_rrnd(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos)
+{
+ DBUG_ENTER("maria_rrnd");
+
+ DBUG_ASSERT(filepos != HA_OFFSET_ERROR);
+
+ /* Init all but update-flag */
+ info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+ if (info->opt_flag & WRITE_CACHE_USED && flush_io_cache(&info->rec_cache))
+ DBUG_RETURN(my_errno);
+
+ info->cur_row.lastpos= filepos; /* Remember for update */
+ DBUG_RETURN((*info->s->read_record)(info, buf, filepos));
+}
diff --git a/storage/maria/ma_rsame.c b/storage/maria/ma_rsame.c
new file mode 100644
index 00000000000..9c9acac013a
--- /dev/null
+++ b/storage/maria/ma_rsame.c
@@ -0,0 +1,69 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+
+/*
+ Find current row with read on position or read on key
+
+ NOTES
+ If inx >= 0 find record using key
+
+ RETURN
+ 0 Ok
+ HA_ERR_KEY_NOT_FOUND Row is deleted
+ HA_ERR_END_OF_FILE End of file
+*/
+
+
+int maria_rsame(MARIA_HA *info, uchar *record, int inx)
+{
+ DBUG_ENTER("maria_rsame");
+
+ if (inx != -1 && ! maria_is_key_active(info->s->state.key_map, inx))
+ {
+ DBUG_RETURN(my_errno=HA_ERR_WRONG_INDEX);
+ }
+ if (info->cur_row.lastpos == HA_OFFSET_ERROR ||
+ info->update & HA_STATE_DELETED)
+ {
+ DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND); /* No current record */
+ }
+ info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+
+ /* Read row from data file */
+ if (fast_ma_readinfo(info))
+ DBUG_RETURN(my_errno);
+
+ if (inx >= 0)
+ {
+ info->lastinx=inx;
+ info->lastkey_length= _ma_make_key(info,(uint) inx,info->lastkey,record,
+ info->cur_row.lastpos);
+ if (info->s->concurrent_insert)
+ rw_rdlock(&info->s->key_root_lock[inx]);
+ VOID(_ma_search(info,info->s->keyinfo+inx,info->lastkey, USE_WHOLE_KEY,
+ SEARCH_SAME,
+ info->s->state.key_root[inx]));
+ if (info->s->concurrent_insert)
+ rw_unlock(&info->s->key_root_lock[inx]);
+ }
+
+ if (!(*info->read_record)(info, record, info->cur_row.lastpos))
+ DBUG_RETURN(0);
+ if (my_errno == HA_ERR_RECORD_DELETED)
+ my_errno=HA_ERR_KEY_NOT_FOUND;
+ DBUG_RETURN(my_errno);
+} /* maria_rsame */
diff --git a/storage/maria/ma_rsamepos.c b/storage/maria/ma_rsamepos.c
new file mode 100644
index 00000000000..186bc80c06d
--- /dev/null
+++ b/storage/maria/ma_rsamepos.c
@@ -0,0 +1,58 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* read record through position and fix key-position */
+/* As maria_rsame but supply a position */
+
+#include "maria_def.h"
+
+
+ /*
+ ** If inx >= 0 update index pointer
+ ** Returns one of the following values:
+ ** 0 = Ok.
+ ** HA_ERR_KEY_NOT_FOUND = Row is deleted
+ ** HA_ERR_END_OF_FILE = End of file
+ */
+
+int maria_rsame_with_pos(MARIA_HA *info, uchar *record, int inx,
+ MARIA_RECORD_POS filepos)
+{
+ DBUG_ENTER("maria_rsame_with_pos");
+ DBUG_PRINT("enter",("index: %d filepos: %ld", inx, (long) filepos));
+
+ if (inx < -1 ||
+ (inx >= 0 && ! maria_is_key_active(info->s->state.key_map, inx)))
+ {
+ DBUG_RETURN(my_errno=HA_ERR_WRONG_INDEX);
+ }
+
+ info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+ if ((*info->s->read_record)(info, record, filepos))
+ {
+ if (my_errno == HA_ERR_RECORD_DELETED)
+ my_errno=HA_ERR_KEY_NOT_FOUND;
+ DBUG_RETURN(my_errno);
+ }
+ info->cur_row.lastpos= filepos;
+ info->lastinx= inx;
+ if (inx >= 0)
+ {
+ info->lastkey_length= _ma_make_key(info,(uint) inx,info->lastkey,record,
+ info->cur_row.lastpos);
+ info->update|=HA_STATE_KEY_CHANGED; /* Don't use indexposition */
+ }
+ DBUG_RETURN(0);
+} /* maria_rsame_pos */
diff --git a/storage/maria/ma_rt_index.c b/storage/maria/ma_rt_index.c
new file mode 100644
index 00000000000..4980233fc11
--- /dev/null
+++ b/storage/maria/ma_rt_index.c
@@ -0,0 +1,1140 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+ & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+
+#ifdef HAVE_RTREE_KEYS
+
+#include "ma_rt_index.h"
+#include "ma_rt_key.h"
+#include "ma_rt_mbr.h"
+
+#define REINSERT_BUFFER_INC 10
+#define PICK_BY_AREA
+/*#define PICK_BY_PERIMETER*/
+
+typedef struct st_page_level
+{
+ uint level;
+ my_off_t offs;
+} stPageLevel;
+
+typedef struct st_page_list
+{
+ ulong n_pages;
+ ulong m_pages;
+ stPageLevel *pages;
+} stPageList;
+
+
+/*
+ Find next key in r-tree according to search_flag recursively
+
+ NOTES
+ Used in maria_rtree_find_first() and maria_rtree_find_next()
+
+ RETURN
+ -1 Error
+ 0 Found
+ 1 Not found
+*/
+
+static int maria_rtree_find_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uint search_flag,
+ uint nod_cmp_flag, my_off_t page, int level)
+{
+ uint nod_flag;
+ int res;
+ uchar *page_buf, *k, *last;
+ int k_len;
+ uint *saved_key = (uint*) (info->maria_rtree_recursion_state) + level;
+
+ if (!(page_buf = (uchar*) my_alloca((uint)keyinfo->block_length)))
+ {
+ my_errno = HA_ERR_OUT_OF_MEM;
+ return -1;
+ }
+ if (!_ma_fetch_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf, 0))
+ goto err1;
+ nod_flag = _ma_test_if_nod(page_buf);
+
+ k_len = keyinfo->keylength - info->s->base.rec_reflength;
+
+ if(info->maria_rtree_recursion_depth >= level)
+ {
+ k= page_buf + *saved_key;
+ }
+ else
+ {
+ k = rt_PAGE_FIRST_KEY(page_buf, nod_flag);
+ }
+ last= rt_PAGE_END(page_buf);
+
+ for (; k < last; k = rt_PAGE_NEXT_KEY(k, k_len, nod_flag))
+ {
+ if (nod_flag)
+ {
+ /* this is an internal node in the tree */
+ if (!(res = maria_rtree_key_cmp(keyinfo->seg,
+ info->first_mbr_key, k,
+ info->last_rkey_length, nod_cmp_flag)))
+ {
+ switch ((res = maria_rtree_find_req(info, keyinfo, search_flag,
+ nod_cmp_flag,
+ _ma_kpos(nod_flag, k),
+ level + 1)))
+ {
+ case 0: /* found - exit from recursion */
+ *saved_key = k - page_buf;
+ goto ok;
+ case 1: /* not found - continue searching */
+ info->maria_rtree_recursion_depth = level;
+ break;
+ default: /* error */
+ case -1:
+ goto err1;
+ }
+ }
+ }
+ else
+ {
+ /* this is a leaf */
+ if (!maria_rtree_key_cmp(keyinfo->seg, info->first_mbr_key,
+ k, info->last_rkey_length, search_flag))
+ {
+ uchar *after_key = (uchar*) rt_PAGE_NEXT_KEY(k, k_len, nod_flag);
+ info->cur_row.lastpos = _ma_dpos(info, 0, after_key);
+ info->lastkey_length = k_len + info->s->base.rec_reflength;
+ memcpy(info->lastkey, k, info->lastkey_length);
+ info->maria_rtree_recursion_depth = level;
+ *saved_key = last - page_buf;
+
+ if (after_key < last)
+ {
+ info->int_keypos = info->buff;
+ info->int_maxpos = info->buff + (last - after_key);
+ memcpy(info->buff, after_key, last - after_key);
+ info->keyread_buff_used = 0;
+ }
+ else
+ {
+ info->keyread_buff_used = 1;
+ }
+
+ res = 0;
+ goto ok;
+ }
+ }
+ }
+ info->cur_row.lastpos = HA_OFFSET_ERROR;
+ my_errno = HA_ERR_KEY_NOT_FOUND;
+ res = 1;
+
+ok:
+ my_afree((uchar*)page_buf);
+ return res;
+
+err1:
+ my_afree((uchar*)page_buf);
+ info->cur_row.lastpos = HA_OFFSET_ERROR;
+ return -1;
+}
+
+
+/*
+ Find first key in r-tree according to search_flag condition
+
+ SYNOPSIS
+ maria_rtree_find_first()
+ info Handler to MARIA file
+ uint keynr Key number to use
+ key Key to search for
+ key_length Length of 'key'
+ search_flag Bitmap of flags how to do the search
+
+ RETURN
+ -1 Error
+ 0 Found
+ 1 Not found
+*/
+
+int maria_rtree_find_first(MARIA_HA *info, uint keynr, uchar *key,
+ uint key_length, uint search_flag)
+{
+ my_off_t root;
+ uint nod_cmp_flag;
+ MARIA_KEYDEF *keyinfo = info->s->keyinfo + keynr;
+
+ if ((root = info->s->state.key_root[keynr]) == HA_OFFSET_ERROR)
+ {
+ my_errno= HA_ERR_END_OF_FILE;
+ return -1;
+ }
+
+ /*
+ Save searched key, include data pointer.
+ The data pointer is required if the search_flag contains MBR_DATA.
+ (minimum bounding rectangle)
+ */
+ memcpy(info->first_mbr_key, key, keyinfo->keylength);
+ info->last_rkey_length = key_length;
+
+ info->maria_rtree_recursion_depth = -1;
+ info->keyread_buff_used = 1;
+
+ nod_cmp_flag= ((search_flag & (MBR_EQUAL | MBR_WITHIN)) ?
+ MBR_WITHIN : MBR_INTERSECT);
+ return maria_rtree_find_req(info, keyinfo, search_flag, nod_cmp_flag, root,
+ 0);
+}
+
+
+/*
+ Find next key in r-tree according to search_flag condition
+
+ SYNOPSIS
+ maria_rtree_find_next()
+ info Handler to MARIA file
+ uint keynr Key number to use
+ search_flag Bitmap of flags how to do the search
+
+ RETURN
+ -1 Error
+ 0 Found
+ 1 Not found
+*/
+
+int maria_rtree_find_next(MARIA_HA *info, uint keynr, uint search_flag)
+{
+ my_off_t root;
+ uint nod_cmp_flag;
+ MARIA_KEYDEF *keyinfo = info->s->keyinfo + keynr;
+
+ if (info->update & HA_STATE_DELETED)
+ return maria_rtree_find_first(info, keynr, info->lastkey,
+ info->lastkey_length,
+ search_flag);
+
+ if (!info->keyread_buff_used)
+ {
+ uchar *key= info->int_keypos;
+
+ while (key < info->int_maxpos)
+ {
+ if (!maria_rtree_key_cmp(keyinfo->seg,
+ info->first_mbr_key, key,
+ info->last_rkey_length, search_flag))
+ {
+ uchar *after_key= key + keyinfo->keylength;
+
+ info->cur_row.lastpos= _ma_dpos(info, 0, after_key);
+ memcpy(info->lastkey, key, info->lastkey_length);
+
+ if (after_key < info->int_maxpos)
+ info->int_keypos= after_key;
+ else
+ info->keyread_buff_used= 1;
+ return 0;
+ }
+ key+= keyinfo->keylength;
+ }
+ }
+ if ((root = info->s->state.key_root[keynr]) == HA_OFFSET_ERROR)
+ {
+ my_errno= HA_ERR_END_OF_FILE;
+ return -1;
+ }
+
+ nod_cmp_flag = ((search_flag & (MBR_EQUAL | MBR_WITHIN)) ?
+ MBR_WITHIN : MBR_INTERSECT);
+ return maria_rtree_find_req(info, keyinfo, search_flag, nod_cmp_flag, root, 0);
+}
+
+
+/*
+ Get next key in r-tree recursively
+
+ NOTES
+ Used in maria_rtree_get_first() and maria_rtree_get_next()
+
+ RETURN
+ -1 Error
+ 0 Found
+ 1 Not found
+*/
+
+static int maria_rtree_get_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uint key_length,
+ my_off_t page, int level)
+{
+ uchar *page_buf, *last, *k;
+ uint nod_flag, k_len;
+ int res;
+ uint *saved_key= (uint*) (info->maria_rtree_recursion_state) + level;
+
+ if (!(page_buf= (uchar*) my_alloca((uint)keyinfo->block_length)))
+ return -1;
+ if (!_ma_fetch_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf, 0))
+ goto err1;
+ nod_flag = _ma_test_if_nod(page_buf);
+
+ k_len = keyinfo->keylength - info->s->base.rec_reflength;
+
+ if(info->maria_rtree_recursion_depth >= level)
+ {
+ k = page_buf + *saved_key;
+ if (!nod_flag)
+ {
+ /* Only leaf pages contain data references. */
+ /* Need to check next key with data reference. */
+ k = rt_PAGE_NEXT_KEY(k, k_len, nod_flag);
+ }
+ }
+ else
+ {
+ k = rt_PAGE_FIRST_KEY(page_buf, nod_flag);
+ }
+ last = rt_PAGE_END(page_buf);
+
+ for (; k < last; k = rt_PAGE_NEXT_KEY(k, k_len, nod_flag))
+ {
+ if (nod_flag)
+ {
+ /* this is an internal node in the tree */
+ switch ((res = maria_rtree_get_req(info, keyinfo, key_length,
+ _ma_kpos(nod_flag, k), level + 1)))
+ {
+ case 0: /* found - exit from recursion */
+ *saved_key = k - page_buf;
+ goto ok;
+ case 1: /* not found - continue searching */
+ info->maria_rtree_recursion_depth = level;
+ break;
+ default:
+ case -1: /* error */
+ goto err1;
+ }
+ }
+ else
+ {
+ /* this is a leaf */
+ uchar *after_key = rt_PAGE_NEXT_KEY(k, k_len, nod_flag);
+ info->cur_row.lastpos = _ma_dpos(info, 0, after_key);
+ info->lastkey_length = k_len + info->s->base.rec_reflength;
+ memcpy(info->lastkey, k, info->lastkey_length);
+
+ info->maria_rtree_recursion_depth = level;
+ *saved_key = k - page_buf;
+
+ if (after_key < last)
+ {
+ info->int_keypos = (uchar*) saved_key;
+ memcpy(info->buff, page_buf, keyinfo->block_length);
+ info->int_maxpos = rt_PAGE_END(info->buff);
+ info->keyread_buff_used = 0;
+ }
+ else
+ {
+ info->keyread_buff_used = 1;
+ }
+
+ res = 0;
+ goto ok;
+ }
+ }
+ info->cur_row.lastpos = HA_OFFSET_ERROR;
+ my_errno = HA_ERR_KEY_NOT_FOUND;
+ res = 1;
+
+ok:
+ my_afree((uchar*)page_buf);
+ return res;
+
+err1:
+ my_afree((uchar*)page_buf);
+ info->cur_row.lastpos = HA_OFFSET_ERROR;
+ return -1;
+}
+
+
+/*
+ Get first key in r-tree
+
+ RETURN
+ -1 Error
+ 0 Found
+ 1 Not found
+*/
+
+int maria_rtree_get_first(MARIA_HA *info, uint keynr, uint key_length)
+{
+ my_off_t root;
+ MARIA_KEYDEF *keyinfo = info->s->keyinfo + keynr;
+
+ if ((root = info->s->state.key_root[keynr]) == HA_OFFSET_ERROR)
+ {
+ my_errno= HA_ERR_END_OF_FILE;
+ return -1;
+ }
+
+ info->maria_rtree_recursion_depth = -1;
+ info->keyread_buff_used = 1;
+
+ return maria_rtree_get_req(info, &keyinfo[keynr], key_length, root, 0);
+}
+
+
+/*
+ Get next key in r-tree
+
+ RETURN
+ -1 Error
+ 0 Found
+ 1 Not found
+*/
+
+int maria_rtree_get_next(MARIA_HA *info, uint keynr, uint key_length)
+{
+ my_off_t root;
+ MARIA_KEYDEF *keyinfo = info->s->keyinfo + keynr;
+
+ if (!info->keyread_buff_used)
+ {
+ uint k_len = keyinfo->keylength - info->s->base.rec_reflength;
+ /* rt_PAGE_NEXT_KEY(info->int_keypos) */
+ uchar *key = info->buff + *(int*)info->int_keypos + k_len +
+ info->s->base.rec_reflength;
+ /* rt_PAGE_NEXT_KEY(key) */
+ uchar *after_key = key + k_len + info->s->base.rec_reflength;
+
+ info->cur_row.lastpos = _ma_dpos(info, 0, after_key);
+ info->lastkey_length = k_len + info->s->base.rec_reflength;
+ memcpy(info->lastkey, key, k_len + info->s->base.rec_reflength);
+
+ *(int*)info->int_keypos = key - info->buff;
+ if (after_key >= info->int_maxpos)
+ {
+ info->keyread_buff_used = 1;
+ }
+
+ return 0;
+ }
+ else
+ {
+ if ((root = info->s->state.key_root[keynr]) == HA_OFFSET_ERROR)
+ {
+ my_errno= HA_ERR_END_OF_FILE;
+ return -1;
+ }
+
+ return maria_rtree_get_req(info, &keyinfo[keynr], key_length, root, 0);
+ }
+}
+
+
+/*
+ Choose non-leaf better key for insertion
+*/
+
+#ifdef PICK_BY_PERIMETER
+static uchar *maria_rtree_pick_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *key,
+ uint key_length, uchar *page_buf,
+ uint nod_flag)
+{
+ double increase;
+ double best_incr = DBL_MAX;
+ double perimeter;
+ double best_perimeter;
+ uchar *best_key;
+ uchar *k = rt_PAGE_FIRST_KEY(page_buf, nod_flag);
+ uchar *last = rt_PAGE_END(page_buf);
+
+ LINT_INIT(best_perimeter);
+ LINT_INIT(best_key);
+
+ for (; k < last; k = rt_PAGE_NEXT_KEY(k, key_length, nod_flag))
+ {
+ if ((increase = maria_rtree_perimeter_increase(keyinfo->seg, k, key, key_length,
+ &perimeter)) == -1)
+ return NULL;
+ if ((increase < best_incr)||
+ (increase == best_incr && perimeter < best_perimeter))
+ {
+ best_key = k;
+ best_perimeter= perimeter;
+ best_incr = increase;
+ }
+ }
+ return best_key;
+}
+
+#endif /*PICK_BY_PERIMETER*/
+
+#ifdef PICK_BY_AREA
+static uchar *maria_rtree_pick_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *key,
+ uint key_length, uchar *page_buf,
+ uint nod_flag)
+{
+ double increase;
+ double best_incr = DBL_MAX;
+ double area;
+ double best_area;
+ uchar *best_key;
+ uchar *k = rt_PAGE_FIRST_KEY(page_buf, nod_flag);
+ uchar *last = rt_PAGE_END(page_buf);
+
+ LINT_INIT(best_area);
+ LINT_INIT(best_key);
+
+ for (; k < last; k = rt_PAGE_NEXT_KEY(k, key_length, nod_flag))
+ {
+ /* The following is safe as -1.0 is an exact number */
+ if ((increase = maria_rtree_area_increase(keyinfo->seg, k, key, key_length,
+ &area)) == -1.0)
+ return NULL;
+ /* The following should be safe, even if we compare doubles */
+ if (increase < best_incr)
+ {
+ best_key = k;
+ best_area = area;
+ best_incr = increase;
+ }
+ else
+ {
+ /* The following should be safe, even if we compare doubles */
+ if ((increase == best_incr) && (area < best_area))
+ {
+ best_key = k;
+ best_area = area;
+ best_incr = increase;
+ }
+ }
+ }
+ return best_key;
+}
+
+#endif /*PICK_BY_AREA*/
+
+/*
+ Go down and insert key into tree
+
+ RETURN
+ -1 Error
+ 0 Child was not split
+ 1 Child was split
+*/
+
+static int maria_rtree_insert_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *key,
+ uint key_length, my_off_t page,
+ my_off_t *new_page,
+ int ins_level, int level)
+{
+ uint nod_flag;
+ int res;
+ uchar *page_buf, *k;
+ DBUG_ENTER("maria_rtree_insert_req");
+
+ if (!(page_buf= (uchar*) my_alloca((uint)keyinfo->block_length +
+ HA_MAX_KEY_BUFF)))
+ {
+ my_errno = HA_ERR_OUT_OF_MEM;
+ DBUG_RETURN(-1); /* purecov: inspected */
+ }
+ if (!_ma_fetch_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf, 0))
+ goto err1;
+ nod_flag = _ma_test_if_nod(page_buf);
+ DBUG_PRINT("rtree", ("page: %lu level: %d ins_level: %d nod_flag: %u",
+ (ulong) page, level, ins_level, nod_flag));
+
+ if ((ins_level == -1 && nod_flag) || /* key: go down to leaf */
+ (ins_level > -1 && ins_level > level)) /* branch: go down to ins_level */
+ {
+ if ((k = maria_rtree_pick_key(info, keyinfo, key, key_length, page_buf,
+ nod_flag)) == NULL)
+ goto err1;
+ switch ((res = maria_rtree_insert_req(info, keyinfo, key, key_length,
+ _ma_kpos(nod_flag, k), new_page,
+ ins_level, level + 1)))
+ {
+ case 0: /* child was not split */
+ {
+ maria_rtree_combine_rect(keyinfo->seg, k, key, k, key_length);
+ if (_ma_write_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf))
+ goto err1;
+ goto ok;
+ }
+ case 1: /* child was split */
+ {
+ uchar *new_key = page_buf + keyinfo->block_length + nod_flag;
+ /* set proper MBR for key */
+ if (maria_rtree_set_key_mbr(info, keyinfo, k, key_length,
+ _ma_kpos(nod_flag, k)))
+ goto err1;
+ /* add new key for new page */
+ _ma_kpointer(info, new_key - nod_flag, *new_page);
+ if (maria_rtree_set_key_mbr(info, keyinfo, new_key, key_length,
+ *new_page))
+ goto err1;
+ res = maria_rtree_add_key(info, keyinfo, new_key, key_length,
+ page_buf, new_page);
+ if (_ma_write_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf))
+ goto err1;
+ goto ok;
+ }
+ default:
+ case -1: /* error */
+ {
+ goto err1;
+ }
+ }
+ }
+ else
+ {
+ res = maria_rtree_add_key(info, keyinfo, key, key_length, page_buf,
+ new_page);
+ if (_ma_write_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf))
+ goto err1;
+ }
+
+ok:
+ my_afree(page_buf);
+ DBUG_RETURN(res);
+
+err1:
+ my_afree(page_buf);
+ DBUG_RETURN(-1); /* purecov: inspected */
+}
+
+
+/*
+ Insert key into the tree
+
+ RETURN
+ -1 Error
+ 0 Root was not split
+ 1 Root was split
+*/
+
+static int maria_rtree_insert_level(MARIA_HA *info, uint keynr, uchar *key,
+ uint key_length, int ins_level)
+{
+ my_off_t old_root;
+ MARIA_KEYDEF *keyinfo = info->s->keyinfo + keynr;
+ int res;
+ my_off_t new_page;
+ DBUG_ENTER("maria_rtree_insert_level");
+
+ if ((old_root = info->s->state.key_root[keynr]) == HA_OFFSET_ERROR)
+ {
+ if ((old_root = _ma_new(info, keyinfo, DFLT_INIT_HITS)) == HA_OFFSET_ERROR)
+ DBUG_RETURN(-1);
+ info->keyread_buff_used = 1;
+ maria_putint(info->buff, 2, 0);
+ res = maria_rtree_add_key(info, keyinfo, key, key_length, info->buff, NULL);
+ if (_ma_write_keypage(info, keyinfo, old_root, DFLT_INIT_HITS, info->buff))
+ DBUG_RETURN(1);
+ info->s->state.key_root[keynr] = old_root;
+ DBUG_RETURN(res);
+ }
+
+ switch ((res = maria_rtree_insert_req(info, keyinfo, key, key_length,
+ old_root, &new_page, ins_level, 0)))
+ {
+ case 0: /* root was not split */
+ {
+ break;
+ }
+ case 1: /* root was split, grow a new root */
+ {
+ uchar *new_root_buf, *new_key;
+ my_off_t new_root;
+ uint nod_flag = info->s->base.key_reflength;
+
+ DBUG_PRINT("rtree", ("root was split, grow a new root"));
+ if (!(new_root_buf= (uchar*) my_alloca((uint)keyinfo->block_length +
+ HA_MAX_KEY_BUFF)))
+ {
+ my_errno = HA_ERR_OUT_OF_MEM;
+ DBUG_RETURN(-1); /* purecov: inspected */
+ }
+
+ maria_putint(new_root_buf, 2, nod_flag);
+ if ((new_root = _ma_new(info, keyinfo, DFLT_INIT_HITS)) ==
+ HA_OFFSET_ERROR)
+ goto err1;
+
+ new_key = new_root_buf + keyinfo->block_length + nod_flag;
+
+ _ma_kpointer(info, new_key - nod_flag, old_root);
+ if (maria_rtree_set_key_mbr(info, keyinfo, new_key, key_length,
+ old_root))
+ goto err1;
+ if (maria_rtree_add_key(info, keyinfo, new_key, key_length, new_root_buf,
+ NULL)
+ == -1)
+ goto err1;
+ _ma_kpointer(info, new_key - nod_flag, new_page);
+ if (maria_rtree_set_key_mbr(info, keyinfo, new_key, key_length,
+ new_page))
+ goto err1;
+ if (maria_rtree_add_key(info, keyinfo, new_key, key_length, new_root_buf,
+ NULL)
+ == -1)
+ goto err1;
+ if (_ma_write_keypage(info, keyinfo, new_root,
+ DFLT_INIT_HITS, new_root_buf))
+ goto err1;
+ info->s->state.key_root[keynr] = new_root;
+ DBUG_PRINT("rtree", ("new root page: %lu level: %d nod_flag: %u",
+ (ulong) new_root, 0,
+ _ma_test_if_nod(new_root_buf)));
+
+ my_afree((uchar*)new_root_buf);
+ break;
+err1:
+ my_afree((uchar*)new_root_buf);
+ DBUG_RETURN(-1); /* purecov: inspected */
+ }
+ default:
+ case -1: /* error */
+ {
+ break;
+ }
+ }
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Insert key into the tree - interface function
+
+ RETURN
+ -1 Error
+ 0 OK
+*/
+
+int maria_rtree_insert(MARIA_HA *info, uint keynr, uchar *key, uint key_length)
+{
+ DBUG_ENTER("maria_rtree_insert");
+ DBUG_RETURN((!key_length ||
+ (maria_rtree_insert_level(info, keynr, key, key_length, -1) == -1)) ?
+ -1 : 0);
+}
+
+
+/*
+ Fill reinsert page buffer
+
+ RETURN
+ -1 Error
+ 0 OK
+*/
+
+static int maria_rtree_fill_reinsert_list(stPageList *ReinsertList, my_off_t page,
+ int level)
+{
+ DBUG_ENTER("maria_rtree_fill_reinsert_list");
+ DBUG_PRINT("rtree", ("page: %lu level: %d", (ulong) page, level));
+ if (ReinsertList->n_pages == ReinsertList->m_pages)
+ {
+ ReinsertList->m_pages += REINSERT_BUFFER_INC;
+ if (!(ReinsertList->pages = (stPageLevel*)my_realloc((uchar*)ReinsertList->pages,
+ ReinsertList->m_pages * sizeof(stPageLevel), MYF(MY_ALLOW_ZERO_PTR))))
+ goto err1;
+ }
+ /* save page to ReinsertList */
+ ReinsertList->pages[ReinsertList->n_pages].offs = page;
+ ReinsertList->pages[ReinsertList->n_pages].level = level;
+ ReinsertList->n_pages++;
+ DBUG_RETURN(0);
+
+err1:
+ DBUG_RETURN(-1); /* purecov: inspected */
+}
+
+
+/*
+ Go down and delete key from the tree
+
+ RETURN
+ -1 Error
+ 0 Deleted
+ 1 Not found
+ 2 Empty leaf
+*/
+
+static int maria_rtree_delete_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *key,
+ uint key_length, my_off_t page,
+ uint *page_size,
+ stPageList *ReinsertList, int level)
+{
+ ulong i;
+ uint nod_flag;
+ int res;
+ uchar *page_buf, *last, *k;
+ DBUG_ENTER("maria_rtree_delete_req");
+
+ if (!(page_buf = (uchar*) my_alloca((uint)keyinfo->block_length)))
+ {
+ my_errno = HA_ERR_OUT_OF_MEM;
+ DBUG_RETURN(-1); /* purecov: inspected */
+ }
+ if (!_ma_fetch_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf, 0))
+ goto err1;
+ nod_flag = _ma_test_if_nod(page_buf);
+ DBUG_PRINT("rtree", ("page: %lu level: %d nod_flag: %u",
+ (ulong) page, level, nod_flag));
+
+ k = rt_PAGE_FIRST_KEY(page_buf, nod_flag);
+ last = rt_PAGE_END(page_buf);
+
+ for (i = 0; k < last; k = rt_PAGE_NEXT_KEY(k, key_length, nod_flag), i++)
+ {
+ if (nod_flag)
+ {
+ /* not leaf */
+ if (!maria_rtree_key_cmp(keyinfo->seg, key, k, key_length, MBR_WITHIN))
+ {
+ switch ((res = maria_rtree_delete_req(info, keyinfo, key, key_length,
+ _ma_kpos(nod_flag, k), page_size, ReinsertList, level + 1)))
+ {
+ case 0: /* deleted */
+ {
+ /* test page filling */
+ if (*page_size + key_length >=
+ rt_PAGE_MIN_SIZE(keyinfo->block_length))
+ {
+ /* OK */
+ /* Calculate a new key value (MBR) for the shrinked block. */
+ if (maria_rtree_set_key_mbr(info, keyinfo, k, key_length,
+ _ma_kpos(nod_flag, k)))
+ goto err1;
+ if (_ma_write_keypage(info, keyinfo, page,
+ DFLT_INIT_HITS, page_buf))
+ goto err1;
+ }
+ else
+ {
+ /*
+ Too small: delete key & add it descendant to reinsert list.
+ Store position and level of the block so that it can be
+ accessed later for inserting the remaining keys.
+ */
+ DBUG_PRINT("rtree", ("too small. move block to reinsert list"));
+ if (maria_rtree_fill_reinsert_list(ReinsertList,
+ _ma_kpos(nod_flag, k),
+ level + 1))
+ goto err1;
+ /*
+ Delete the key that references the block. This makes the
+ block disappear from the index. Hence we need to insert
+ its remaining keys later. Note: if the block is a branch
+ block, we do not only remove this block, but the whole
+ subtree. So we need to re-insert its keys on the same
+ level later to reintegrate the subtrees.
+ */
+ maria_rtree_delete_key(info, page_buf, k, key_length, nod_flag);
+ if (_ma_write_keypage(info, keyinfo, page,
+ DFLT_INIT_HITS, page_buf))
+ goto err1;
+ *page_size = maria_data_on_page(page_buf);
+ }
+
+ goto ok;
+ }
+ case 1: /* not found - continue searching */
+ {
+ break;
+ }
+ case 2: /* vacuous case: last key in the leaf */
+ {
+ maria_rtree_delete_key(info, page_buf, k, key_length, nod_flag);
+ if (_ma_write_keypage(info, keyinfo, page,
+ DFLT_INIT_HITS, page_buf))
+ goto err1;
+ *page_size = maria_data_on_page(page_buf);
+ res = 0;
+ goto ok;
+ }
+ default: /* error */
+ case -1:
+ {
+ goto err1;
+ }
+ }
+ }
+ }
+ else
+ {
+ /* leaf */
+ if (!maria_rtree_key_cmp(keyinfo->seg, key, k, key_length, MBR_EQUAL | MBR_DATA))
+ {
+ maria_rtree_delete_key(info, page_buf, k, key_length, nod_flag);
+ *page_size = maria_data_on_page(page_buf);
+ if (*page_size == 2)
+ {
+ /* last key in the leaf */
+ res = 2;
+ if (_ma_dispose(info, keyinfo, page, DFLT_INIT_HITS))
+ goto err1;
+ }
+ else
+ {
+ res = 0;
+ if (_ma_write_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf))
+ goto err1;
+ }
+ goto ok;
+ }
+ }
+ }
+ res = 1;
+
+ok:
+ my_afree((uchar*)page_buf);
+ DBUG_RETURN(res);
+
+err1:
+ my_afree((uchar*)page_buf);
+ DBUG_RETURN(-1); /* purecov: inspected */
+}
+
+
+/*
+ Delete key - interface function
+
+ RETURN
+ -1 Error
+ 0 Deleted
+*/
+
+int maria_rtree_delete(MARIA_HA *info, uint keynr, uchar *key, uint key_length)
+{
+ uint page_size;
+ stPageList ReinsertList;
+ my_off_t old_root;
+ MARIA_KEYDEF *keyinfo = info->s->keyinfo + keynr;
+ DBUG_ENTER("maria_rtree_delete");
+
+ if ((old_root = info->s->state.key_root[keynr]) == HA_OFFSET_ERROR)
+ {
+ my_errno= HA_ERR_END_OF_FILE;
+ DBUG_RETURN(-1); /* purecov: inspected */
+ }
+ DBUG_PRINT("rtree", ("starting deletion at root page: %lu",
+ (ulong) old_root));
+
+ ReinsertList.pages = NULL;
+ ReinsertList.n_pages = 0;
+ ReinsertList.m_pages = 0;
+
+ switch (maria_rtree_delete_req(info, keyinfo, key, key_length, old_root,
+ &page_size, &ReinsertList, 0))
+ {
+ case 2: /* empty */
+ {
+ info->s->state.key_root[keynr] = HA_OFFSET_ERROR;
+ DBUG_RETURN(0);
+ }
+ case 0: /* deleted */
+ {
+ uint nod_flag;
+ ulong i;
+ for (i = 0; i < ReinsertList.n_pages; ++i)
+ {
+ uchar *page_buf, *k, *last;
+
+ if (!(page_buf = (uchar*) my_alloca((uint)keyinfo->block_length)))
+ {
+ my_errno = HA_ERR_OUT_OF_MEM;
+ goto err1;
+ }
+ if (!_ma_fetch_keypage(info, keyinfo, ReinsertList.pages[i].offs,
+ DFLT_INIT_HITS, page_buf, 0))
+ goto err1;
+ nod_flag = _ma_test_if_nod(page_buf);
+ DBUG_PRINT("rtree", ("reinserting keys from "
+ "page: %lu level: %d nod_flag: %u",
+ (ulong) ReinsertList.pages[i].offs,
+ ReinsertList.pages[i].level, nod_flag));
+
+ k = rt_PAGE_FIRST_KEY(page_buf, nod_flag);
+ last = rt_PAGE_END(page_buf);
+ for (; k < last; k = rt_PAGE_NEXT_KEY(k, key_length, nod_flag))
+ {
+ int res;
+ if ((res=
+ maria_rtree_insert_level(info, keynr, k, key_length,
+ ReinsertList.pages[i].level)) == -1)
+ {
+ my_afree(page_buf);
+ goto err1;
+ }
+ if (res)
+ {
+ ulong j;
+ DBUG_PRINT("rtree", ("root has been split, adjust levels"));
+ for (j= i; j < ReinsertList.n_pages; j++)
+ {
+ ReinsertList.pages[j].level++;
+ DBUG_PRINT("rtree", ("keys from page: %lu now level: %d",
+ (ulong) ReinsertList.pages[i].offs,
+ ReinsertList.pages[i].level));
+ }
+ }
+ }
+ my_afree(page_buf);
+ if (_ma_dispose(info, keyinfo, ReinsertList.pages[i].offs,
+ DFLT_INIT_HITS))
+ goto err1;
+ }
+ if (ReinsertList.pages)
+ my_free((uchar*) ReinsertList.pages, MYF(0));
+
+ /* check for redundant root (not leaf, 1 child) and eliminate */
+ if ((old_root = info->s->state.key_root[keynr]) == HA_OFFSET_ERROR)
+ goto err1;
+ if (!_ma_fetch_keypage(info, keyinfo, old_root, DFLT_INIT_HITS,
+ info->buff, 0))
+ goto err1;
+ nod_flag = _ma_test_if_nod(info->buff);
+ page_size = maria_data_on_page(info->buff);
+ if (nod_flag && (page_size == 2 + key_length + nod_flag))
+ {
+ my_off_t new_root = _ma_kpos(nod_flag,
+ rt_PAGE_FIRST_KEY(info->buff, nod_flag));
+ if (_ma_dispose(info, keyinfo, old_root, DFLT_INIT_HITS))
+ goto err1;
+ info->s->state.key_root[keynr] = new_root;
+ }
+ info->update= HA_STATE_DELETED;
+ DBUG_RETURN(0);
+
+err1:
+ DBUG_RETURN(-1); /* purecov: inspected */
+ }
+ case 1: /* not found */
+ {
+ my_errno = HA_ERR_KEY_NOT_FOUND;
+ DBUG_RETURN(-1); /* purecov: inspected */
+ }
+ default:
+ case -1: /* error */
+ DBUG_RETURN(-1); /* purecov: inspected */
+ }
+}
+
+
+/*
+ Estimate number of suitable keys in the tree
+
+ RETURN
+ estimated value
+*/
+
+ha_rows maria_rtree_estimate(MARIA_HA *info, uint keynr, uchar *key,
+ uint key_length, uint flag)
+{
+ MARIA_KEYDEF *keyinfo = info->s->keyinfo + keynr;
+ my_off_t root;
+ uint i = 0;
+ uint nod_flag, k_len;
+ uchar *page_buf, *k, *last;
+ double area = 0;
+ ha_rows res = 0;
+
+ if (flag & MBR_DISJOINT)
+ return info->state->records;
+
+ if ((root = info->s->state.key_root[keynr]) == HA_OFFSET_ERROR)
+ return HA_POS_ERROR;
+ if (!(page_buf= (uchar*) my_alloca((uint)keyinfo->block_length)))
+ return HA_POS_ERROR;
+ if (!_ma_fetch_keypage(info, keyinfo, root, DFLT_INIT_HITS, page_buf, 0))
+ goto err1;
+ nod_flag = _ma_test_if_nod(page_buf);
+
+ k_len = keyinfo->keylength - info->s->base.rec_reflength;
+
+ k = rt_PAGE_FIRST_KEY(page_buf, nod_flag);
+ last = rt_PAGE_END(page_buf);
+
+ for (; k < last; k = rt_PAGE_NEXT_KEY(k, k_len, nod_flag), i++)
+ {
+ if (nod_flag)
+ {
+ double k_area = maria_rtree_rect_volume(keyinfo->seg, k, key_length);
+
+ /* The following should be safe, even if we compare doubles */
+ if (k_area == 0)
+ {
+ if (flag & (MBR_CONTAIN | MBR_INTERSECT))
+ {
+ area += 1;
+ }
+ else if (flag & (MBR_WITHIN | MBR_EQUAL))
+ {
+ if (!maria_rtree_key_cmp(keyinfo->seg, key, k, key_length,
+ MBR_WITHIN))
+ area += 1;
+ }
+ else
+ goto err1;
+ }
+ else
+ {
+ if (flag & (MBR_CONTAIN | MBR_INTERSECT))
+ {
+ area+= maria_rtree_overlapping_area(keyinfo->seg, key, k,
+ key_length) / k_area;
+ }
+ else if (flag & (MBR_WITHIN | MBR_EQUAL))
+ {
+ if (!maria_rtree_key_cmp(keyinfo->seg, key, k, key_length,
+ MBR_WITHIN))
+ area+= (maria_rtree_rect_volume(keyinfo->seg, key, key_length) /
+ k_area);
+ }
+ else
+ goto err1;
+ }
+ }
+ else
+ {
+ if (!maria_rtree_key_cmp(keyinfo->seg, key, k, key_length, flag))
+ ++res;
+ }
+ }
+ if (nod_flag)
+ {
+ if (i)
+ res = (ha_rows) (area / i * info->state->records);
+ else
+ res = HA_POS_ERROR;
+ }
+
+ my_afree((uchar*)page_buf);
+ return res;
+
+err1:
+ my_afree(page_buf);
+ return HA_POS_ERROR;
+}
+
+#endif /*HAVE_RTREE_KEYS*/
diff --git a/storage/maria/ma_rt_index.h b/storage/maria/ma_rt_index.h
new file mode 100644
index 00000000000..fe2f62b662c
--- /dev/null
+++ b/storage/maria/ma_rt_index.h
@@ -0,0 +1,49 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+ & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _rt_index_h
+#define _rt_index_h
+
+#ifdef HAVE_RTREE_KEYS
+
+#define rt_PAGE_FIRST_KEY(page, nod_flag) (page + 2 + nod_flag)
+#define rt_PAGE_NEXT_KEY(key, key_length, nod_flag) (key + key_length + \
+ (nod_flag ? nod_flag : info->s->base.rec_reflength))
+#define rt_PAGE_END(page) (page + maria_data_on_page(page))
+
+#define rt_PAGE_MIN_SIZE(block_length) ((uint)(block_length) / 3)
+
+int maria_rtree_insert(MARIA_HA *info, uint keynr, uchar *key,
+ uint key_length);
+int maria_rtree_delete(MARIA_HA *info, uint keynr, uchar *key,
+ uint key_length);
+
+int maria_rtree_find_first(MARIA_HA *info, uint keynr, uchar *key,
+ uint key_length, uint search_flag);
+int maria_rtree_find_next(MARIA_HA *info, uint keynr, uint search_flag);
+
+int maria_rtree_get_first(MARIA_HA *info, uint keynr, uint key_length);
+int maria_rtree_get_next(MARIA_HA *info, uint keynr, uint key_length);
+
+ha_rows maria_rtree_estimate(MARIA_HA *info, uint keynr, uchar *key,
+ uint key_length, uint flag);
+
+int maria_rtree_split_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *page,
+ uchar *key, uint key_length,
+ my_off_t *new_page_offs);
+
+#endif /*HAVE_RTREE_KEYS*/
+#endif /* _rt_index_h */
diff --git a/storage/maria/ma_rt_key.c b/storage/maria/ma_rt_key.c
new file mode 100644
index 00000000000..b74d5d06690
--- /dev/null
+++ b/storage/maria/ma_rt_key.c
@@ -0,0 +1,109 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+
+#ifdef HAVE_RTREE_KEYS
+#include "ma_rt_index.h"
+#include "ma_rt_key.h"
+#include "ma_rt_mbr.h"
+
+/*
+ Add key to the page
+
+ RESULT VALUES
+ -1 Error
+ 0 Not split
+ 1 Split
+*/
+
+int maria_rtree_add_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key,
+ uint key_length, uchar *page_buf, my_off_t *new_page)
+{
+ uint page_size = maria_data_on_page(page_buf);
+ uint nod_flag = _ma_test_if_nod(page_buf);
+ DBUG_ENTER("maria_rtree_add_key");
+
+ if (page_size + key_length + info->s->base.rec_reflength <=
+ keyinfo->block_length)
+ {
+ /* split won't be necessary */
+ if (nod_flag)
+ {
+ /* save key */
+ DBUG_ASSERT(_ma_kpos(nod_flag, key) < info->state->key_file_length);
+ memcpy(rt_PAGE_END(page_buf), key - nod_flag, key_length + nod_flag);
+ page_size += key_length + nod_flag;
+ }
+ else
+ {
+ /* save key */
+ DBUG_ASSERT(_ma_dpos(info, nod_flag, key + key_length +
+ info->s->base.rec_reflength) <
+ info->state->data_file_length +
+ info->s->base.pack_reclength);
+ memcpy(rt_PAGE_END(page_buf), key, key_length +
+ info->s->base.rec_reflength);
+ page_size += key_length + info->s->base.rec_reflength;
+ }
+ maria_putint(page_buf, page_size, nod_flag);
+ DBUG_RETURN(0);
+ }
+
+ DBUG_RETURN(maria_rtree_split_page(info, keyinfo, page_buf, key, key_length,
+ new_page) ? -1 : 1);
+}
+
+
+/*
+ Delete key from the page
+*/
+
+int maria_rtree_delete_key(MARIA_HA *info, uchar *page_buf, uchar *key,
+ uint key_length, uint nod_flag)
+{
+ uint16 page_size = maria_data_on_page(page_buf);
+ uchar *key_start;
+
+ key_start= key - nod_flag;
+ if (!nod_flag)
+ key_length += info->s->base.rec_reflength;
+
+ memmove(key_start, key + key_length, page_size - key_length -
+ (key - page_buf));
+ page_size-= key_length + nod_flag;
+
+ maria_putint(page_buf, page_size, nod_flag);
+ return 0;
+}
+
+
+/*
+ Calculate and store key MBR
+*/
+
+int maria_rtree_set_key_mbr(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key,
+ uint key_length, my_off_t child_page)
+{
+ DBUG_ENTER("maria_rtree_set_key_mbr");
+ if (!_ma_fetch_keypage(info, keyinfo, child_page,
+ DFLT_INIT_HITS, info->buff, 0))
+ DBUG_RETURN(-1);
+
+ DBUG_RETURN(maria_rtree_page_mbr(info, keyinfo->seg,
+ info->buff, key, key_length));
+}
+
+#endif /*HAVE_RTREE_KEYS*/
diff --git a/storage/maria/ma_rt_key.h b/storage/maria/ma_rt_key.h
new file mode 100644
index 00000000000..3f95d3d3e67
--- /dev/null
+++ b/storage/maria/ma_rt_key.h
@@ -0,0 +1,32 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+ & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Ramil Kalimullin, who has a shared copyright to this code */
+
+#ifndef _rt_key_h
+#define _rt_key_h
+
+#ifdef HAVE_RTREE_KEYS
+
+int maria_rtree_add_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key,
+ uint key_length, uchar *page_buf, my_off_t *new_page);
+int maria_rtree_delete_key(MARIA_HA *info, uchar *page, uchar *key,
+ uint key_length, uint nod_flag);
+int maria_rtree_set_key_mbr(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key,
+ uint key_length, my_off_t child_page);
+
+#endif /*HAVE_RTREE_KEYS*/
+#endif /* _rt_key_h */
diff --git a/storage/maria/ma_rt_mbr.c b/storage/maria/ma_rt_mbr.c
new file mode 100644
index 00000000000..a224cefac12
--- /dev/null
+++ b/storage/maria/ma_rt_mbr.c
@@ -0,0 +1,806 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+ & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+
+#ifdef HAVE_RTREE_KEYS
+
+#include "ma_rt_index.h"
+#include "ma_rt_mbr.h"
+
+#define INTERSECT_CMP(amin, amax, bmin, bmax) ((amin > bmax) || (bmin > amax))
+#define CONTAIN_CMP(amin, amax, bmin, bmax) ((bmin > amin) || (bmax < amax))
+#define WITHIN_CMP(amin, amax, bmin, bmax) ((amin > bmin) || (amax < bmax))
+#define DISJOINT_CMP(amin, amax, bmin, bmax) ((amin <= bmax) && (bmin <= amax))
+#define EQUAL_CMP(amin, amax, bmin, bmax) ((amin != bmin) || (amax != bmax))
+
+#define FCMP(A, B) ((int)(A) - (int)(B))
+#define p_inc(A, B, X) {A += X; B += X;}
+
+#define RT_CMP(nextflag) \
+ if (nextflag & MBR_INTERSECT) \
+ { \
+ if (INTERSECT_CMP(amin, amax, bmin, bmax)) \
+ return 1; \
+ } \
+ else if (nextflag & MBR_CONTAIN) \
+ { \
+ if (CONTAIN_CMP(amin, amax, bmin, bmax)) \
+ return 1; \
+ } \
+ else if (nextflag & MBR_WITHIN) \
+ { \
+ if (WITHIN_CMP(amin, amax, bmin, bmax)) \
+ return 1; \
+ } \
+ else if (nextflag & MBR_EQUAL) \
+ { \
+ if (EQUAL_CMP(amin, amax, bmin, bmax)) \
+ return 1; \
+ } \
+ else if (nextflag & MBR_DISJOINT) \
+ { \
+ if (DISJOINT_CMP(amin, amax, bmin, bmax)) \
+ return 1; \
+ }\
+ else /* if unknown comparison operator */ \
+ { \
+ DBUG_ASSERT(0); \
+ }
+
+#define RT_CMP_KORR(type, korr_func, len, nextflag) \
+{ \
+ type amin, amax, bmin, bmax; \
+ amin = korr_func(a); \
+ bmin = korr_func(b); \
+ amax = korr_func(a+len); \
+ bmax = korr_func(b+len); \
+ RT_CMP(nextflag); \
+}
+
+#define RT_CMP_GET(type, get_func, len, nextflag) \
+{ \
+ type amin, amax, bmin, bmax; \
+ get_func(amin, a); \
+ get_func(bmin, b); \
+ get_func(amax, a+len); \
+ get_func(bmax, b+len); \
+ RT_CMP(nextflag); \
+}
+
+/*
+ Compares two keys a and b depending on nextflag
+ nextflag can contain these flags:
+ MBR_INTERSECT(a,b) a overlaps b
+ MBR_CONTAIN(a,b) a contains b
+ MBR_DISJOINT(a,b) a disjoint b
+ MBR_WITHIN(a,b) a within b
+ MBR_EQUAL(a,b) All coordinates of MBRs are equal
+ MBR_DATA(a,b) Data reference is the same
+ Returns 0 on success.
+*/
+
+int maria_rtree_key_cmp(HA_KEYSEG *keyseg, uchar *b, uchar *a, uint key_length,
+ uint nextflag)
+{
+ for (; (int) key_length > 0; keyseg += 2 )
+ {
+ uint32 keyseg_length;
+ switch ((enum ha_base_keytype) keyseg->type) {
+ case HA_KEYTYPE_INT8:
+ RT_CMP_KORR(int8, mi_sint1korr, 1, nextflag);
+ break;
+ case HA_KEYTYPE_BINARY:
+ RT_CMP_KORR(uint8, mi_uint1korr, 1, nextflag);
+ break;
+ case HA_KEYTYPE_SHORT_INT:
+ RT_CMP_KORR(int16, mi_sint2korr, 2, nextflag);
+ break;
+ case HA_KEYTYPE_USHORT_INT:
+ RT_CMP_KORR(uint16, mi_uint2korr, 2, nextflag);
+ break;
+ case HA_KEYTYPE_INT24:
+ RT_CMP_KORR(int32, mi_sint3korr, 3, nextflag);
+ break;
+ case HA_KEYTYPE_UINT24:
+ RT_CMP_KORR(uint32, mi_uint3korr, 3, nextflag);
+ break;
+ case HA_KEYTYPE_LONG_INT:
+ RT_CMP_KORR(int32, mi_sint4korr, 4, nextflag);
+ break;
+ case HA_KEYTYPE_ULONG_INT:
+ RT_CMP_KORR(uint32, mi_uint4korr, 4, nextflag);
+ break;
+#ifdef HAVE_LONG_LONG
+ case HA_KEYTYPE_LONGLONG:
+ RT_CMP_KORR(longlong, mi_sint8korr, 8, nextflag)
+ break;
+ case HA_KEYTYPE_ULONGLONG:
+ RT_CMP_KORR(ulonglong, mi_uint8korr, 8, nextflag)
+ break;
+#endif
+ case HA_KEYTYPE_FLOAT:
+ /* The following should be safe, even if we compare doubles */
+ RT_CMP_GET(float, mi_float4get, 4, nextflag);
+ break;
+ case HA_KEYTYPE_DOUBLE:
+ RT_CMP_GET(double, mi_float8get, 8, nextflag);
+ break;
+ case HA_KEYTYPE_END:
+ goto end;
+ default:
+ return 1;
+ }
+ keyseg_length= keyseg->length * 2;
+ key_length-= keyseg_length;
+ a+= keyseg_length;
+ b+= keyseg_length;
+ }
+
+end:
+ if (nextflag & MBR_DATA)
+ {
+ uchar *end = a + keyseg->length;
+ do
+ {
+ if (*a++ != *b++)
+ return FCMP(a[-1], b[-1]);
+ } while (a != end);
+ }
+ return 0;
+}
+
+#define RT_VOL_KORR(type, korr_func, len, cast) \
+{ \
+ type amin, amax; \
+ amin = korr_func(a); \
+ amax = korr_func(a+len); \
+ res *= (cast(amax) - cast(amin)); \
+}
+
+#define RT_VOL_GET(type, get_func, len, cast) \
+{ \
+ type amin, amax; \
+ get_func(amin, a); \
+ get_func(amax, a+len); \
+ res *= (cast(amax) - cast(amin)); \
+}
+
+/*
+ Calculates rectangle volume
+*/
+double maria_rtree_rect_volume(HA_KEYSEG *keyseg, uchar *a, uint key_length)
+{
+ double res = 1;
+ for (; (int)key_length > 0; keyseg += 2)
+ {
+ uint32 keyseg_length;
+ switch ((enum ha_base_keytype) keyseg->type) {
+ case HA_KEYTYPE_INT8:
+ RT_VOL_KORR(int8, mi_sint1korr, 1, (double));
+ break;
+ case HA_KEYTYPE_BINARY:
+ RT_VOL_KORR(uint8, mi_uint1korr, 1, (double));
+ break;
+ case HA_KEYTYPE_SHORT_INT:
+ RT_VOL_KORR(int16, mi_sint2korr, 2, (double));
+ break;
+ case HA_KEYTYPE_USHORT_INT:
+ RT_VOL_KORR(uint16, mi_uint2korr, 2, (double));
+ break;
+ case HA_KEYTYPE_INT24:
+ RT_VOL_KORR(int32, mi_sint3korr, 3, (double));
+ break;
+ case HA_KEYTYPE_UINT24:
+ RT_VOL_KORR(uint32, mi_uint3korr, 3, (double));
+ break;
+ case HA_KEYTYPE_LONG_INT:
+ RT_VOL_KORR(int32, mi_sint4korr, 4, (double));
+ break;
+ case HA_KEYTYPE_ULONG_INT:
+ RT_VOL_KORR(uint32, mi_uint4korr, 4, (double));
+ break;
+#ifdef HAVE_LONG_LONG
+ case HA_KEYTYPE_LONGLONG:
+ RT_VOL_KORR(longlong, mi_sint8korr, 8, (double));
+ break;
+ case HA_KEYTYPE_ULONGLONG:
+ RT_VOL_KORR(longlong, mi_sint8korr, 8, ulonglong2double);
+ break;
+#endif
+ case HA_KEYTYPE_FLOAT:
+ RT_VOL_GET(float, mi_float4get, 4, (double));
+ break;
+ case HA_KEYTYPE_DOUBLE:
+ RT_VOL_GET(double, mi_float8get, 8, (double));
+ break;
+ case HA_KEYTYPE_END:
+ key_length = 0;
+ break;
+ default:
+ return -1;
+ }
+ keyseg_length= keyseg->length * 2;
+ key_length-= keyseg_length;
+ a+= keyseg_length;
+ }
+ return res;
+}
+
+#define RT_D_MBR_KORR(type, korr_func, len, cast) \
+{ \
+ type amin, amax; \
+ amin = korr_func(a); \
+ amax = korr_func(a+len); \
+ *res++ = cast(amin); \
+ *res++ = cast(amax); \
+}
+
+#define RT_D_MBR_GET(type, get_func, len, cast) \
+{ \
+ type amin, amax; \
+ get_func(amin, a); \
+ get_func(amax, a+len); \
+ *res++ = cast(amin); \
+ *res++ = cast(amax); \
+}
+
+
+/*
+ Creates an MBR as an array of doubles.
+*/
+
+int maria_rtree_d_mbr(HA_KEYSEG *keyseg, uchar *a, uint key_length, double *res)
+{
+ for (; (int)key_length > 0; keyseg += 2)
+ {
+ uint32 keyseg_length;
+ switch ((enum ha_base_keytype) keyseg->type) {
+ case HA_KEYTYPE_INT8:
+ RT_D_MBR_KORR(int8, mi_sint1korr, 1, (double));
+ break;
+ case HA_KEYTYPE_BINARY:
+ RT_D_MBR_KORR(uint8, mi_uint1korr, 1, (double));
+ break;
+ case HA_KEYTYPE_SHORT_INT:
+ RT_D_MBR_KORR(int16, mi_sint2korr, 2, (double));
+ break;
+ case HA_KEYTYPE_USHORT_INT:
+ RT_D_MBR_KORR(uint16, mi_uint2korr, 2, (double));
+ break;
+ case HA_KEYTYPE_INT24:
+ RT_D_MBR_KORR(int32, mi_sint3korr, 3, (double));
+ break;
+ case HA_KEYTYPE_UINT24:
+ RT_D_MBR_KORR(uint32, mi_uint3korr, 3, (double));
+ break;
+ case HA_KEYTYPE_LONG_INT:
+ RT_D_MBR_KORR(int32, mi_sint4korr, 4, (double));
+ break;
+ case HA_KEYTYPE_ULONG_INT:
+ RT_D_MBR_KORR(uint32, mi_uint4korr, 4, (double));
+ break;
+#ifdef HAVE_LONG_LONG
+ case HA_KEYTYPE_LONGLONG:
+ RT_D_MBR_KORR(longlong, mi_sint8korr, 8, (double));
+ break;
+ case HA_KEYTYPE_ULONGLONG:
+ RT_D_MBR_KORR(longlong, mi_sint8korr, 8, ulonglong2double);
+ break;
+#endif
+ case HA_KEYTYPE_FLOAT:
+ RT_D_MBR_GET(float, mi_float4get, 4, (double));
+ break;
+ case HA_KEYTYPE_DOUBLE:
+ RT_D_MBR_GET(double, mi_float8get, 8, (double));
+ break;
+ case HA_KEYTYPE_END:
+ key_length = 0;
+ break;
+ default:
+ return 1;
+ }
+ keyseg_length= keyseg->length * 2;
+ key_length-= keyseg_length;
+ a+= keyseg_length;
+ }
+ return 0;
+}
+
+#define RT_COMB_KORR(type, korr_func, store_func, len) \
+{ \
+ type amin, amax, bmin, bmax; \
+ amin = korr_func(a); \
+ bmin = korr_func(b); \
+ amax = korr_func(a+len); \
+ bmax = korr_func(b+len); \
+ amin = min(amin, bmin); \
+ amax = max(amax, bmax); \
+ store_func(c, amin); \
+ store_func(c+len, amax); \
+}
+
+#define RT_COMB_GET(type, get_func, store_func, len) \
+{ \
+ type amin, amax, bmin, bmax; \
+ get_func(amin, a); \
+ get_func(bmin, b); \
+ get_func(amax, a+len); \
+ get_func(bmax, b+len); \
+ amin = min(amin, bmin); \
+ amax = max(amax, bmax); \
+ store_func(c, amin); \
+ store_func(c+len, amax); \
+}
+
+/*
+ Creates common minimal bounding rectungle
+ for two input rectagnles a and b
+ Result is written to c
+*/
+
+int maria_rtree_combine_rect(HA_KEYSEG *keyseg, uchar* a, uchar* b, uchar* c,
+ uint key_length)
+{
+ for ( ; (int) key_length > 0 ; keyseg += 2)
+ {
+ uint32 keyseg_length;
+ switch ((enum ha_base_keytype) keyseg->type) {
+ case HA_KEYTYPE_INT8:
+ RT_COMB_KORR(int8, mi_sint1korr, mi_int1store, 1);
+ break;
+ case HA_KEYTYPE_BINARY:
+ RT_COMB_KORR(uint8, mi_uint1korr, mi_int1store, 1);
+ break;
+ case HA_KEYTYPE_SHORT_INT:
+ RT_COMB_KORR(int16, mi_sint2korr, mi_int2store, 2);
+ break;
+ case HA_KEYTYPE_USHORT_INT:
+ RT_COMB_KORR(uint16, mi_uint2korr, mi_int2store, 2);
+ break;
+ case HA_KEYTYPE_INT24:
+ RT_COMB_KORR(int32, mi_sint3korr, mi_int3store, 3);
+ break;
+ case HA_KEYTYPE_UINT24:
+ RT_COMB_KORR(uint32, mi_uint3korr, mi_int3store, 3);
+ break;
+ case HA_KEYTYPE_LONG_INT:
+ RT_COMB_KORR(int32, mi_sint4korr, mi_int4store, 4);
+ break;
+ case HA_KEYTYPE_ULONG_INT:
+ RT_COMB_KORR(uint32, mi_uint4korr, mi_int4store, 4);
+ break;
+#ifdef HAVE_LONG_LONG
+ case HA_KEYTYPE_LONGLONG:
+ RT_COMB_KORR(longlong, mi_sint8korr, mi_int8store, 8);
+ break;
+ case HA_KEYTYPE_ULONGLONG:
+ RT_COMB_KORR(ulonglong, mi_uint8korr, mi_int8store, 8);
+ break;
+#endif
+ case HA_KEYTYPE_FLOAT:
+ RT_COMB_GET(float, mi_float4get, mi_float4store, 4);
+ break;
+ case HA_KEYTYPE_DOUBLE:
+ RT_COMB_GET(double, mi_float8get, mi_float8store, 8);
+ break;
+ case HA_KEYTYPE_END:
+ return 0;
+ default:
+ return 1;
+ }
+ keyseg_length= keyseg->length * 2;
+ key_length-= keyseg_length;
+ a+= keyseg_length;
+ b+= keyseg_length;
+ c+= keyseg_length;
+ }
+ return 0;
+}
+
+
+#define RT_OVL_AREA_KORR(type, korr_func, len) \
+{ \
+ type amin, amax, bmin, bmax; \
+ amin = korr_func(a); \
+ bmin = korr_func(b); \
+ amax = korr_func(a+len); \
+ bmax = korr_func(b+len); \
+ amin = max(amin, bmin); \
+ amax = min(amax, bmax); \
+ if (amin >= amax) \
+ return 0; \
+ res *= amax - amin; \
+}
+
+#define RT_OVL_AREA_GET(type, get_func, len) \
+{ \
+ type amin, amax, bmin, bmax; \
+ get_func(amin, a); \
+ get_func(bmin, b); \
+ get_func(amax, a+len); \
+ get_func(bmax, b+len); \
+ amin = max(amin, bmin); \
+ amax = min(amax, bmax); \
+ if (amin >= amax) \
+ return 0; \
+ res *= amax - amin; \
+}
+
+/*
+Calculates overlapping area of two MBRs a & b
+*/
+double maria_rtree_overlapping_area(HA_KEYSEG *keyseg, uchar* a, uchar* b,
+ uint key_length)
+{
+ double res = 1;
+ for (; (int) key_length > 0 ; keyseg += 2)
+ {
+ uint32 keyseg_length;
+ switch ((enum ha_base_keytype) keyseg->type) {
+ case HA_KEYTYPE_INT8:
+ RT_OVL_AREA_KORR(int8, mi_sint1korr, 1);
+ break;
+ case HA_KEYTYPE_BINARY:
+ RT_OVL_AREA_KORR(uint8, mi_uint1korr, 1);
+ break;
+ case HA_KEYTYPE_SHORT_INT:
+ RT_OVL_AREA_KORR(int16, mi_sint2korr, 2);
+ break;
+ case HA_KEYTYPE_USHORT_INT:
+ RT_OVL_AREA_KORR(uint16, mi_uint2korr, 2);
+ break;
+ case HA_KEYTYPE_INT24:
+ RT_OVL_AREA_KORR(int32, mi_sint3korr, 3);
+ break;
+ case HA_KEYTYPE_UINT24:
+ RT_OVL_AREA_KORR(uint32, mi_uint3korr, 3);
+ break;
+ case HA_KEYTYPE_LONG_INT:
+ RT_OVL_AREA_KORR(int32, mi_sint4korr, 4);
+ break;
+ case HA_KEYTYPE_ULONG_INT:
+ RT_OVL_AREA_KORR(uint32, mi_uint4korr, 4);
+ break;
+#ifdef HAVE_LONG_LONG
+ case HA_KEYTYPE_LONGLONG:
+ RT_OVL_AREA_KORR(longlong, mi_sint8korr, 8);
+ break;
+ case HA_KEYTYPE_ULONGLONG:
+ RT_OVL_AREA_KORR(longlong, mi_sint8korr, 8);
+ break;
+#endif
+ case HA_KEYTYPE_FLOAT:
+ RT_OVL_AREA_GET(float, mi_float4get, 4);
+ break;
+ case HA_KEYTYPE_DOUBLE:
+ RT_OVL_AREA_GET(double, mi_float8get, 8);
+ break;
+ case HA_KEYTYPE_END:
+ return res;
+ default:
+ return -1;
+ }
+ keyseg_length= keyseg->length * 2;
+ key_length-= keyseg_length;
+ a+= keyseg_length;
+ b+= keyseg_length;
+ }
+ return res;
+}
+
+#define RT_AREA_INC_KORR(type, korr_func, len) \
+{ \
+ type amin, amax, bmin, bmax; \
+ amin = korr_func(a); \
+ bmin = korr_func(b); \
+ amax = korr_func(a+len); \
+ bmax = korr_func(b+len); \
+ a_area *= (((double)amax) - ((double)amin)); \
+ loc_ab_area *= ((double)max(amax, bmax) - (double)min(amin, bmin)); \
+}
+
+#define RT_AREA_INC_GET(type, get_func, len)\
+{\
+ type amin, amax, bmin, bmax; \
+ get_func(amin, a); \
+ get_func(bmin, b); \
+ get_func(amax, a+len); \
+ get_func(bmax, b+len); \
+ a_area *= (((double)amax) - ((double)amin)); \
+ loc_ab_area *= ((double)max(amax, bmax) - (double)min(amin, bmin)); \
+}
+
+/*
+ Calculates MBR_AREA(a+b) - MBR_AREA(a)
+*/
+
+double maria_rtree_area_increase(HA_KEYSEG *keyseg, uchar *a, uchar *b,
+ uint key_length, double *ab_area)
+{
+ double a_area= 1.0;
+ double loc_ab_area= 1.0;
+
+ *ab_area= 1.0;
+ for (; (int)key_length > 0; keyseg += 2)
+ {
+ uint32 keyseg_length;
+
+ if (keyseg->null_bit) /* Handle NULL part */
+ return -1;
+
+ switch ((enum ha_base_keytype) keyseg->type) {
+ case HA_KEYTYPE_INT8:
+ RT_AREA_INC_KORR(int8, mi_sint1korr, 1);
+ break;
+ case HA_KEYTYPE_BINARY:
+ RT_AREA_INC_KORR(uint8, mi_uint1korr, 1);
+ break;
+ case HA_KEYTYPE_SHORT_INT:
+ RT_AREA_INC_KORR(int16, mi_sint2korr, 2);
+ break;
+ case HA_KEYTYPE_USHORT_INT:
+ RT_AREA_INC_KORR(uint16, mi_uint2korr, 2);
+ break;
+ case HA_KEYTYPE_INT24:
+ RT_AREA_INC_KORR(int32, mi_sint3korr, 3);
+ break;
+ case HA_KEYTYPE_UINT24:
+ RT_AREA_INC_KORR(int32, mi_uint3korr, 3);
+ break;
+ case HA_KEYTYPE_LONG_INT:
+ RT_AREA_INC_KORR(int32, mi_sint4korr, 4);
+ break;
+ case HA_KEYTYPE_ULONG_INT:
+ RT_AREA_INC_KORR(uint32, mi_uint4korr, 4);
+ break;
+#ifdef HAVE_LONG_LONG
+ case HA_KEYTYPE_LONGLONG:
+ RT_AREA_INC_KORR(longlong, mi_sint8korr, 8);
+ break;
+ case HA_KEYTYPE_ULONGLONG:
+ RT_AREA_INC_KORR(longlong, mi_sint8korr, 8);
+ break;
+#endif
+ case HA_KEYTYPE_FLOAT:
+ RT_AREA_INC_GET(float, mi_float4get, 4);
+ break;
+ case HA_KEYTYPE_DOUBLE:
+ RT_AREA_INC_GET(double, mi_float8get, 8);
+ break;
+ case HA_KEYTYPE_END:
+ goto safe_end;
+ default:
+ return -1;
+ }
+ keyseg_length= keyseg->length * 2;
+ key_length-= keyseg_length;
+ a+= keyseg_length;
+ b+= keyseg_length;
+ }
+safe_end:
+ *ab_area= loc_ab_area;
+ return loc_ab_area - a_area;
+}
+
+#define RT_PERIM_INC_KORR(type, korr_func, len) \
+{ \
+ type amin, amax, bmin, bmax; \
+ amin = korr_func(a); \
+ bmin = korr_func(b); \
+ amax = korr_func(a+len); \
+ bmax = korr_func(b+len); \
+ a_perim+= (((double)amax) - ((double)amin)); \
+ *ab_perim+= ((double)max(amax, bmax) - (double)min(amin, bmin)); \
+}
+
+#define RT_PERIM_INC_GET(type, get_func, len)\
+{\
+ type amin, amax, bmin, bmax; \
+ get_func(amin, a); \
+ get_func(bmin, b); \
+ get_func(amax, a+len); \
+ get_func(bmax, b+len); \
+ a_perim+= (((double)amax) - ((double)amin)); \
+ *ab_perim+= ((double)max(amax, bmax) - (double)min(amin, bmin)); \
+}
+
+/*
+Calculates MBR_PERIMETER(a+b) - MBR_PERIMETER(a)
+*/
+double maria_rtree_perimeter_increase(HA_KEYSEG *keyseg, uchar* a, uchar* b,
+ uint key_length, double *ab_perim)
+{
+ double a_perim = 0.0;
+
+ *ab_perim= 0.0;
+ for (; (int)key_length > 0; keyseg += 2)
+ {
+ uint32 keyseg_length;
+
+ if (keyseg->null_bit) /* Handle NULL part */
+ return -1;
+
+ switch ((enum ha_base_keytype) keyseg->type) {
+ case HA_KEYTYPE_INT8:
+ RT_PERIM_INC_KORR(int8, mi_sint1korr, 1);
+ break;
+ case HA_KEYTYPE_BINARY:
+ RT_PERIM_INC_KORR(uint8, mi_uint1korr, 1);
+ break;
+ case HA_KEYTYPE_SHORT_INT:
+ RT_PERIM_INC_KORR(int16, mi_sint2korr, 2);
+ break;
+ case HA_KEYTYPE_USHORT_INT:
+ RT_PERIM_INC_KORR(uint16, mi_uint2korr, 2);
+ break;
+ case HA_KEYTYPE_INT24:
+ RT_PERIM_INC_KORR(int32, mi_sint3korr, 3);
+ break;
+ case HA_KEYTYPE_UINT24:
+ RT_PERIM_INC_KORR(int32, mi_uint3korr, 3);
+ break;
+ case HA_KEYTYPE_LONG_INT:
+ RT_PERIM_INC_KORR(int32, mi_sint4korr, 4);
+ break;
+ case HA_KEYTYPE_ULONG_INT:
+ RT_PERIM_INC_KORR(uint32, mi_uint4korr, 4);
+ break;
+#ifdef HAVE_LONG_LONG
+ case HA_KEYTYPE_LONGLONG:
+ RT_PERIM_INC_KORR(longlong, mi_sint8korr, 8);
+ break;
+ case HA_KEYTYPE_ULONGLONG:
+ RT_PERIM_INC_KORR(longlong, mi_sint8korr, 8);
+ break;
+#endif
+ case HA_KEYTYPE_FLOAT:
+ RT_PERIM_INC_GET(float, mi_float4get, 4);
+ break;
+ case HA_KEYTYPE_DOUBLE:
+ RT_PERIM_INC_GET(double, mi_float8get, 8);
+ break;
+ case HA_KEYTYPE_END:
+ return *ab_perim - a_perim;
+ default:
+ return -1;
+ }
+ keyseg_length= keyseg->length * 2;
+ key_length-= keyseg_length;
+ a+= keyseg_length;
+ b+= keyseg_length;
+ }
+ return *ab_perim - a_perim;
+}
+
+
+#define RT_PAGE_MBR_KORR(type, korr_func, store_func, len) \
+{ \
+ type amin, amax, bmin, bmax; \
+ amin = korr_func(k + inc); \
+ amax = korr_func(k + inc + len); \
+ k = rt_PAGE_NEXT_KEY(k, k_len, nod_flag); \
+ for (; k < last; k = rt_PAGE_NEXT_KEY(k, k_len, nod_flag)) \
+{ \
+ bmin = korr_func(k + inc); \
+ bmax = korr_func(k + inc + len); \
+ if (amin > bmin) \
+ amin = bmin; \
+ if (amax < bmax) \
+ amax = bmax; \
+} \
+ store_func(c, amin); \
+ c += len; \
+ store_func(c, amax); \
+ c += len; \
+ inc += 2 * len; \
+}
+
+#define RT_PAGE_MBR_GET(type, get_func, store_func, len) \
+{ \
+ type amin, amax, bmin, bmax; \
+ get_func(amin, k + inc); \
+ get_func(amax, k + inc + len); \
+ k = rt_PAGE_NEXT_KEY(k, k_len, nod_flag); \
+ for (; k < last; k = rt_PAGE_NEXT_KEY(k, k_len, nod_flag)) \
+{ \
+ get_func(bmin, k + inc); \
+ get_func(bmax, k + inc + len); \
+ if (amin > bmin) \
+ amin = bmin; \
+ if (amax < bmax) \
+ amax = bmax; \
+} \
+ store_func(c, amin); \
+ c += len; \
+ store_func(c, amax); \
+ c += len; \
+ inc += 2 * len; \
+}
+
+/*
+ Calculates key page total MBR = MBR(key1) + MBR(key2) + ...
+*/
+int maria_rtree_page_mbr(MARIA_HA *info, HA_KEYSEG *keyseg, uchar *page_buf,
+ uchar *c, uint key_length)
+{
+ uint inc = 0;
+ uint k_len = key_length;
+ uint nod_flag = _ma_test_if_nod(page_buf);
+ uchar *k;
+ uchar *last = rt_PAGE_END(page_buf);
+
+ for (; (int)key_length > 0; keyseg += 2)
+ {
+ key_length -= keyseg->length * 2;
+
+ /* Handle NULL part */
+ if (keyseg->null_bit)
+ {
+ return 1;
+ }
+
+ k = rt_PAGE_FIRST_KEY(page_buf, nod_flag);
+
+ switch ((enum ha_base_keytype) keyseg->type) {
+ case HA_KEYTYPE_INT8:
+ RT_PAGE_MBR_KORR(int8, mi_sint1korr, mi_int1store, 1);
+ break;
+ case HA_KEYTYPE_BINARY:
+ RT_PAGE_MBR_KORR(uint8, mi_uint1korr, mi_int1store, 1);
+ break;
+ case HA_KEYTYPE_SHORT_INT:
+ RT_PAGE_MBR_KORR(int16, mi_sint2korr, mi_int2store, 2);
+ break;
+ case HA_KEYTYPE_USHORT_INT:
+ RT_PAGE_MBR_KORR(uint16, mi_uint2korr, mi_int2store, 2);
+ break;
+ case HA_KEYTYPE_INT24:
+ RT_PAGE_MBR_KORR(int32, mi_sint3korr, mi_int3store, 3);
+ break;
+ case HA_KEYTYPE_UINT24:
+ RT_PAGE_MBR_KORR(uint32, mi_uint3korr, mi_int3store, 3);
+ break;
+ case HA_KEYTYPE_LONG_INT:
+ RT_PAGE_MBR_KORR(int32, mi_sint4korr, mi_int4store, 4);
+ break;
+ case HA_KEYTYPE_ULONG_INT:
+ RT_PAGE_MBR_KORR(uint32, mi_uint4korr, mi_int4store, 4);
+ break;
+#ifdef HAVE_LONG_LONG
+ case HA_KEYTYPE_LONGLONG:
+ RT_PAGE_MBR_KORR(longlong, mi_sint8korr, mi_int8store, 8);
+ break;
+ case HA_KEYTYPE_ULONGLONG:
+ RT_PAGE_MBR_KORR(ulonglong, mi_uint8korr, mi_int8store, 8);
+ break;
+#endif
+ case HA_KEYTYPE_FLOAT:
+ RT_PAGE_MBR_GET(float, mi_float4get, mi_float4store, 4);
+ break;
+ case HA_KEYTYPE_DOUBLE:
+ RT_PAGE_MBR_GET(double, mi_float8get, mi_float8store, 8);
+ break;
+ case HA_KEYTYPE_END:
+ return 0;
+ default:
+ return 1;
+ }
+ }
+ return 0;
+}
+
+#endif /*HAVE_RTREE_KEYS*/
diff --git a/storage/maria/ma_rt_mbr.h b/storage/maria/ma_rt_mbr.h
new file mode 100644
index 00000000000..ad855518e62
--- /dev/null
+++ b/storage/maria/ma_rt_mbr.h
@@ -0,0 +1,38 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+ & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _rt_mbr_h
+#define _rt_mbr_h
+
+#ifdef HAVE_RTREE_KEYS
+
+int maria_rtree_key_cmp(HA_KEYSEG *keyseg, uchar *a, uchar *b, uint key_length,
+ uint nextflag);
+int maria_rtree_combine_rect(HA_KEYSEG *keyseg,uchar *, uchar *, uchar*,
+ uint key_length);
+double maria_rtree_rect_volume(HA_KEYSEG *keyseg, uchar*, uint key_length);
+int maria_rtree_d_mbr(HA_KEYSEG *keyseg, uchar *a, uint key_length,
+ double *res);
+double maria_rtree_overlapping_area(HA_KEYSEG *keyseg, uchar *a, uchar *b,
+ uint key_length);
+double maria_rtree_area_increase(HA_KEYSEG *keyseg, uchar *a, uchar *b,
+ uint key_length, double *ab_area);
+double maria_rtree_perimeter_increase(HA_KEYSEG *keyseg, uchar* a, uchar* b,
+ uint key_length, double *ab_perim);
+int maria_rtree_page_mbr(MARIA_HA *info, HA_KEYSEG *keyseg, uchar *page_buf,
+ uchar* c, uint key_length);
+#endif /*HAVE_RTREE_KEYS*/
+#endif /* _rt_mbr_h */
diff --git a/storage/maria/ma_rt_split.c b/storage/maria/ma_rt_split.c
new file mode 100644
index 00000000000..a91eaa47bea
--- /dev/null
+++ b/storage/maria/ma_rt_split.c
@@ -0,0 +1,362 @@
+/* Copyright (C) 2006 MySQL AB & Alexey Botchkov & MySQL Finland AB
+ & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+
+#ifdef HAVE_RTREE_KEYS
+
+#include "ma_rt_index.h"
+#include "ma_rt_key.h"
+#include "ma_rt_mbr.h"
+
+typedef struct
+{
+ double square;
+ int n_node;
+ uchar *key;
+ double *coords;
+} SplitStruct;
+
+inline static double *reserve_coords(double **d_buffer, int n_dim)
+{
+ double *coords = *d_buffer;
+ (*d_buffer) += n_dim * 2;
+ return coords;
+}
+
+static void mbr_join(double *a, const double *b, int n_dim)
+{
+ double *end = a + n_dim * 2;
+ do
+ {
+ if (a[0] > b[0])
+ a[0] = b[0];
+
+ if (a[1] < b[1])
+ a[1] = b[1];
+
+ a += 2;
+ b += 2;
+ }while (a != end);
+}
+
+/*
+Counts the square of mbr which is a join of a and b
+*/
+static double mbr_join_square(const double *a, const double *b, int n_dim)
+{
+ const double *end = a + n_dim * 2;
+ double square = 1.0;
+ do
+ {
+ square *=
+ ((a[1] < b[1]) ? b[1] : a[1]) - ((a[0] > b[0]) ? b[0] : a[0]);
+
+ a += 2;
+ b += 2;
+ }while (a != end);
+
+ return square;
+}
+
+static double count_square(const double *a, int n_dim)
+{
+ const double *end = a + n_dim * 2;
+ double square = 1.0;
+ do
+ {
+ square *= a[1] - a[0];
+ a += 2;
+ }while (a != end);
+ return square;
+}
+
+inline static void copy_coords(double *dst, const double *src, int n_dim)
+{
+ memcpy(dst, src, sizeof(double) * (n_dim * 2));
+}
+
+/*
+Select two nodes to collect group upon
+*/
+static void pick_seeds(SplitStruct *node, int n_entries,
+ SplitStruct **seed_a, SplitStruct **seed_b, int n_dim)
+{
+ SplitStruct *cur1;
+ SplitStruct *lim1 = node + (n_entries - 1);
+ SplitStruct *cur2;
+ SplitStruct *lim2 = node + n_entries;
+
+ double max_d = -DBL_MAX;
+ double d;
+
+ for (cur1 = node; cur1 < lim1; ++cur1)
+ {
+ for (cur2=cur1 + 1; cur2 < lim2; ++cur2)
+ {
+
+ d = mbr_join_square(cur1->coords, cur2->coords, n_dim) - cur1->square -
+ cur2->square;
+ if (d > max_d)
+ {
+ max_d = d;
+ *seed_a = cur1;
+ *seed_b = cur2;
+ }
+ }
+ }
+}
+
+/*
+Select next node and group where to add
+*/
+static void pick_next(SplitStruct *node, int n_entries, double *g1, double *g2,
+ SplitStruct **choice, int *n_group, int n_dim)
+{
+ SplitStruct *cur = node;
+ SplitStruct *end = node + n_entries;
+
+ double max_diff = -DBL_MAX;
+
+ for (; cur<end; ++cur)
+ {
+ double diff;
+ double abs_diff;
+
+ if (cur->n_node)
+ {
+ continue;
+ }
+
+ diff = mbr_join_square(g1, cur->coords, n_dim) -
+ mbr_join_square(g2, cur->coords, n_dim);
+
+ abs_diff = fabs(diff);
+ if (abs_diff > max_diff)
+ {
+ max_diff = abs_diff;
+ *n_group = 1 + (diff > 0);
+ *choice = cur;
+ }
+ }
+}
+
+/*
+Mark not-in-group entries as n_group
+*/
+static void mark_all_entries(SplitStruct *node, int n_entries, int n_group)
+{
+ SplitStruct *cur = node;
+ SplitStruct *end = node + n_entries;
+ for (; cur<end; ++cur)
+ {
+ if (cur->n_node)
+ {
+ continue;
+ }
+ cur->n_node = n_group;
+ }
+}
+
+static int split_maria_rtree_node(SplitStruct *node, int n_entries,
+ int all_size, /* Total key's size */
+ int key_size,
+ int min_size, /* Minimal group size */
+ int size1, int size2 /* initial group sizes */,
+ double **d_buffer, int n_dim)
+{
+ SplitStruct *cur;
+ SplitStruct *a;
+ SplitStruct *b;
+ double *g1 = reserve_coords(d_buffer, n_dim);
+ double *g2 = reserve_coords(d_buffer, n_dim);
+ SplitStruct *next;
+ int next_node;
+ int i;
+ SplitStruct *end = node + n_entries;
+ LINT_INIT(a);
+ LINT_INIT(b);
+ LINT_INIT(next);
+ LINT_INIT(next_node);
+
+ if (all_size < min_size * 2)
+ {
+ return 1;
+ }
+
+ cur = node;
+ for (; cur<end; ++cur)
+ {
+ cur->square = count_square(cur->coords, n_dim);
+ cur->n_node = 0;
+ }
+
+ pick_seeds(node, n_entries, &a, &b, n_dim);
+ a->n_node = 1;
+ b->n_node = 2;
+
+
+ copy_coords(g1, a->coords, n_dim);
+ size1 += key_size;
+ copy_coords(g2, b->coords, n_dim);
+ size2 += key_size;
+
+
+ for (i=n_entries - 2; i>0; --i)
+ {
+ if (all_size - (size2 + key_size) < min_size) /* Can't write into group 2 */
+ {
+ mark_all_entries(node, n_entries, 1);
+ break;
+ }
+
+ if (all_size - (size1 + key_size) < min_size) /* Can't write into group 1 */
+ {
+ mark_all_entries(node, n_entries, 2);
+ break;
+ }
+
+ pick_next(node, n_entries, g1, g2, &next, &next_node, n_dim);
+ if (next_node == 1)
+ {
+ size1 += key_size;
+ mbr_join(g1, next->coords, n_dim);
+ }
+ else
+ {
+ size2 += key_size;
+ mbr_join(g2, next->coords, n_dim);
+ }
+ next->n_node = next_node;
+ }
+
+ return 0;
+}
+
+int maria_rtree_split_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *page, uchar *key,
+ uint key_length, my_off_t *new_page_offs)
+{
+ int n1, n2; /* Number of items in groups */
+
+ SplitStruct *task;
+ SplitStruct *cur;
+ SplitStruct *stop;
+ double *coord_buf;
+ double *next_coord;
+ double *old_coord;
+ int n_dim;
+ uchar *source_cur, *cur1, *cur2;
+ uchar *new_page;
+ int err_code= 0;
+ uint nod_flag= _ma_test_if_nod(page);
+ uint full_length= key_length + (nod_flag ? nod_flag :
+ info->s->base.rec_reflength);
+ int max_keys= (maria_data_on_page(page)-2) / (full_length);
+ DBUG_ENTER("maria_rtree_split_page");
+ DBUG_PRINT("rtree", ("splitting block"));
+
+ n_dim = keyinfo->keysegs / 2;
+
+ if (!(coord_buf= (double*) my_alloca(n_dim * 2 * sizeof(double) *
+ (max_keys + 1 + 4) +
+ sizeof(SplitStruct) * (max_keys + 1))))
+ DBUG_RETURN(-1); /* purecov: inspected */
+
+ task= (SplitStruct *)(coord_buf + n_dim * 2 * (max_keys + 1 + 4));
+
+ next_coord = coord_buf;
+
+ stop = task + max_keys;
+ source_cur = rt_PAGE_FIRST_KEY(page, nod_flag);
+
+ for (cur = task; cur < stop; ++cur, source_cur = rt_PAGE_NEXT_KEY(source_cur,
+ key_length, nod_flag))
+ {
+ cur->coords = reserve_coords(&next_coord, n_dim);
+ cur->key = source_cur;
+ maria_rtree_d_mbr(keyinfo->seg, source_cur, key_length, cur->coords);
+ }
+
+ cur->coords = reserve_coords(&next_coord, n_dim);
+ maria_rtree_d_mbr(keyinfo->seg, key, key_length, cur->coords);
+ cur->key = key;
+
+ old_coord = next_coord;
+
+ if (split_maria_rtree_node(task, max_keys + 1,
+ maria_data_on_page(page) + full_length + 2, full_length,
+ rt_PAGE_MIN_SIZE(keyinfo->block_length),
+ 2, 2, &next_coord, n_dim))
+ {
+ err_code = 1;
+ goto split_err;
+ }
+
+ if (!(new_page = (uchar*) my_alloca((uint)keyinfo->block_length)))
+ {
+ err_code= -1;
+ goto split_err;
+ }
+
+ stop = task + (max_keys + 1);
+ cur1 = rt_PAGE_FIRST_KEY(page, nod_flag);
+ cur2 = rt_PAGE_FIRST_KEY(new_page, nod_flag);
+
+ n1= n2 = 0;
+ for (cur = task; cur < stop; ++cur)
+ {
+ uchar *to;
+ if (cur->n_node == 1)
+ {
+ to = cur1;
+ cur1 = rt_PAGE_NEXT_KEY(cur1, key_length, nod_flag);
+ ++n1;
+ }
+ else
+ {
+ to = cur2;
+ cur2 = rt_PAGE_NEXT_KEY(cur2, key_length, nod_flag);
+ ++n2;
+ }
+ if (to != cur->key)
+ memcpy(to - nod_flag, cur->key - nod_flag, full_length);
+ }
+
+ maria_putint(page, 2 + n1 * full_length, nod_flag);
+ maria_putint(new_page, 2 + n2 * full_length, nod_flag);
+
+ if ((*new_page_offs= _ma_new(info, keyinfo, DFLT_INIT_HITS)) ==
+ HA_OFFSET_ERROR)
+ err_code= -1;
+ else
+ err_code= _ma_write_keypage(info, keyinfo, *new_page_offs,
+ DFLT_INIT_HITS, new_page);
+ DBUG_PRINT("rtree", ("split new block: %lu", (ulong) *new_page_offs));
+
+ my_afree((uchar*)new_page);
+
+split_err:
+ /**
+ @todo the cast below is useless (coord_buf is uchar*); at the moment we
+ changed all "byte" to "uchar", some casts became useless and should be
+ removed.
+ */
+ my_afree((uchar*) coord_buf);
+ DBUG_RETURN(err_code);
+}
+
+#endif /*HAVE_RTREE_KEYS*/
diff --git a/storage/maria/ma_rt_test.c b/storage/maria/ma_rt_test.c
new file mode 100644
index 00000000000..4360e81c550
--- /dev/null
+++ b/storage/maria/ma_rt_test.c
@@ -0,0 +1,473 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Testing of the basic functions of a MARIA rtree table */
+/* Written by Alex Barkov who has a shared copyright to this code */
+
+
+#include "maria.h"
+
+#ifdef HAVE_RTREE_KEYS
+
+#include "ma_rt_index.h"
+
+#define MAX_REC_LENGTH 1024
+#define ndims 2
+#define KEYALG HA_KEY_ALG_RTREE
+
+static int read_with_pos(MARIA_HA * file, int silent);
+static void create_record(char *record,uint rownr);
+static void create_record1(char *record,uint rownr);
+static void print_record(char * record,my_off_t offs,const char * tail);
+static int run_test(const char *filename);
+
+static double rt_data[]=
+{
+ /*1*/ 0,10,0,10,
+ /*2*/ 5,15,0,10,
+ /*3*/ 0,10,5,15,
+ /*4*/ 10,20,10,20,
+ /*5*/ 0,10,0,10,
+ /*6*/ 5,15,0,10,
+ /*7*/ 0,10,5,15,
+ /*8*/ 10,20,10,20,
+ /*9*/ 0,10,0,10,
+ /*10*/ 5,15,0,10,
+ /*11*/ 0,10,5,15,
+ /*12*/ 10,20,10,20,
+ /*13*/ 0,10,0,10,
+ /*14*/ 5,15,0,10,
+ /*15*/ 0,10,5,15,
+ /*16*/ 10,20,10,20,
+ /*17*/ 5,15,0,10,
+ /*18*/ 0,10,5,15,
+ /*19*/ 10,20,10,20,
+ /*20*/ 0,10,0,10,
+
+ /*1*/ 100,110,0,10,
+ /*2*/ 105,115,0,10,
+ /*3*/ 100,110,5,15,
+ /*4*/ 110,120,10,20,
+ /*5*/ 100,110,0,10,
+ /*6*/ 105,115,0,10,
+ /*7*/ 100,110,5,15,
+ /*8*/ 110,120,10,20,
+ /*9*/ 100,110,0,10,
+ /*10*/ 105,115,0,10,
+ /*11*/ 100,110,5,15,
+ /*12*/ 110,120,10,20,
+ /*13*/ 100,110,0,10,
+ /*14*/ 105,115,0,10,
+ /*15*/ 100,110,5,15,
+ /*16*/ 110,120,10,20,
+ /*17*/ 105,115,0,10,
+ /*18*/ 100,110,5,15,
+ /*19*/ 110,120,10,20,
+ /*20*/ 100,110,0,10,
+ -1
+};
+
+int main(int argc __attribute__((unused)),char *argv[] __attribute__((unused)))
+{
+ MY_INIT(argv[0]);
+ maria_init();
+ exit(run_test("rt_test"));
+}
+
+
+static int run_test(const char *filename)
+{
+ MARIA_HA *file;
+ MARIA_UNIQUEDEF uniquedef;
+ MARIA_CREATE_INFO create_info;
+ MARIA_COLUMNDEF recinfo[20];
+ MARIA_KEYDEF keyinfo[20];
+ HA_KEYSEG keyseg[20];
+ key_range range;
+
+ int silent=0;
+ int opt_unique=0;
+ int create_flag=0;
+ int key_type=HA_KEYTYPE_DOUBLE;
+ int key_length=8;
+ int null_fields=0;
+ int nrecords=sizeof(rt_data)/(sizeof(double)*4);/* 3000;*/
+ int rec_length=0;
+ int uniques=0;
+ int i;
+ int error;
+ int row_count=0;
+ char record[MAX_REC_LENGTH];
+ char read_record[MAX_REC_LENGTH];
+ int upd= 10;
+ ha_rows hrows;
+
+ /* Define a column for NULLs and DEL markers*/
+
+ recinfo[0].type=FIELD_NORMAL;
+ recinfo[0].length=1; /* For NULL bits */
+ rec_length=1;
+
+ /* Define 2*ndims columns for coordinates*/
+
+ for (i=1; i<=2*ndims ;i++){
+ recinfo[i].type=FIELD_NORMAL;
+ recinfo[i].length=key_length;
+ rec_length+=key_length;
+ }
+
+ /* Define a key with 2*ndims segments */
+
+ keyinfo[0].seg=keyseg;
+ keyinfo[0].keysegs=2*ndims;
+ keyinfo[0].flag=0;
+ keyinfo[0].key_alg=KEYALG;
+
+ for (i=0; i<2*ndims; i++){
+ keyinfo[0].seg[i].type= key_type;
+ keyinfo[0].seg[i].flag=0; /* Things like HA_REVERSE_SORT */
+ keyinfo[0].seg[i].start= (key_length*i)+1;
+ keyinfo[0].seg[i].length=key_length;
+ keyinfo[0].seg[i].null_bit= null_fields ? 2 : 0;
+ keyinfo[0].seg[i].null_pos=0;
+ keyinfo[0].seg[i].language=default_charset_info->number;
+ }
+
+ if (!silent)
+ printf("- Creating isam-file\n");
+
+ bzero((char*) &create_info,sizeof(create_info));
+ create_info.max_rows=10000000;
+
+ if (maria_create(filename,
+ DYNAMIC_RECORD,
+ 1, /* keys */
+ keyinfo,
+ 1+2*ndims+opt_unique, /* columns */
+ recinfo,uniques,&uniquedef,&create_info,create_flag))
+ goto err;
+
+ if (!silent)
+ printf("- Open isam-file\n");
+
+ if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+ goto err;
+
+ if (!silent)
+ printf("- Writing key:s\n");
+
+ for (i=0; i<nrecords; i++ )
+ {
+ create_record(record,i);
+ error=maria_write(file,record);
+ print_record(record,maria_position(file),"\n");
+ if (!error)
+ {
+ row_count++;
+ }
+ else
+ {
+ printf("maria_write: %d\n", error);
+ goto err;
+ }
+ }
+
+ if ((error=read_with_pos(file,silent)))
+ goto err;
+
+ if (!silent)
+ printf("- Reading rows with key\n");
+
+ for (i=0 ; i < nrecords ; i++)
+ {
+ my_errno=0;
+ create_record(record,i);
+
+ bzero((char*) read_record,MAX_REC_LENGTH);
+ error=maria_rkey(file,read_record,0,record+1,0,HA_READ_MBR_EQUAL);
+
+ if (error && error!=HA_ERR_KEY_NOT_FOUND)
+ {
+ printf(" maria_rkey: %3d errno: %3d\n",error,my_errno);
+ goto err;
+ }
+ if (error == HA_ERR_KEY_NOT_FOUND)
+ {
+ print_record(record,maria_position(file)," NOT FOUND\n");
+ continue;
+ }
+ print_record(read_record,maria_position(file),"\n");
+ }
+
+ if (!silent)
+ printf("- Deleting rows\n");
+ for (i=0; i < nrecords/4; i++)
+ {
+ my_errno=0;
+ bzero((char*) read_record,MAX_REC_LENGTH);
+ error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR);
+ if (error)
+ {
+ printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno);
+ goto err;
+ }
+ print_record(read_record,maria_position(file),"\n");
+
+ error=maria_delete(file,read_record);
+ if (error)
+ {
+ printf("pos: %2d maria_delete: %3d errno: %3d\n",i,error,my_errno);
+ goto err;
+ }
+ }
+
+ if (!silent)
+ printf("- Updating rows with position\n");
+ for (i=0; i < (nrecords - nrecords/4) ; i++)
+ {
+ my_errno=0;
+ bzero((char*) read_record,MAX_REC_LENGTH);
+ error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR);
+ if (error)
+ {
+ if (error==HA_ERR_RECORD_DELETED)
+ continue;
+ printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno);
+ goto err;
+ }
+ print_record(read_record,maria_position(file),"");
+ create_record(record,i+nrecords*upd);
+ printf("\t-> ");
+ print_record(record,maria_position(file),"\n");
+ error=maria_update(file,read_record,record);
+ if (error)
+ {
+ printf("pos: %2d maria_update: %3d errno: %3d\n",i,error,my_errno);
+ goto err;
+ }
+ }
+
+ if ((error=read_with_pos(file,silent)))
+ goto err;
+
+ if (!silent)
+ printf("- Test maria_rkey then a sequence of maria_rnext_same\n");
+
+ create_record(record, nrecords*4/5);
+ print_record(record,0," search for\n");
+
+ if ((error=maria_rkey(file,read_record,0,record+1,0,HA_READ_MBR_INTERSECT)))
+ {
+ printf("maria_rkey: %3d errno: %3d\n",error,my_errno);
+ goto err;
+ }
+ print_record(read_record,maria_position(file)," maria_rkey\n");
+ row_count=1;
+
+ for (;;)
+ {
+ if ((error=maria_rnext_same(file,read_record)))
+ {
+ if (error==HA_ERR_END_OF_FILE)
+ break;
+ printf("maria_next: %3d errno: %3d\n",error,my_errno);
+ goto err;
+ }
+ print_record(read_record,maria_position(file)," maria_rnext_same\n");
+ row_count++;
+ }
+ printf(" %d rows\n",row_count);
+
+ if (!silent)
+ printf("- Test maria_rfirst then a sequence of maria_rnext\n");
+
+ error=maria_rfirst(file,read_record,0);
+ if (error)
+ {
+ printf("maria_rfirst: %3d errno: %3d\n",error,my_errno);
+ goto err;
+ }
+ row_count=1;
+ print_record(read_record,maria_position(file)," maria_frirst\n");
+
+ for (i=0;i<nrecords;i++)
+ {
+ if ((error=maria_rnext(file,read_record,0)))
+ {
+ if (error==HA_ERR_END_OF_FILE)
+ break;
+ printf("maria_next: %3d errno: %3d\n",error,my_errno);
+ goto err;
+ }
+ print_record(read_record,maria_position(file)," maria_rnext\n");
+ row_count++;
+ }
+ printf(" %d rows\n",row_count);
+
+ if (!silent)
+ printf("- Test maria_records_in_range()\n");
+
+ create_record1(record, nrecords*4/5);
+ print_record(record,0,"\n");
+
+ range.key= record+1;
+ range.length= 1000; /* Big enough */
+ range.flag= HA_READ_MBR_INTERSECT;
+ hrows= maria_records_in_range(file,0, &range, (key_range*) 0);
+ printf(" %ld rows\n", (long) hrows);
+
+ if (maria_close(file)) goto err;
+ maria_end();
+ my_end(MY_CHECK_ERROR);
+
+ return 0;
+
+err:
+ printf("got error: %3d when using maria-database\n",my_errno);
+ return 1; /* skip warning */
+}
+
+
+
+static int read_with_pos (MARIA_HA * file,int silent)
+{
+ int error;
+ int i;
+ char read_record[MAX_REC_LENGTH];
+
+ if (!silent)
+ printf("- Reading rows with position\n");
+ for (i=0;;i++)
+ {
+ my_errno=0;
+ bzero((char*) read_record,MAX_REC_LENGTH);
+ error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR);
+ if (error)
+ {
+ if (error==HA_ERR_END_OF_FILE)
+ break;
+ if (error==HA_ERR_RECORD_DELETED)
+ continue;
+ printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno);
+ return error;
+ }
+ print_record(read_record,maria_position(file),"\n");
+ }
+ return 0;
+}
+
+
+#ifdef NOT_USED
+static void bprint_record(char * record,
+ my_off_t offs __attribute__((unused)),
+ const char * tail)
+{
+ int i;
+ char * pos;
+ i=(unsigned char)record[0];
+ printf("%02X ",i);
+
+ for( pos=record+1, i=0; i<32; i++,pos++){
+ int b=(unsigned char)*pos;
+ printf("%02X",b);
+ }
+ printf("%s",tail);
+}
+#endif
+
+
+static void print_record(char * record,
+ my_off_t offs __attribute__((unused)),
+ const char * tail)
+{
+ int i;
+ char * pos;
+ double c;
+
+ printf(" rec=(%d)",(unsigned char)record[0]);
+ for ( pos=record+1, i=0; i<2*ndims; i++)
+ {
+ memcpy(&c,pos,sizeof(c));
+ float8get(c,pos);
+ printf(" %.14g ",c);
+ pos+=sizeof(c);
+ }
+ printf("pos=%ld",(long int)offs);
+ printf("%s",tail);
+}
+
+
+
+static void create_record1(char *record,uint rownr)
+{
+ int i;
+ char * pos;
+ double c=rownr+10;
+
+ bzero((char*) record,MAX_REC_LENGTH);
+ record[0]=0x01; /* DEL marker */
+
+ for ( pos=record+1, i=0; i<2*ndims; i++)
+ {
+ memcpy(pos,&c,sizeof(c));
+ float8store(pos,c);
+ pos+=sizeof(c);
+ }
+}
+
+#ifdef NOT_USED
+
+static void create_record0(char *record,uint rownr)
+{
+ int i;
+ char * pos;
+ double c=rownr+10;
+ double c0=0;
+
+ bzero((char*) record,MAX_REC_LENGTH);
+ record[0]=0x01; /* DEL marker */
+
+ for ( pos=record+1, i=0; i<ndims; i++)
+ {
+ memcpy(pos,&c0,sizeof(c0));
+ float8store(pos,c0);
+ pos+=sizeof(c0);
+ memcpy(pos,&c,sizeof(c));
+ float8store(pos,c);
+ pos+=sizeof(c);
+ }
+}
+
+#endif
+
+static void create_record(char *record,uint rownr)
+{
+ int i;
+ char *pos;
+ double *data= rt_data+rownr*4;
+ record[0]=0x01; /* DEL marker */
+ for ( pos=record+1, i=0; i<ndims*2; i++)
+ {
+ float8store(pos,data[i]);
+ pos+=8;
+ }
+}
+
+#else
+int main(int argc __attribute__((unused)),char *argv[] __attribute__((unused)))
+{
+ exit(0);
+}
+#endif /*HAVE_RTREE_KEYS*/
diff --git a/storage/maria/ma_scan.c b/storage/maria/ma_scan.c
new file mode 100644
index 00000000000..f9657833fdd
--- /dev/null
+++ b/storage/maria/ma_scan.c
@@ -0,0 +1,60 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Read through all rows sequntially */
+
+#include "maria_def.h"
+
+int maria_scan_init(register MARIA_HA *info)
+{
+ DBUG_ENTER("maria_scan_init");
+
+ info->cur_row.nextpos= info->s->pack.header_length; /* Read first record */
+ info->lastinx= -1; /* Can't forward or backward */
+ if (info->opt_flag & WRITE_CACHE_USED && flush_io_cache(&info->rec_cache))
+ DBUG_RETURN(my_errno);
+
+ if ((*info->s->scan_init)(info))
+ DBUG_RETURN(my_errno);
+ DBUG_RETURN(0);
+}
+
+/*
+ Read a row based on position.
+
+ SYNOPSIS
+ maria_scan()
+ info Maria handler
+ record Read data here
+
+ RETURN
+ 0 ok
+ HA_ERR_END_OF_FILE End of file
+ # Error code
+*/
+
+int maria_scan(MARIA_HA *info, uchar *record)
+{
+ DBUG_ENTER("maria_scan");
+ /* Init all but update-flag */
+ info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+ DBUG_RETURN((*info->s->scan)(info, record, info->cur_row.nextpos, 1));
+}
+
+
+void maria_scan_end(MARIA_HA *info)
+{
+ (*info->s->scan_end)(info);
+}
diff --git a/storage/maria/ma_search.c b/storage/maria/ma_search.c
new file mode 100644
index 00000000000..8cb3e56e646
--- /dev/null
+++ b/storage/maria/ma_search.c
@@ -0,0 +1,1934 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* key handling functions */
+
+#include "ma_fulltext.h"
+#include "m_ctype.h"
+
+static my_bool _ma_get_prev_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *page,
+ uchar *key, uchar *keypos,
+ uint *return_key_length);
+
+ /* Check index */
+
+int _ma_check_index(MARIA_HA *info, int inx)
+{
+ if (inx == -1) /* Use last index */
+ inx=info->lastinx;
+ if (inx < 0 || ! maria_is_key_active(info->s->state.key_map, inx))
+ {
+ my_errno=HA_ERR_WRONG_INDEX;
+ return -1;
+ }
+ if (info->lastinx != inx) /* Index changed */
+ {
+ info->lastinx = inx;
+ info->page_changed=1;
+ info->update= ((info->update & (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED)) |
+ HA_STATE_NEXT_FOUND | HA_STATE_PREV_FOUND);
+ }
+ if (info->opt_flag & WRITE_CACHE_USED && flush_io_cache(&info->rec_cache))
+ return(-1);
+ return(inx);
+} /* _ma_check_index */
+
+
+ /*
+ ** Search after row by a key
+ ** Position to row is stored in info->lastpos
+ ** Return: -1 if not found
+ ** 1 if one should continue search on higher level
+ */
+
+int _ma_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo,
+ uchar *key, uint key_len, uint nextflag, register my_off_t pos)
+{
+ my_bool last_key;
+ int error,flag;
+ uint nod_flag;
+ uchar *keypos,*maxpos;
+ uchar lastkey[HA_MAX_KEY_BUFF],*buff;
+ DBUG_ENTER("_ma_search");
+ DBUG_PRINT("enter",("pos: %lu nextflag: %u lastpos: %lu",
+ (ulong) pos, nextflag, (ulong) info->cur_row.lastpos));
+ DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE,keyinfo->seg,key,key_len););
+
+ if (pos == HA_OFFSET_ERROR)
+ {
+ my_errno=HA_ERR_KEY_NOT_FOUND; /* Didn't find key */
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ if (!(nextflag & (SEARCH_SMALLER | SEARCH_BIGGER | SEARCH_LAST)))
+ DBUG_RETURN(-1); /* Not found ; return error */
+ DBUG_RETURN(1); /* Search at upper levels */
+ }
+
+ if (!(buff= _ma_fetch_keypage(info,keyinfo,pos,DFLT_INIT_HITS,
+ info->keyread_buff,
+ test(!(nextflag & SEARCH_SAVE_BUFF)))))
+ goto err;
+ DBUG_DUMP("page", buff, maria_data_on_page(buff));
+
+ flag=(*keyinfo->bin_search)(info,keyinfo,buff,key,key_len,nextflag,
+ &keypos,lastkey, &last_key);
+ if (flag == MARIA_FOUND_WRONG_KEY)
+ DBUG_RETURN(-1);
+ nod_flag=_ma_test_if_nod(buff);
+ maxpos=buff+maria_data_on_page(buff)-1;
+
+ if (flag)
+ {
+ if ((error= _ma_search(info,keyinfo,key,key_len,nextflag,
+ _ma_kpos(nod_flag,keypos))) <= 0)
+ DBUG_RETURN(error);
+
+ if (flag >0)
+ {
+ if (nextflag & (SEARCH_SMALLER | SEARCH_LAST) &&
+ keypos == buff+2+nod_flag)
+ DBUG_RETURN(1); /* Bigger than key */
+ }
+ else if (nextflag & SEARCH_BIGGER && keypos >= maxpos)
+ DBUG_RETURN(1); /* Smaller than key */
+ }
+ else
+ {
+ if ((nextflag & SEARCH_FIND) && nod_flag &&
+ ((keyinfo->flag & (HA_NOSAME | HA_NULL_PART)) != HA_NOSAME ||
+ key_len != USE_WHOLE_KEY))
+ {
+ if ((error= _ma_search(info,keyinfo,key,key_len,SEARCH_FIND,
+ _ma_kpos(nod_flag,keypos))) >= 0 ||
+ my_errno != HA_ERR_KEY_NOT_FOUND)
+ DBUG_RETURN(error);
+ info->last_keypage= HA_OFFSET_ERROR; /* Buffer not in mem */
+ }
+ }
+ if (pos != info->last_keypage)
+ {
+ uchar *old_buff=buff;
+ if (!(buff= _ma_fetch_keypage(info,keyinfo,pos,DFLT_INIT_HITS,
+ info->keyread_buff,
+ test(!(nextflag & SEARCH_SAVE_BUFF)))))
+ goto err;
+ keypos=buff+(keypos-old_buff);
+ maxpos=buff+(maxpos-old_buff);
+ }
+
+ if ((nextflag & (SEARCH_SMALLER | SEARCH_LAST)) && flag != 0)
+ {
+ uint not_used[2];
+ if (_ma_get_prev_key(info,keyinfo, buff, info->lastkey, keypos,
+ &info->lastkey_length))
+ goto err;
+ if (!(nextflag & SEARCH_SMALLER) &&
+ ha_key_cmp(keyinfo->seg, (uchar*) info->lastkey, (uchar*) key, key_len,
+ SEARCH_FIND, not_used))
+ {
+ my_errno=HA_ERR_KEY_NOT_FOUND; /* Didn't find key */
+ goto err;
+ }
+ }
+ else
+ {
+ info->lastkey_length=(*keyinfo->get_key)(keyinfo,nod_flag,&keypos,lastkey);
+ if (!info->lastkey_length)
+ goto err;
+ memcpy(info->lastkey,lastkey,info->lastkey_length);
+ }
+ info->cur_row.lastpos= _ma_dpos(info,0,info->lastkey+info->lastkey_length);
+ /* Save position for a possible read next / previous */
+ info->int_keypos= info->keyread_buff+ (keypos-buff);
+ info->int_maxpos= info->keyread_buff+ (maxpos-buff);
+ info->int_nod_flag=nod_flag;
+ info->int_keytree_version=keyinfo->version;
+ info->last_search_keypage=info->last_keypage;
+ info->page_changed=0;
+ info->keyread_buff_used= (info->keyread_buff != buff); /* If we have to reread */
+
+ DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos));
+ DBUG_RETURN(0);
+
+err:
+ DBUG_PRINT("exit",("Error: %d",my_errno));
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ info->page_changed=1;
+ DBUG_RETURN (-1);
+} /* _ma_search */
+
+
+ /* Search after key in page-block */
+ /* If packed key puts smaller or identical key in buff */
+ /* ret_pos point to where find or bigger key starts */
+ /* ARGSUSED */
+
+int _ma_bin_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar *page,
+ uchar *key, uint key_len, uint comp_flag, uchar **ret_pos,
+ uchar *buff __attribute__((unused)), my_bool *last_key)
+{
+ reg4 int start,mid,end,save_end;
+ int flag;
+ uint totlength,nod_flag,not_used[2];
+ DBUG_ENTER("_ma_bin_search");
+
+ LINT_INIT(flag);
+ totlength=keyinfo->keylength+(nod_flag=_ma_test_if_nod(page));
+ start=0; mid=1;
+ save_end=end=(int) ((maria_data_on_page(page)-2-nod_flag)/totlength-1);
+ DBUG_PRINT("test",("page_length: %d end: %d",maria_data_on_page(page),end));
+ page+=2+nod_flag;
+
+ while (start != end)
+ {
+ mid= (start+end)/2;
+ if ((flag=ha_key_cmp(keyinfo->seg,(uchar*) page+(uint) mid*totlength,
+ (uchar*) key, key_len, comp_flag, not_used))
+ >= 0)
+ end=mid;
+ else
+ start=mid+1;
+ }
+ if (mid != start)
+ flag=ha_key_cmp(keyinfo->seg, (uchar*) page+(uint) start*totlength,
+ (uchar*) key, key_len, comp_flag, not_used);
+ if (flag < 0)
+ start++; /* point at next, bigger key */
+ *ret_pos=page+(uint) start*totlength;
+ *last_key= end == save_end;
+ DBUG_PRINT("exit",("flag: %d keypos: %d",flag,start));
+ DBUG_RETURN(flag);
+} /* _ma_bin_search */
+
+
+/*
+ Locate a packed key in a key page.
+
+ SYNOPSIS
+ _ma_seq_search()
+ info Open table information.
+ keyinfo Key definition information.
+ page Key page (beginning).
+ key Search key.
+ key_len Length to use from search key or USE_WHOLE_KEY
+ comp_flag Search flags like SEARCH_SAME etc.
+ ret_pos RETURN Position in key page behind this key.
+ buff RETURN Copy of previous or identical unpacked key.
+ last_key RETURN If key is last in page.
+
+ DESCRIPTION
+ Used instead of _ma_bin_search() when key is packed.
+ Puts smaller or identical key in buff.
+ Key is searched sequentially.
+
+ RETURN
+ > 0 Key in 'buff' is smaller than search key.
+ 0 Key in 'buff' is identical to search key.
+ < 0 Not found.
+*/
+
+int _ma_seq_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar *page,
+ uchar *key, uint key_len, uint comp_flag, uchar **ret_pos,
+ uchar *buff, my_bool *last_key)
+{
+ int flag;
+ uint nod_flag,length,not_used[2];
+ uchar t_buff[HA_MAX_KEY_BUFF],*end;
+ DBUG_ENTER("_ma_seq_search");
+
+ LINT_INIT(flag); LINT_INIT(length);
+ end= page+maria_data_on_page(page);
+ nod_flag=_ma_test_if_nod(page);
+ page+=2+nod_flag;
+ *ret_pos=page;
+ t_buff[0]=0; /* Avoid bugs */
+ while (page < end)
+ {
+ length=(*keyinfo->get_key)(keyinfo,nod_flag,&page,t_buff);
+ if (length == 0 || page > end)
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ DBUG_PRINT("error",
+ ("Found wrong key: length: %u page: 0x%lx end: 0x%lx",
+ length, (long) page, (long) end));
+ DBUG_RETURN(MARIA_FOUND_WRONG_KEY);
+ }
+ if ((flag= ha_key_cmp(keyinfo->seg, (uchar*) t_buff,(uchar*) key,
+ key_len,comp_flag, not_used)) >= 0)
+ break;
+#ifdef EXTRA_DEBUG
+ DBUG_PRINT("loop",("page: 0x%lx key: '%s' flag: %d", (long) page, t_buff,
+ flag));
+#endif
+ memcpy(buff,t_buff,length);
+ *ret_pos=page;
+ }
+ if (flag == 0)
+ memcpy(buff,t_buff,length); /* Result is first key */
+ *last_key= page == end;
+ DBUG_PRINT("exit",("flag: %d ret_pos: 0x%lx", flag, (long) *ret_pos));
+ DBUG_RETURN(flag);
+} /* _ma_seq_search */
+
+
+int _ma_prefix_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo,
+ uchar *page, uchar *key, uint key_len, uint nextflag,
+ uchar **ret_pos, uchar *buff, my_bool *last_key)
+{
+ /*
+ my_flag is raw comparison result to be changed according to
+ SEARCH_NO_FIND,SEARCH_LAST and HA_REVERSE_SORT flags.
+ flag is the value returned by ha_key_cmp and as treated as final
+ */
+ int flag=0, my_flag=-1;
+ uint nod_flag, length, len, matched, cmplen, kseg_len;
+ uint prefix_len,suffix_len;
+ int key_len_skip, seg_len_pack, key_len_left;
+ uchar *end;
+ uchar *kseg, *vseg, *saved_vseg, *saved_from;
+ uchar *sort_order= keyinfo->seg->charset->sort_order;
+ uchar tt_buff[HA_MAX_KEY_BUFF+2], *t_buff=tt_buff+2;
+ uchar *saved_to;
+ uint saved_length=0, saved_prefix_len=0;
+ uint length_pack;
+ DBUG_ENTER("_ma_prefix_search");
+
+ LINT_INIT(length);
+ LINT_INIT(prefix_len);
+ LINT_INIT(seg_len_pack);
+ LINT_INIT(saved_from);
+ LINT_INIT(saved_to);
+ LINT_INIT(saved_vseg);
+
+ t_buff[0]=0; /* Avoid bugs */
+ end= page+maria_data_on_page(page);
+ nod_flag=_ma_test_if_nod(page);
+ page+=2+nod_flag;
+ *ret_pos=page;
+ kseg= (uchar*) key;
+
+ get_key_pack_length(kseg_len, length_pack, kseg);
+ key_len_skip=length_pack+kseg_len;
+ key_len_left=(int) key_len- (int) key_len_skip;
+ /* If key_len is 0, then lenght_pack is 1, then key_len_left is -1. */
+ cmplen=(key_len_left>=0) ? kseg_len : key_len-length_pack;
+ DBUG_PRINT("info",("key: '%.*s'",kseg_len,kseg));
+
+ /*
+ Keys are compressed the following way:
+
+ If the max length of first key segment <= 127 bytes the prefix is
+ 1 uchar else it's 2 byte
+
+ (prefix) length The high bit is set if this is a prefix for the prev key.
+ [suffix length] Packed length of suffix if the previous was a prefix.
+ (suffix) data Key data bytes (past the common prefix or whole segment).
+ [next-key-seg] Next key segments (([packed length], data), ...)
+ pointer Reference to the data file (last_keyseg->length).
+ */
+
+ matched=0; /* how many char's from prefix were alredy matched */
+ len=0; /* length of previous key unpacked */
+
+ while (page < end)
+ {
+ uint packed= *page & 128;
+
+ vseg= (uchar*) page;
+ if (keyinfo->seg->length >= 127)
+ {
+ suffix_len=mi_uint2korr(vseg) & 32767;
+ vseg+=2;
+ }
+ else
+ suffix_len= *vseg++ & 127;
+
+ if (packed)
+ {
+ if (suffix_len == 0)
+ {
+ /* == 0x80 or 0x8000, same key, prefix length == old key length. */
+ prefix_len=len;
+ }
+ else
+ {
+ /* > 0x80 or 0x8000, this is prefix lgt, packed suffix lgt follows. */
+ prefix_len=suffix_len;
+ get_key_length(suffix_len,vseg);
+ }
+ }
+ else
+ {
+ /* Not packed. No prefix used from last key. */
+ prefix_len=0;
+ }
+
+ len=prefix_len+suffix_len;
+ seg_len_pack=get_pack_length(len);
+ t_buff=tt_buff+3-seg_len_pack;
+ store_key_length(t_buff,len);
+
+ if (prefix_len > saved_prefix_len)
+ memcpy(t_buff+seg_len_pack+saved_prefix_len,saved_vseg,
+ prefix_len-saved_prefix_len);
+ saved_vseg=vseg;
+ saved_prefix_len=prefix_len;
+
+ DBUG_PRINT("loop",("page: '%.*s%.*s'",prefix_len,t_buff+seg_len_pack,
+ suffix_len,vseg));
+ {
+ uchar *from= vseg+suffix_len;
+ HA_KEYSEG *keyseg;
+ uint l;
+
+ for (keyseg=keyinfo->seg+1 ; keyseg->type ; keyseg++ )
+ {
+
+ if (keyseg->flag & HA_NULL_PART)
+ {
+ if (!(*from++))
+ continue;
+ }
+ if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK))
+ {
+ get_key_length(l,from);
+ }
+ else
+ l=keyseg->length;
+
+ from+=l;
+ }
+ from+= keyseg->length;
+ page= (uchar*) from+nod_flag;
+ length= (uint) (from-vseg);
+ }
+
+ if (page > end)
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ DBUG_PRINT("error",
+ ("Found wrong key: length: %u page: 0x%lx end: %lx",
+ length, (long) page, (long) end));
+ DBUG_RETURN(MARIA_FOUND_WRONG_KEY);
+ }
+
+ if (matched >= prefix_len)
+ {
+ /* We have to compare. But we can still skip part of the key */
+ uint left;
+ uchar *k= kseg+prefix_len;
+
+ /*
+ If prefix_len > cmplen then we are in the end-space comparison
+ phase. Do not try to acces the key any more ==> left= 0.
+ */
+ left= ((len <= cmplen) ? suffix_len :
+ ((prefix_len < cmplen) ? cmplen - prefix_len : 0));
+
+ matched=prefix_len+left;
+
+ if (sort_order)
+ {
+ for (my_flag=0;left;left--)
+ if ((my_flag= (int) sort_order[*vseg++] - (int) sort_order[*k++]))
+ break;
+ }
+ else
+ {
+ for (my_flag=0;left;left--)
+ if ((my_flag= (int) *vseg++ - (int) *k++))
+ break;
+ }
+
+ if (my_flag>0) /* mismatch */
+ break;
+ if (my_flag==0) /* match */
+ {
+ /*
+ ** len cmplen seg_left_len more_segs
+ ** < matched=len; continue search
+ ** > = prefix ? found : (matched=len; continue search)
+ ** > < - ok, found
+ ** = < - ok, found
+ ** = = - ok, found
+ ** = = + next seg
+ */
+ if (len < cmplen)
+ {
+ if ((keyinfo->seg->type != HA_KEYTYPE_TEXT &&
+ keyinfo->seg->type != HA_KEYTYPE_VARTEXT1 &&
+ keyinfo->seg->type != HA_KEYTYPE_VARTEXT2))
+ my_flag= -1;
+ else
+ {
+ /* We have to compare k and vseg as if they were space extended */
+ uchar *k_end= k+ (cmplen - len);
+ for ( ; k < k_end && *k == ' '; k++) ;
+ if (k == k_end)
+ goto cmp_rest; /* should never happen */
+ if ((uchar) *k < (uchar) ' ')
+ {
+ my_flag= 1; /* Compared string is smaller */
+ break;
+ }
+ my_flag= -1; /* Continue searching */
+ }
+ }
+ else if (len > cmplen)
+ {
+ uchar *vseg_end;
+ if ((nextflag & SEARCH_PREFIX) && key_len_left == 0)
+ goto fix_flag;
+
+ /* We have to compare k and vseg as if they were space extended */
+ for (vseg_end= vseg + (len-cmplen) ;
+ vseg < vseg_end && *vseg == (uchar) ' ';
+ vseg++, matched++) ;
+ DBUG_ASSERT(vseg < vseg_end);
+
+ if ((uchar) *vseg > (uchar) ' ')
+ {
+ my_flag= 1; /* Compared string is smaller */
+ break;
+ }
+ my_flag= -1; /* Continue searching */
+ }
+ else
+ {
+ cmp_rest:
+ if (key_len_left>0)
+ {
+ uint not_used[2];
+ if ((flag = ha_key_cmp(keyinfo->seg+1,vseg,
+ k, key_len_left, nextflag, not_used)) >= 0)
+ break;
+ }
+ else
+ {
+ /*
+ at this line flag==-1 if the following lines were already
+ visited and 0 otherwise, i.e. flag <=0 here always !!!
+ */
+ fix_flag:
+ DBUG_ASSERT(flag <= 0);
+ if (nextflag & (SEARCH_NO_FIND | SEARCH_LAST))
+ flag=(nextflag & (SEARCH_BIGGER | SEARCH_LAST)) ? -1 : 1;
+ if (flag>=0)
+ break;
+ }
+ }
+ }
+ matched-=left;
+ }
+ /* else (matched < prefix_len) ---> do nothing. */
+
+ memcpy(buff,t_buff,saved_length=seg_len_pack+prefix_len);
+ saved_to= buff+saved_length;
+ saved_from= saved_vseg;
+ saved_length=length;
+ *ret_pos=page;
+ }
+ if (my_flag)
+ flag=(keyinfo->seg->flag & HA_REVERSE_SORT) ? -my_flag : my_flag;
+ if (flag == 0)
+ {
+ memcpy(buff,t_buff,saved_length=seg_len_pack+prefix_len);
+ saved_to= buff+saved_length;
+ saved_from= saved_vseg;
+ saved_length=length;
+ }
+ if (saved_length)
+ memcpy(saved_to, (uchar*) saved_from, saved_length);
+
+ *last_key= page == end;
+
+ DBUG_PRINT("exit",("flag: %d ret_pos: 0x%lx", flag, (long) *ret_pos));
+ DBUG_RETURN(flag);
+} /* _ma_prefix_search */
+
+
+ /* Get pos to a key_block */
+
+my_off_t _ma_kpos(uint nod_flag, uchar *after_key)
+{
+ after_key-=nod_flag;
+ switch (nod_flag) {
+#if SIZEOF_OFF_T > 4
+ case 7:
+ return mi_uint7korr(after_key)*MARIA_MIN_KEY_BLOCK_LENGTH;
+ case 6:
+ return mi_uint6korr(after_key)*MARIA_MIN_KEY_BLOCK_LENGTH;
+ case 5:
+ return mi_uint5korr(after_key)*MARIA_MIN_KEY_BLOCK_LENGTH;
+#else
+ case 7:
+ after_key++;
+ case 6:
+ after_key++;
+ case 5:
+ after_key++;
+#endif
+ case 4:
+ return ((my_off_t) mi_uint4korr(after_key))*MARIA_MIN_KEY_BLOCK_LENGTH;
+ case 3:
+ return ((my_off_t) mi_uint3korr(after_key))*MARIA_MIN_KEY_BLOCK_LENGTH;
+ case 2:
+ return (my_off_t) (mi_uint2korr(after_key)*MARIA_MIN_KEY_BLOCK_LENGTH);
+ case 1:
+ return (uint) (*after_key)*MARIA_MIN_KEY_BLOCK_LENGTH;
+ case 0: /* At leaf page */
+ default: /* Impossible */
+ return(HA_OFFSET_ERROR);
+ }
+} /* _kpos */
+
+
+ /* Save pos to a key_block */
+
+void _ma_kpointer(register MARIA_HA *info, register uchar *buff, my_off_t pos)
+{
+ pos/=MARIA_MIN_KEY_BLOCK_LENGTH;
+ switch (info->s->base.key_reflength) {
+#if SIZEOF_OFF_T > 4
+ case 7: mi_int7store(buff,pos); break;
+ case 6: mi_int6store(buff,pos); break;
+ case 5: mi_int5store(buff,pos); break;
+#else
+ case 7: *buff++=0;
+ /* fall trough */
+ case 6: *buff++=0;
+ /* fall trough */
+ case 5: *buff++=0;
+ /* fall trough */
+#endif
+ case 4: mi_int4store(buff,pos); break;
+ case 3: mi_int3store(buff,pos); break;
+ case 2: mi_int2store(buff,(uint) pos); break;
+ case 1: buff[0]= (uchar) pos; break;
+ default: abort(); /* impossible */
+ }
+} /* _ma_kpointer */
+
+
+ /* Calc pos to a data-record from a key */
+
+
+my_off_t _ma_dpos(MARIA_HA *info, uint nod_flag, const uchar *after_key)
+{
+ my_off_t pos;
+ after_key-=(nod_flag + info->s->rec_reflength);
+ switch (info->s->rec_reflength) {
+#if SIZEOF_OFF_T > 4
+ case 8: pos= (my_off_t) mi_uint8korr(after_key); break;
+ case 7: pos= (my_off_t) mi_uint7korr(after_key); break;
+ case 6: pos= (my_off_t) mi_uint6korr(after_key); break;
+ case 5: pos= (my_off_t) mi_uint5korr(after_key); break;
+#else
+ case 8: pos= (my_off_t) mi_uint4korr(after_key+4); break;
+ case 7: pos= (my_off_t) mi_uint4korr(after_key+3); break;
+ case 6: pos= (my_off_t) mi_uint4korr(after_key+2); break;
+ case 5: pos= (my_off_t) mi_uint4korr(after_key+1); break;
+#endif
+ case 4: pos= (my_off_t) mi_uint4korr(after_key); break;
+ case 3: pos= (my_off_t) mi_uint3korr(after_key); break;
+ case 2: pos= (my_off_t) mi_uint2korr(after_key); break;
+ default:
+ pos=0L; /* Shut compiler up */
+ }
+ return ((info->s->data_file_type == STATIC_RECORD) ?
+ pos * info->s->base.pack_reclength : pos);
+}
+
+
+/* Calc position from a record pointer ( in delete link chain ) */
+
+my_off_t _ma_rec_pos(MARIA_SHARE *s, uchar *ptr)
+{
+ my_off_t pos;
+ switch (s->rec_reflength) {
+#if SIZEOF_OFF_T > 4
+ case 8:
+ pos= (my_off_t) mi_uint8korr(ptr);
+ if (pos == HA_OFFSET_ERROR)
+ return HA_OFFSET_ERROR; /* end of list */
+ break;
+ case 7:
+ pos= (my_off_t) mi_uint7korr(ptr);
+ if (pos == (((my_off_t) 1) << 56) -1)
+ return HA_OFFSET_ERROR; /* end of list */
+ break;
+ case 6:
+ pos= (my_off_t) mi_uint6korr(ptr);
+ if (pos == (((my_off_t) 1) << 48) -1)
+ return HA_OFFSET_ERROR; /* end of list */
+ break;
+ case 5:
+ pos= (my_off_t) mi_uint5korr(ptr);
+ if (pos == (((my_off_t) 1) << 40) -1)
+ return HA_OFFSET_ERROR; /* end of list */
+ break;
+#else
+ case 8:
+ case 7:
+ case 6:
+ case 5:
+ ptr+= (s->rec_reflength-4);
+ /* fall through */
+#endif
+ case 4:
+ pos= (my_off_t) mi_uint4korr(ptr);
+ if (pos == (my_off_t) (uint32) ~0L)
+ return HA_OFFSET_ERROR;
+ break;
+ case 3:
+ pos= (my_off_t) mi_uint3korr(ptr);
+ if (pos == (my_off_t) (1 << 24) -1)
+ return HA_OFFSET_ERROR;
+ break;
+ case 2:
+ pos= (my_off_t) mi_uint2korr(ptr);
+ if (pos == (my_off_t) (1 << 16) -1)
+ return HA_OFFSET_ERROR;
+ break;
+ default: abort(); /* Impossible */
+ }
+ return ((s->data_file_type == STATIC_RECORD) ?
+ pos * s->base.pack_reclength : pos);
+}
+
+
+ /* save position to record */
+
+void _ma_dpointer(MARIA_HA *info, uchar *buff, my_off_t pos)
+{
+ if (info->s->data_file_type == STATIC_RECORD &&
+ pos != HA_OFFSET_ERROR)
+ pos/= info->s->base.pack_reclength;
+
+ switch (info->s->rec_reflength) {
+#if SIZEOF_OFF_T > 4
+ case 8: mi_int8store(buff,pos); break;
+ case 7: mi_int7store(buff,pos); break;
+ case 6: mi_int6store(buff,pos); break;
+ case 5: mi_int5store(buff,pos); break;
+#else
+ case 8: *buff++=0;
+ /* fall trough */
+ case 7: *buff++=0;
+ /* fall trough */
+ case 6: *buff++=0;
+ /* fall trough */
+ case 5: *buff++=0;
+ /* fall trough */
+#endif
+ case 4: mi_int4store(buff,pos); break;
+ case 3: mi_int3store(buff,pos); break;
+ case 2: mi_int2store(buff,(uint) pos); break;
+ default: abort(); /* Impossible */
+ }
+} /* _ma_dpointer */
+
+
+ /* Get key from key-block */
+ /* page points at previous key; its advanced to point at next key */
+ /* key should contain previous key */
+ /* Returns length of found key + pointers */
+ /* nod_flag is a flag if we are on nod */
+
+ /* same as _ma_get_key but used with fixed length keys */
+
+uint _ma_get_static_key(register MARIA_KEYDEF *keyinfo, uint nod_flag,
+ register uchar **page, register uchar *key)
+{
+ memcpy((uchar*) key,(uchar*) *page,
+ (size_t) (keyinfo->keylength+nod_flag));
+ *page+=keyinfo->keylength+nod_flag;
+ return(keyinfo->keylength);
+} /* _ma_get_static_key */
+
+
+/*
+ get key witch is packed against previous key or key with a NULL column.
+
+ SYNOPSIS
+ _ma_get_pack_key()
+ keyinfo key definition information.
+ nod_flag If nod: Length of node pointer, else zero.
+ page_pos RETURN position in key page behind this key.
+ key IN/OUT in: prev key, out: unpacked key.
+
+ RETURN
+ key_length + length of data pointer
+*/
+
+uint _ma_get_pack_key(register MARIA_KEYDEF *keyinfo, uint nod_flag,
+ register uchar **page_pos, register uchar *key)
+{
+ reg1 HA_KEYSEG *keyseg;
+ uchar *start_key,*page=*page_pos;
+ uint length;
+
+ start_key=key;
+ for (keyseg=keyinfo->seg ; keyseg->type ;keyseg++)
+ {
+ if (keyseg->flag & HA_PACK_KEY)
+ {
+ /* key with length, packed to previous key */
+ uchar *start= key;
+ uint packed= *page & 128,tot_length,rest_length;
+ if (keyseg->length >= 127)
+ {
+ length=mi_uint2korr(page) & 32767;
+ page+=2;
+ }
+ else
+ length= *page++ & 127;
+
+ if (packed)
+ {
+ if (length > (uint) keyseg->length)
+ {
+ maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ return 0; /* Error */
+ }
+ if (length == 0) /* Same key */
+ {
+ if (keyseg->flag & HA_NULL_PART)
+ *key++=1; /* Can't be NULL */
+ get_key_length(length,key);
+ key+= length; /* Same diff_key as prev */
+ if (length > keyseg->length)
+ {
+ DBUG_PRINT("error",
+ ("Found too long null packed key: %u of %u at 0x%lx",
+ length, keyseg->length, (long) *page_pos));
+ DBUG_DUMP("key",(char*) *page_pos,16);
+ maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ return 0;
+ }
+ continue;
+ }
+ if (keyseg->flag & HA_NULL_PART)
+ {
+ key++; /* Skip null marker*/
+ start++;
+ }
+
+ get_key_length(rest_length,page);
+ tot_length=rest_length+length;
+
+ /* If the stored length has changed, we must move the key */
+ if (tot_length >= 255 && *start != 255)
+ {
+ /* length prefix changed from a length of one to a length of 3 */
+ bmove_upp((char*) key+length+3,(char*) key+length+1,length);
+ *key=255;
+ mi_int2store(key+1,tot_length);
+ key+=3+length;
+ }
+ else if (tot_length < 255 && *start == 255)
+ {
+ bmove(key+1,key+3,length);
+ *key=tot_length;
+ key+=1+length;
+ }
+ else
+ {
+ store_key_length_inc(key,tot_length);
+ key+=length;
+ }
+ memcpy(key,page,rest_length);
+ page+=rest_length;
+ key+=rest_length;
+ continue;
+ }
+ else
+ {
+ if (keyseg->flag & HA_NULL_PART)
+ {
+ if (!length--) /* Null part */
+ {
+ *key++=0;
+ continue;
+ }
+ *key++=1; /* Not null */
+ }
+ }
+ if (length > (uint) keyseg->length)
+ {
+ DBUG_PRINT("error",("Found too long packed key: %u of %u at 0x%lx",
+ length, keyseg->length, (long) *page_pos));
+ DBUG_DUMP("key",(char*) *page_pos,16);
+ maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ return 0; /* Error */
+ }
+ store_key_length_inc(key,length);
+ }
+ else
+ {
+ if (keyseg->flag & HA_NULL_PART)
+ {
+ if (!(*key++ = *page++))
+ continue;
+ }
+ if (keyseg->flag &
+ (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK))
+ {
+ uchar *tmp=page;
+ get_key_length(length,tmp);
+ length+=(uint) (tmp-page);
+ }
+ else
+ length=keyseg->length;
+ }
+ memcpy((uchar*) key,(uchar*) page,(size_t) length);
+ key+=length;
+ page+=length;
+ }
+ length=keyseg->length+nod_flag;
+ bmove((uchar*) key,(uchar*) page,length);
+ *page_pos= page+length;
+ return ((uint) (key-start_key)+keyseg->length);
+} /* _ma_get_pack_key */
+
+
+
+/* key that is packed relatively to previous */
+
+uint _ma_get_binary_pack_key(register MARIA_KEYDEF *keyinfo, uint nod_flag,
+ register uchar **page_pos, register uchar *key)
+{
+ reg1 HA_KEYSEG *keyseg;
+ uchar *start_key,*page,*page_end,*from,*from_end;
+ uint length,tmp;
+ DBUG_ENTER("_ma_get_binary_pack_key");
+
+ page= *page_pos;
+ page_end=page+HA_MAX_KEY_BUFF+1;
+ start_key=key;
+
+ /*
+ Keys are compressed the following way:
+
+ prefix length Packed length of prefix common with prev key. (1 or 3 bytes)
+ for each key segment:
+ [is null] Null indicator if can be null (1 byte, zero means null)
+ [length] Packed length if varlength (1 or 3 bytes)
+ key segment 'length' bytes of key segment value
+ pointer Reference to the data file (last_keyseg->length).
+
+ get_key_length() is a macro. It gets the prefix length from 'page'
+ and puts it into 'length'. It increments 'page' by 1 or 3, depending
+ on the packed length of the prefix length.
+ */
+ get_key_length(length,page);
+ if (length)
+ {
+ if (length > keyinfo->maxlength)
+ {
+ DBUG_PRINT("error",
+ ("Found too long binary packed key: %u of %u at 0x%lx",
+ length, keyinfo->maxlength, (long) *page_pos));
+ DBUG_DUMP("key",(char*) *page_pos,16);
+ maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ DBUG_RETURN(0); /* Wrong key */
+ }
+ /* Key is packed against prev key, take prefix from prev key. */
+ from= key;
+ from_end= key + length;
+ }
+ else
+ {
+ /* Key is not packed against prev key, take all from page buffer. */
+ from= page;
+ from_end= page_end;
+ }
+
+ /*
+ The trouble is that key can be split in two parts:
+ The first part (prefix) is in from .. from_end - 1.
+ The second part starts at page.
+ The split can be at every byte position. So we need to check for
+ the end of the first part before using every byte.
+ */
+ for (keyseg=keyinfo->seg ; keyseg->type ;keyseg++)
+ {
+ if (keyseg->flag & HA_NULL_PART)
+ {
+ /* If prefix is used up, switch to rest. */
+ if (from == from_end) { from=page; from_end=page_end; }
+ if (!(*key++ = *from++))
+ continue; /* Null part */
+ }
+ if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK))
+ {
+ /* If prefix is used up, switch to rest. */
+ if (from == from_end) { from=page; from_end=page_end; }
+ /* Get length of dynamic length key part */
+ if ((length= (uint) (uchar) (*key++ = *from++)) == 255)
+ {
+ /* If prefix is used up, switch to rest. */
+ if (from == from_end) { from=page; from_end=page_end; }
+ length= ((uint) (uchar) ((*key++ = *from++))) << 8;
+ /* If prefix is used up, switch to rest. */
+ if (from == from_end) { from=page; from_end=page_end; }
+ length+= (uint) (uchar) ((*key++ = *from++));
+ }
+ }
+ else
+ length=keyseg->length;
+
+ if ((tmp=(uint) (from_end-from)) <= length)
+ {
+ key+=tmp; /* Use old key */
+ length-=tmp;
+ from=page; from_end=page_end;
+ }
+ DBUG_ASSERT((int) length >= 0);
+ DBUG_PRINT("info",("key: 0x%lx from: 0x%lx length: %u",
+ (long) key, (long) from, length));
+ memmove((uchar*) key, (uchar*) from, (size_t) length);
+ key+=length;
+ from+=length;
+ }
+ /*
+ Last segment (type == 0) contains length of data pointer.
+ If we have mixed key blocks with data pointer and key block pointer,
+ we have to copy both.
+ */
+ length=keyseg->length+nod_flag;
+ if ((tmp=(uint) (from_end-from)) <= length)
+ {
+ /* Remaining length is less or equal max possible length. */
+ memcpy(key+tmp,page,length-tmp); /* Get last part of key */
+ *page_pos= page+length-tmp;
+ }
+ else
+ {
+ /*
+ Remaining length is greater than max possible length.
+ This can happen only if we switched to the new key bytes already.
+ 'page_end' is calculated with MI_MAX_KEY_BUFF. So it can be far
+ behind the real end of the key.
+ */
+ if (from_end != page_end)
+ {
+ DBUG_PRINT("error",("Error when unpacking key"));
+ maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ DBUG_RETURN(0); /* Error */
+ }
+ /* Copy data pointer and, if appropriate, key block pointer. */
+ memcpy((uchar*) key,(uchar*) from,(size_t) length);
+ *page_pos= from+length;
+ }
+ DBUG_RETURN((uint) (key-start_key)+keyseg->length);
+}
+
+
+ /* Get key at position without knowledge of previous key */
+ /* Returns pointer to next key */
+
+uchar *_ma_get_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *page,
+ uchar *key, uchar *keypos, uint *return_key_length)
+{
+ uint nod_flag;
+ DBUG_ENTER("_ma_get_key");
+
+ nod_flag=_ma_test_if_nod(page);
+ if (! (keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)))
+ {
+ bmove((uchar*) key,(uchar*) keypos,keyinfo->keylength+nod_flag);
+ DBUG_RETURN(keypos+keyinfo->keylength+nod_flag);
+ }
+ else
+ {
+ page+=2+nod_flag;
+ key[0]=0; /* safety */
+ while (page <= keypos)
+ {
+ *return_key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&page,key);
+ if (*return_key_length == 0)
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ DBUG_RETURN(0);
+ }
+ }
+ }
+ DBUG_PRINT("exit",("page: 0x%lx length: %u", (long) page,
+ *return_key_length));
+ DBUG_RETURN(page);
+} /* _ma_get_key */
+
+
+ /* Get key at position without knowledge of previous key */
+ /* Returns 0 if ok */
+
+static my_bool _ma_get_prev_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *page, uchar *key, uchar *keypos,
+ uint *return_key_length)
+{
+ uint nod_flag;
+ DBUG_ENTER("_ma_get_prev_key");
+
+ nod_flag=_ma_test_if_nod(page);
+ if (! (keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)))
+ {
+ *return_key_length=keyinfo->keylength;
+ bmove((uchar*) key,(uchar*) keypos- *return_key_length-nod_flag,
+ *return_key_length);
+ DBUG_RETURN(0);
+ }
+ else
+ {
+ page+=2+nod_flag;
+ key[0]=0; /* safety */
+ while (page < keypos)
+ {
+ *return_key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&page,key);
+ if (*return_key_length == 0)
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ DBUG_RETURN(1);
+ }
+ }
+ }
+ DBUG_RETURN(0);
+} /* _ma_get_key */
+
+
+
+ /* Get last key from key-page */
+ /* Return pointer to where key starts */
+
+uchar *_ma_get_last_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *page,
+ uchar *lastkey, uchar *endpos, uint *return_key_length)
+{
+ uint nod_flag;
+ uchar *lastpos;
+ DBUG_ENTER("_ma_get_last_key");
+ DBUG_PRINT("enter",("page: 0x%lx endpos: 0x%lx", (long) page,
+ (long) endpos));
+
+ nod_flag=_ma_test_if_nod(page);
+ if (! (keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)))
+ {
+ lastpos=endpos-keyinfo->keylength-nod_flag;
+ *return_key_length=keyinfo->keylength;
+ if (lastpos > page)
+ bmove((uchar*) lastkey,(uchar*) lastpos,keyinfo->keylength+nod_flag);
+ }
+ else
+ {
+ lastpos=(page+=2+nod_flag);
+ lastkey[0]=0;
+ while (page < endpos)
+ {
+ lastpos=page;
+ *return_key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&page,lastkey);
+ if (*return_key_length == 0)
+ {
+ DBUG_PRINT("error",("Couldn't find last key: page: 0x%lx",
+ (long) page));
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ DBUG_RETURN(0);
+ }
+ }
+ }
+ DBUG_PRINT("exit",("lastpos: 0x%lx length: %u", (long) lastpos,
+ *return_key_length));
+ DBUG_RETURN(lastpos);
+} /* _ma_get_last_key */
+
+
+ /* Calculate length of key */
+
+uint _ma_keylength(MARIA_KEYDEF *keyinfo, register const uchar *key)
+{
+ reg1 HA_KEYSEG *keyseg;
+ const uchar *start;
+
+ if (! (keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)))
+ return (keyinfo->keylength);
+
+ start= key;
+ for (keyseg=keyinfo->seg ; keyseg->type ; keyseg++)
+ {
+ if (keyseg->flag & HA_NULL_PART)
+ if (!*key++)
+ continue;
+ if (keyseg->flag & (HA_SPACE_PACK | HA_BLOB_PART | HA_VAR_LENGTH_PART))
+ {
+ uint length;
+ get_key_length(length,key);
+ key+=length;
+ }
+ else
+ key+= keyseg->length;
+ }
+ return((uint) (key-start)+keyseg->length);
+} /* _ma_keylength */
+
+
+/*
+ Calculate length of part key.
+
+ Used in maria_rkey() to find the key found for the key-part that was used.
+ This is needed in case of multi-byte character sets where we may search
+ after '0xDF' but find 'ss'
+*/
+
+uint _ma_keylength_part(MARIA_KEYDEF *keyinfo, register const uchar *key,
+ HA_KEYSEG *end)
+{
+ reg1 HA_KEYSEG *keyseg;
+ const uchar *start= key;
+
+ for (keyseg=keyinfo->seg ; keyseg != end ; keyseg++)
+ {
+ if (keyseg->flag & HA_NULL_PART)
+ if (!*key++)
+ continue;
+ if (keyseg->flag & (HA_SPACE_PACK | HA_BLOB_PART | HA_VAR_LENGTH_PART))
+ {
+ uint length;
+ get_key_length(length,key);
+ key+=length;
+ }
+ else
+ key+= keyseg->length;
+ }
+ return (uint) (key-start);
+}
+
+
+/* Move a key */
+
+uchar *_ma_move_key(MARIA_KEYDEF *keyinfo, uchar *to, const uchar *from)
+{
+ reg1 uint length;
+ memcpy(to, from, (size_t) (length= _ma_keylength(keyinfo, from)));
+ return to+length;
+}
+
+
+/*
+ Find next/previous record with same key
+
+ WARNING
+ This can't be used when database is touched after last read
+*/
+
+int _ma_search_next(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo,
+ uchar *key, uint key_length, uint nextflag, my_off_t pos)
+{
+ int error;
+ uint nod_flag;
+ uchar lastkey[HA_MAX_KEY_BUFF];
+ DBUG_ENTER("_ma_search_next");
+ DBUG_PRINT("enter",("nextflag: %u lastpos: %lu int_keypos: %lu page_changed %d keyread_buff_used: %d",
+ nextflag, (ulong) info->cur_row.lastpos,
+ (ulong) info->int_keypos,
+ info->page_changed, info->keyread_buff_used));
+ DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE,keyinfo->seg,key,key_length););
+
+ /* Force full read if we are at last key or if we are not on a leaf
+ and the key tree has changed since we used it last time
+ Note that even if the key tree has changed since last read, we can use
+ the last read data from the leaf if we haven't used the buffer for
+ something else.
+ */
+
+ if (((nextflag & SEARCH_BIGGER) && info->int_keypos >= info->int_maxpos) ||
+ info->page_changed ||
+ (info->int_keytree_version != keyinfo->version &&
+ (info->int_nod_flag || info->keyread_buff_used)))
+ DBUG_RETURN(_ma_search(info,keyinfo,key, USE_WHOLE_KEY,
+ nextflag | SEARCH_SAVE_BUFF, pos));
+
+ if (info->keyread_buff_used)
+ {
+ if (!_ma_fetch_keypage(info,keyinfo,info->last_search_keypage,
+ DFLT_INIT_HITS,info->keyread_buff,0))
+ DBUG_RETURN(-1);
+ info->keyread_buff_used=0;
+ }
+
+ /* Last used buffer is in info->keyread_buff */
+ nod_flag=_ma_test_if_nod(info->keyread_buff);
+
+ if (nextflag & SEARCH_BIGGER) /* Next key */
+ {
+ my_off_t tmp_pos= _ma_kpos(nod_flag,info->int_keypos);
+ if (tmp_pos != HA_OFFSET_ERROR)
+ {
+ if ((error= _ma_search(info,keyinfo,key, USE_WHOLE_KEY,
+ nextflag | SEARCH_SAVE_BUFF, tmp_pos)) <=0)
+ DBUG_RETURN(error);
+ }
+ memcpy(lastkey,key,key_length);
+ if (!(info->lastkey_length=(*keyinfo->get_key)(keyinfo,nod_flag,
+ &info->int_keypos,lastkey)))
+ DBUG_RETURN(-1);
+ }
+ else /* Previous key */
+ {
+ uint length;
+ /* Find start of previous key */
+ info->int_keypos= _ma_get_last_key(info,keyinfo,info->keyread_buff,lastkey,
+ info->int_keypos, &length);
+ if (!info->int_keypos)
+ DBUG_RETURN(-1);
+ if (info->int_keypos == info->keyread_buff+2)
+ DBUG_RETURN(_ma_search(info,keyinfo,key, USE_WHOLE_KEY,
+ nextflag | SEARCH_SAVE_BUFF, pos));
+ if ((error= _ma_search(info,keyinfo,key, USE_WHOLE_KEY,
+ nextflag | SEARCH_SAVE_BUFF,
+ _ma_kpos(nod_flag,info->int_keypos))) <= 0)
+ DBUG_RETURN(error);
+
+ /* QQ: We should be able to optimize away the following call */
+ if (! _ma_get_last_key(info,keyinfo,info->keyread_buff,lastkey,
+ info->int_keypos,&info->lastkey_length))
+ DBUG_RETURN(-1);
+ }
+ memcpy(info->lastkey,lastkey,info->lastkey_length);
+ info->cur_row.lastpos= _ma_dpos(info,0,info->lastkey+info->lastkey_length);
+ DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos));
+ DBUG_RETURN(0);
+} /* _ma_search_next */
+
+
+ /* Search after position for the first row in an index */
+ /* This is stored in info->cur_row.lastpos */
+
+int _ma_search_first(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo,
+ register my_off_t pos)
+{
+ uint nod_flag;
+ uchar *page;
+ DBUG_ENTER("_ma_search_first");
+
+ if (pos == HA_OFFSET_ERROR)
+ {
+ my_errno=HA_ERR_KEY_NOT_FOUND;
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ DBUG_RETURN(-1);
+ }
+
+ do
+ {
+ if (!_ma_fetch_keypage(info,keyinfo,pos,DFLT_INIT_HITS,info->keyread_buff,0))
+ {
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ DBUG_RETURN(-1);
+ }
+ nod_flag=_ma_test_if_nod(info->keyread_buff);
+ page=info->keyread_buff+2+nod_flag;
+ } while ((pos= _ma_kpos(nod_flag,page)) != HA_OFFSET_ERROR);
+
+ if (!(info->lastkey_length=(*keyinfo->get_key)(keyinfo,nod_flag,&page,
+ info->lastkey)))
+ DBUG_RETURN(-1); /* Crashed */
+
+ info->int_keypos=page; info->int_maxpos=info->keyread_buff+maria_data_on_page(info->keyread_buff)-1;
+ info->int_nod_flag=nod_flag;
+ info->int_keytree_version=keyinfo->version;
+ info->last_search_keypage=info->last_keypage;
+ info->page_changed=info->keyread_buff_used=0;
+ info->cur_row.lastpos= _ma_dpos(info,0,info->lastkey+info->lastkey_length);
+
+ DBUG_PRINT("exit",("found key at %lu", (ulong) info->cur_row.lastpos));
+ DBUG_RETURN(0);
+} /* _ma_search_first */
+
+
+ /* Search after position for the last row in an index */
+ /* This is stored in info->cur_row.lastpos */
+
+int _ma_search_last(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo,
+ register my_off_t pos)
+{
+ uint nod_flag;
+ uchar *buff,*page;
+ DBUG_ENTER("_ma_search_last");
+
+ if (pos == HA_OFFSET_ERROR)
+ {
+ my_errno=HA_ERR_KEY_NOT_FOUND; /* Didn't find key */
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ DBUG_RETURN(-1);
+ }
+
+ buff=info->keyread_buff;
+ do
+ {
+ if (!_ma_fetch_keypage(info,keyinfo,pos,DFLT_INIT_HITS,buff,0))
+ {
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ DBUG_RETURN(-1);
+ }
+ page= buff+maria_data_on_page(buff);
+ nod_flag=_ma_test_if_nod(buff);
+ } while ((pos= _ma_kpos(nod_flag,page)) != HA_OFFSET_ERROR);
+
+ if (!_ma_get_last_key(info,keyinfo,buff,info->lastkey,page,
+ &info->lastkey_length))
+ DBUG_RETURN(-1);
+ info->cur_row.lastpos= _ma_dpos(info,0,info->lastkey+info->lastkey_length);
+ info->int_keypos=info->int_maxpos=page;
+ info->int_nod_flag=nod_flag;
+ info->int_keytree_version=keyinfo->version;
+ info->last_search_keypage=info->last_keypage;
+ info->page_changed=info->keyread_buff_used=0;
+
+ DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos));
+ DBUG_RETURN(0);
+} /* _ma_search_last */
+
+
+
+/****************************************************************************
+**
+** Functions to store and pack a key in a page
+**
+** maria_calc_xx_key_length takes the following arguments:
+** nod_flag If nod: Length of nod-pointer
+** next_key Position to pos after the new key in buffer
+** org_key Key that was before the next key in buffer
+** prev_key Last key before current key
+** key Key that will be stored
+** s_temp Information how next key will be packed
+****************************************************************************/
+
+/* Static length key */
+
+int
+_ma_calc_static_key_length(MARIA_KEYDEF *keyinfo,uint nod_flag,
+ uchar *next_pos __attribute__((unused)),
+ uchar *org_key __attribute__((unused)),
+ uchar *prev_key __attribute__((unused)),
+ const uchar *key, MARIA_KEY_PARAM *s_temp)
+{
+ s_temp->key= key;
+ return (int) (s_temp->totlength=keyinfo->keylength+nod_flag);
+}
+
+/* Variable length key */
+
+int
+_ma_calc_var_key_length(MARIA_KEYDEF *keyinfo,uint nod_flag,
+ uchar *next_pos __attribute__((unused)),
+ uchar *org_key __attribute__((unused)),
+ uchar *prev_key __attribute__((unused)),
+ const uchar *key, MARIA_KEY_PARAM *s_temp)
+{
+ s_temp->key= key;
+ return (int) (s_temp->totlength= _ma_keylength(keyinfo,key)+nod_flag);
+}
+
+/*
+ length of key with a variable length first segment which is prefix
+ compressed (maria_chk reports 'packed + stripped')
+
+ Keys are compressed the following way:
+
+ If the max length of first key segment <= 127 bytes the prefix is
+ 1 uchar else it's 2 byte
+
+ prefix byte(s) The high bit is set if this is a prefix for the prev key
+ length Packed length if the previous was a prefix byte
+ [length] data bytes ('length' bytes)
+ next-key-seg Next key segments
+
+ If the first segment can have NULL:
+ The length is 0 for NULLS and 1+length for not null columns.
+
+*/
+
+int
+_ma_calc_var_pack_key_length(MARIA_KEYDEF *keyinfo, uint nod_flag,
+ uchar *next_key,
+ uchar *org_key, uchar *prev_key, const uchar *key,
+ MARIA_KEY_PARAM *s_temp)
+{
+ reg1 HA_KEYSEG *keyseg;
+ int length;
+ uint key_length,ref_length,org_key_length=0,
+ length_pack,new_key_length,diff_flag,pack_marker;
+ const uchar *start,*end,*key_end;
+ uchar *sort_order;
+ bool same_length;
+
+ length_pack=s_temp->ref_length=s_temp->n_ref_length=s_temp->n_length=0;
+ same_length=0; keyseg=keyinfo->seg;
+ key_length= _ma_keylength(keyinfo,key)+nod_flag;
+
+ sort_order=0;
+ if ((keyinfo->flag & HA_FULLTEXT) &&
+ ((keyseg->type == HA_KEYTYPE_TEXT) ||
+ (keyseg->type == HA_KEYTYPE_VARTEXT1) ||
+ (keyseg->type == HA_KEYTYPE_VARTEXT2)) &&
+ !use_strnxfrm(keyseg->charset))
+ sort_order= keyseg->charset->sort_order;
+
+ /* diff flag contains how many bytes is needed to pack key */
+ if (keyseg->length >= 127)
+ {
+ diff_flag=2;
+ pack_marker=32768;
+ }
+ else
+ {
+ diff_flag= 1;
+ pack_marker=128;
+ }
+ s_temp->pack_marker=pack_marker;
+
+ /* Handle the case that the first part have NULL values */
+ if (keyseg->flag & HA_NULL_PART)
+ {
+ if (!*key++)
+ {
+ s_temp->key= key;
+ s_temp->key_length= 0;
+ s_temp->totlength= key_length-1+diff_flag;
+ s_temp->next_key_pos= 0; /* No next key */
+ return (s_temp->totlength);
+ }
+ s_temp->store_not_null=1;
+ key_length--; /* We don't store NULL */
+ if (prev_key && !*prev_key++)
+ org_key=prev_key=0; /* Can't pack against prev */
+ else if (org_key)
+ org_key++; /* Skip NULL */
+ }
+ else
+ s_temp->store_not_null=0;
+ s_temp->prev_key= org_key;
+
+ /* The key part will start with a packed length */
+
+ get_key_pack_length(new_key_length,length_pack,key);
+ end= key_end= key+ new_key_length;
+ start= key;
+
+ /* Calc how many characters are identical between this and the prev. key */
+ if (prev_key)
+ {
+ get_key_length(org_key_length,prev_key);
+ s_temp->prev_key=prev_key; /* Pointer at data */
+ /* Don't use key-pack if length == 0 */
+ if (new_key_length && new_key_length == org_key_length)
+ same_length=1;
+ else if (new_key_length > org_key_length)
+ end= key + org_key_length;
+
+ if (sort_order) /* SerG */
+ {
+ while (key < end &&
+ sort_order[* (uchar*) key] == sort_order[* (uchar*) prev_key])
+ {
+ key++; prev_key++;
+ }
+ }
+ else
+ {
+ while (key < end && *key == *prev_key)
+ {
+ key++; prev_key++;
+ }
+ }
+ }
+
+ s_temp->key=key;
+ s_temp->key_length= (uint) (key_end-key);
+
+ if (same_length && key == key_end)
+ {
+ /* identical variable length key */
+ s_temp->ref_length= pack_marker;
+ length=(int) key_length-(int) (key_end-start)-length_pack;
+ length+= diff_flag;
+ if (next_key)
+ { /* Can't combine with next */
+ s_temp->n_length= *next_key; /* Needed by _ma_store_key */
+ next_key=0;
+ }
+ }
+ else
+ {
+ if (start != key)
+ { /* Starts as prev key */
+ ref_length= (uint) (key-start);
+ s_temp->ref_length= ref_length + pack_marker;
+ length= (int) (key_length - ref_length);
+
+ length-= length_pack;
+ length+= diff_flag;
+ length+= ((new_key_length-ref_length) >= 255) ? 3 : 1;/* Rest_of_key */
+ }
+ else
+ {
+ s_temp->key_length+=s_temp->store_not_null; /* If null */
+ length= key_length - length_pack+ diff_flag;
+ }
+ }
+ s_temp->totlength=(uint) length;
+ s_temp->prev_length=0;
+ DBUG_PRINT("test",("tot_length: %u length: %d uniq_key_length: %u",
+ key_length, length, s_temp->key_length));
+
+ /* If something after that hasn't length=0, test if we can combine */
+ if ((s_temp->next_key_pos=next_key))
+ {
+ uint packed,n_length;
+
+ packed = *next_key & 128;
+ if (diff_flag == 2)
+ {
+ n_length= mi_uint2korr(next_key) & 32767; /* Length of next key */
+ next_key+=2;
+ }
+ else
+ n_length= *next_key++ & 127;
+ if (!packed)
+ n_length-= s_temp->store_not_null;
+
+ if (n_length || packed) /* Don't pack 0 length keys */
+ {
+ uint next_length_pack, new_ref_length=s_temp->ref_length;
+
+ if (packed)
+ {
+ /* If first key and next key is packed (only on delete) */
+ if (!prev_key && org_key)
+ {
+ get_key_length(org_key_length,org_key);
+ key=start;
+ if (sort_order) /* SerG */
+ {
+ while (key < end &&
+ sort_order[*(uchar*) key] == sort_order[*(uchar*) org_key])
+ {
+ key++; org_key++;
+ }
+ }
+ else
+ {
+ while (key < end && *key == *org_key)
+ {
+ key++; org_key++;
+ }
+ }
+ if ((new_ref_length= (uint) (key - start)))
+ new_ref_length+=pack_marker;
+ }
+
+ if (!n_length)
+ {
+ /*
+ We put a different key between two identical variable length keys
+ Extend next key to have same prefix as this key
+ */
+ if (new_ref_length) /* prefix of previus key */
+ { /* make next key longer */
+ s_temp->part_of_prev_key= new_ref_length;
+ s_temp->prev_length= org_key_length -
+ (new_ref_length-pack_marker);
+ s_temp->n_ref_length= s_temp->part_of_prev_key;
+ s_temp->n_length= s_temp->prev_length;
+ n_length= get_pack_length(s_temp->prev_length);
+ s_temp->prev_key+= (new_ref_length - pack_marker);
+ length+= s_temp->prev_length + n_length;
+ }
+ else
+ { /* Can't use prev key */
+ s_temp->part_of_prev_key=0;
+ s_temp->prev_length= org_key_length;
+ s_temp->n_ref_length=s_temp->n_length= org_key_length;
+ length+= org_key_length;
+ }
+ return (int) length;
+ }
+
+ ref_length=n_length;
+ /* Get information about not packed key suffix */
+ get_key_pack_length(n_length,next_length_pack,next_key);
+
+ /* Test if new keys has fewer characters that match the previous key */
+ if (!new_ref_length)
+ { /* Can't use prev key */
+ s_temp->part_of_prev_key= 0;
+ s_temp->prev_length= ref_length;
+ s_temp->n_ref_length= s_temp->n_length= n_length+ref_length;
+ return (int) length+ref_length-next_length_pack;
+ }
+ if (ref_length+pack_marker > new_ref_length)
+ {
+ uint new_pack_length=new_ref_length-pack_marker;
+ /* We must copy characters from the original key to the next key */
+ s_temp->part_of_prev_key= new_ref_length;
+ s_temp->prev_length= ref_length - new_pack_length;
+ s_temp->n_ref_length=s_temp->n_length=n_length + s_temp->prev_length;
+ s_temp->prev_key+= new_pack_length;
+ length-= (next_length_pack - get_pack_length(s_temp->n_length));
+ return (int) length + s_temp->prev_length;
+ }
+ }
+ else
+ {
+ /* Next key wasn't a prefix of previous key */
+ ref_length=0;
+ next_length_pack=0;
+ }
+ DBUG_PRINT("test",("length: %d next_key: 0x%lx", length,
+ (long) next_key));
+
+ {
+ uint tmp_length;
+ key=(start+=ref_length);
+ if (key+n_length < key_end) /* Normalize length based */
+ key_end= key+n_length;
+ if (sort_order) /* SerG */
+ {
+ while (key < key_end &&
+ sort_order[*(uchar*) key] == sort_order[*(uchar*) next_key])
+ {
+ key++; next_key++;
+ }
+ }
+ else
+ {
+ while (key < key_end && *key == *next_key)
+ {
+ key++; next_key++;
+ }
+ }
+ if (!(tmp_length=(uint) (key-start)))
+ { /* Key can't be re-packed */
+ s_temp->next_key_pos=0;
+ return length;
+ }
+ ref_length+=tmp_length;
+ n_length-=tmp_length;
+ length-=tmp_length+next_length_pack; /* We gained these chars */
+ }
+ if (n_length == 0 && ref_length == new_key_length)
+ {
+ s_temp->n_ref_length=pack_marker; /* Same as prev key */
+ }
+ else
+ {
+ s_temp->n_ref_length=ref_length | pack_marker;
+ length+= get_pack_length(n_length);
+ s_temp->n_length=n_length;
+ }
+ }
+ }
+ return length;
+}
+
+
+/* Length of key which is prefix compressed */
+
+int _ma_calc_bin_pack_key_length(MARIA_KEYDEF *keyinfo, uint nod_flag,
+ uchar *next_key,
+ uchar *org_key, uchar *prev_key,
+ const uchar *key,
+ MARIA_KEY_PARAM *s_temp)
+{
+ uint length,key_length,ref_length;
+
+ s_temp->totlength=key_length= _ma_keylength(keyinfo,key)+nod_flag;
+#ifdef HAVE_purify
+ s_temp->n_length= s_temp->n_ref_length=0; /* For valgrind */
+#endif
+ s_temp->key=key;
+ s_temp->prev_key=org_key;
+ if (prev_key) /* If not first key in block */
+ {
+ /* pack key against previous key */
+ /*
+ As keys may be identical when running a sort in maria_chk, we
+ have to guard against the case where keys may be identical
+ */
+ const uchar *end;
+ end=key+key_length;
+ for ( ; *key == *prev_key && key < end; key++,prev_key++) ;
+ s_temp->ref_length= ref_length=(uint) (key-s_temp->key);
+ length=key_length - ref_length + get_pack_length(ref_length);
+ }
+ else
+ {
+ /* No previous key */
+ s_temp->ref_length=ref_length=0;
+ length=key_length+1;
+ }
+ if ((s_temp->next_key_pos=next_key)) /* If another key after */
+ {
+ /* pack key against next key */
+ uint next_length,next_length_pack;
+ get_key_pack_length(next_length,next_length_pack,next_key);
+
+ /* If first key and next key is packed (only on delete) */
+ if (!prev_key && org_key && next_length)
+ {
+ const uchar *end;
+ for (key= s_temp->key, end=key+next_length ;
+ *key == *org_key && key < end;
+ key++,org_key++) ;
+ ref_length= (uint) (key - s_temp->key);
+ }
+
+ if (next_length > ref_length)
+ {
+ /* We put a key with different case between two keys with the same prefix
+ Extend next key to have same prefix as
+ this key */
+ s_temp->n_ref_length= ref_length;
+ s_temp->prev_length= next_length-ref_length;
+ s_temp->prev_key+= ref_length;
+ return (int) (length+ s_temp->prev_length - next_length_pack +
+ get_pack_length(ref_length));
+ }
+ /* Check how many characters are identical to next key */
+ key= s_temp->key+next_length;
+ while (*key++ == *next_key++) ;
+ if ((ref_length= (uint) (key - s_temp->key)-1) == next_length)
+ {
+ s_temp->next_key_pos=0;
+ return length; /* can't pack next key */
+ }
+ s_temp->prev_length=0;
+ s_temp->n_ref_length=ref_length;
+ return (int) (length-(ref_length - next_length) - next_length_pack +
+ get_pack_length(ref_length));
+ }
+ return (int) length;
+}
+
+
+/*
+** store a key packed with _ma_calc_xxx_key_length in page-buffert
+*/
+
+/* store key without compression */
+
+void _ma_store_static_key(MARIA_KEYDEF *keyinfo __attribute__((unused)),
+ register uchar *key_pos,
+ register MARIA_KEY_PARAM *s_temp)
+{
+ memcpy((uchar*) key_pos,(uchar*) s_temp->key,(size_t) s_temp->totlength);
+}
+
+
+/* store variable length key with prefix compression */
+
+#define store_pack_length(test,pos,length) { \
+ if (test) { *((pos)++) = (uchar) (length); } else \
+ { *((pos)++) = (uchar) ((length) >> 8); *((pos)++) = (uchar) (length); } }
+
+
+void _ma_store_var_pack_key(MARIA_KEYDEF *keyinfo __attribute__((unused)),
+ register uchar *key_pos,
+ register MARIA_KEY_PARAM *s_temp)
+{
+ uint length;
+ uchar *start;
+
+ start=key_pos;
+
+ if (s_temp->ref_length)
+ {
+ /* Packed against previous key */
+ store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->ref_length);
+ /* If not same key after */
+ if (s_temp->ref_length != s_temp->pack_marker)
+ store_key_length_inc(key_pos,s_temp->key_length);
+ }
+ else
+ {
+ /* Not packed against previous key */
+ store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->key_length);
+ }
+ bmove((uchar*) key_pos,(uchar*) s_temp->key,
+ (length=s_temp->totlength-(uint) (key_pos-start)));
+
+ if (!s_temp->next_key_pos) /* No following key */
+ return;
+ key_pos+=length;
+
+ if (s_temp->prev_length)
+ {
+ /* Extend next key because new key didn't have same prefix as prev key */
+ if (s_temp->part_of_prev_key)
+ {
+ store_pack_length(s_temp->pack_marker == 128,key_pos,
+ s_temp->part_of_prev_key);
+ store_key_length_inc(key_pos,s_temp->n_length);
+ }
+ else
+ {
+ s_temp->n_length+= s_temp->store_not_null;
+ store_pack_length(s_temp->pack_marker == 128,key_pos,
+ s_temp->n_length);
+ }
+ memcpy(key_pos, s_temp->prev_key, s_temp->prev_length);
+ }
+ else if (s_temp->n_ref_length)
+ {
+ store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->n_ref_length);
+ if (s_temp->n_ref_length == s_temp->pack_marker)
+ return; /* Identical key */
+ store_key_length(key_pos,s_temp->n_length);
+ }
+ else
+ {
+ s_temp->n_length+= s_temp->store_not_null;
+ store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->n_length);
+ }
+}
+
+
+/* variable length key with prefix compression */
+
+void _ma_store_bin_pack_key(MARIA_KEYDEF *keyinfo __attribute__((unused)),
+ register uchar *key_pos,
+ register MARIA_KEY_PARAM *s_temp)
+{
+ store_key_length_inc(key_pos,s_temp->ref_length);
+ memcpy((char*) key_pos,(char*) s_temp->key+s_temp->ref_length,
+ (size_t) s_temp->totlength-s_temp->ref_length);
+
+ if (s_temp->next_key_pos)
+ {
+ key_pos+=(uint) (s_temp->totlength-s_temp->ref_length);
+ store_key_length_inc(key_pos,s_temp->n_ref_length);
+ if (s_temp->prev_length) /* If we must extend key */
+ {
+ memcpy(key_pos,s_temp->prev_key,s_temp->prev_length);
+ }
+ }
+}
diff --git a/storage/maria/ma_sort.c b/storage/maria/ma_sort.c
new file mode 100644
index 00000000000..2851a3a09dd
--- /dev/null
+++ b/storage/maria/ma_sort.c
@@ -0,0 +1,1058 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Creates a index for a database by reading keys, sorting them and outputing
+ them in sorted order through MARIA_SORT_INFO functions.
+*/
+
+#include "ma_fulltext.h"
+#if defined(MSDOS) || defined(__WIN__)
+#include <fcntl.h>
+#else
+#include <stddef.h>
+#endif
+#include <queues.h>
+
+/* static variables */
+
+#undef MIN_SORT_MEMORY
+#undef MYF_RW
+#undef DISK_BUFFER_SIZE
+
+#define MERGEBUFF 15
+#define MERGEBUFF2 31
+#define MIN_SORT_MEMORY (4096-MALLOC_OVERHEAD)
+#define MYF_RW MYF(MY_NABP | MY_WME | MY_WAIT_IF_FULL)
+#define DISK_BUFFER_SIZE (IO_SIZE*16)
+
+
+/*
+ Pointers of functions for store and read keys from temp file
+*/
+
+extern void print_error _VARARGS((const char *fmt,...));
+
+/* Functions defined in this file */
+
+static ha_rows NEAR_F find_all_keys(MARIA_SORT_PARAM *info,uint keys,
+ uchar **sort_keys,
+ DYNAMIC_ARRAY *buffpek,int *maxbuffer,
+ IO_CACHE *tempfile,
+ IO_CACHE *tempfile_for_exceptions);
+static int NEAR_F write_keys(MARIA_SORT_PARAM *info, uchar **sort_keys,
+ uint count, BUFFPEK *buffpek,IO_CACHE *tempfile);
+static int NEAR_F write_key(MARIA_SORT_PARAM *info, uchar *key,
+ IO_CACHE *tempfile);
+static int NEAR_F write_index(MARIA_SORT_PARAM *info, uchar **sort_keys,
+ uint count);
+static int NEAR_F merge_many_buff(MARIA_SORT_PARAM *info,uint keys,
+ uchar **sort_keys,
+ BUFFPEK *buffpek,int *maxbuffer,
+ IO_CACHE *t_file);
+static uint NEAR_F read_to_buffer(IO_CACHE *fromfile,BUFFPEK *buffpek,
+ uint sort_length);
+static int NEAR_F merge_buffers(MARIA_SORT_PARAM *info,uint keys,
+ IO_CACHE *from_file, IO_CACHE *to_file,
+ uchar **sort_keys, BUFFPEK *lastbuff,
+ BUFFPEK *Fb, BUFFPEK *Tb);
+static int NEAR_F merge_index(MARIA_SORT_PARAM *,uint, uchar **,BUFFPEK *, int,
+ IO_CACHE *);
+static int flush_maria_ft_buf(MARIA_SORT_PARAM *info);
+
+static int NEAR_F write_keys_varlen(MARIA_SORT_PARAM *info, uchar **sort_keys,
+ uint count, BUFFPEK *buffpek,
+ IO_CACHE *tempfile);
+static uint NEAR_F read_to_buffer_varlen(IO_CACHE *fromfile,BUFFPEK *buffpek,
+ uint sort_length);
+static int NEAR_F write_merge_key(MARIA_SORT_PARAM *info, IO_CACHE *to_file,
+ char *key, uint sort_length, uint count);
+static int NEAR_F write_merge_key_varlen(MARIA_SORT_PARAM *info,
+ IO_CACHE *to_file,
+ char* key, uint sort_length,
+ uint count);
+static inline int
+my_var_write(MARIA_SORT_PARAM *info, IO_CACHE *to_file, uchar *bufs);
+
+/*
+ Creates a index of sorted keys
+
+ SYNOPSIS
+ _ma_create_index_by_sort()
+ info Sort parameters
+ no_messages Set to 1 if no output
+ sortbuff_size Size if sortbuffer to allocate
+
+ RESULT
+ 0 ok
+ <> 0 Error
+*/
+
+int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages,
+ ulong sortbuff_size)
+{
+ int error,maxbuffer,skr;
+ uint memavl,old_memavl,keys,sort_length;
+ DYNAMIC_ARRAY buffpek;
+ ha_rows records;
+ uchar **sort_keys;
+ IO_CACHE tempfile, tempfile_for_exceptions;
+ DBUG_ENTER("_ma_create_index_by_sort");
+ DBUG_PRINT("enter",("sort_length: %d", info->key_length));
+
+ if (info->keyinfo->flag & HA_VAR_LENGTH_KEY)
+ {
+ info->write_keys= write_keys_varlen;
+ info->read_to_buffer=read_to_buffer_varlen;
+ info->write_key=write_merge_key_varlen;
+ }
+ else
+ {
+ info->write_keys= write_keys;
+ info->read_to_buffer=read_to_buffer;
+ info->write_key=write_merge_key;
+ }
+
+ my_b_clear(&tempfile);
+ my_b_clear(&tempfile_for_exceptions);
+ bzero((char*) &buffpek,sizeof(buffpek));
+ sort_keys= (uchar **) NULL; error= 1;
+ maxbuffer=1;
+
+ memavl=max(sortbuff_size,MIN_SORT_MEMORY);
+ records= info->sort_info->max_records;
+ sort_length= info->key_length;
+ LINT_INIT(keys);
+
+ while (memavl >= MIN_SORT_MEMORY)
+ {
+ if ((records < UINT_MAX32) &&
+ ((my_off_t) (records + 1) *
+ (sort_length + sizeof(char*)) <= (my_off_t) memavl))
+ keys= records+1;
+ else
+ do
+ {
+ skr=maxbuffer;
+ if (memavl < sizeof(BUFFPEK)*(uint) maxbuffer ||
+ (keys=(memavl-sizeof(BUFFPEK)*(uint) maxbuffer)/
+ (sort_length+sizeof(char*))) <= 1 ||
+ keys < (uint) maxbuffer)
+ {
+ _ma_check_print_error(info->sort_info->param,
+ "maria_sort_buffer_size is too small");
+ goto err;
+ }
+ }
+ while ((maxbuffer= (int) (records/(keys-1)+1)) != skr);
+
+ if ((sort_keys=(uchar**) my_malloc(keys*(sort_length+sizeof(char*))+
+ HA_FT_MAXBYTELEN, MYF(0))))
+ {
+ if (my_init_dynamic_array(&buffpek, sizeof(BUFFPEK), maxbuffer,
+ maxbuffer/2))
+ {
+ my_free((uchar*) sort_keys,MYF(0));
+ sort_keys= 0;
+ }
+ else
+ break;
+ }
+ old_memavl=memavl;
+ if ((memavl=memavl/4*3) < MIN_SORT_MEMORY && old_memavl > MIN_SORT_MEMORY)
+ memavl=MIN_SORT_MEMORY;
+ }
+ if (memavl < MIN_SORT_MEMORY)
+ {
+ _ma_check_print_error(info->sort_info->param, "Maria sort buffer"
+ " too small"); /* purecov: tested */
+ goto err; /* purecov: tested */
+ }
+ (*info->lock_in_memory)(info->sort_info->param);/* Everything is allocated */
+
+ if (!no_messages)
+ printf(" - Searching for keys, allocating buffer for %d keys\n",keys);
+
+ if ((records=find_all_keys(info,keys,sort_keys,&buffpek,&maxbuffer,
+ &tempfile,&tempfile_for_exceptions))
+ == HA_POS_ERROR)
+ goto err; /* purecov: tested */
+ if (maxbuffer == 0)
+ {
+ if (!no_messages)
+ printf(" - Dumping %lu keys\n", (ulong) records);
+ if (write_index(info,sort_keys, (uint) records))
+ goto err; /* purecov: inspected */
+ }
+ else
+ {
+ keys=(keys*(sort_length+sizeof(char*)))/sort_length;
+ if (maxbuffer >= MERGEBUFF2)
+ {
+ if (!no_messages)
+ printf(" - Merging %lu keys\n", (ulong) records); /* purecov: tested */
+ if (merge_many_buff(info,keys,sort_keys,
+ dynamic_element(&buffpek,0,BUFFPEK *),&maxbuffer,&tempfile))
+ goto err; /* purecov: inspected */
+ }
+ if (flush_io_cache(&tempfile) ||
+ reinit_io_cache(&tempfile,READ_CACHE,0L,0,0))
+ goto err; /* purecov: inspected */
+ if (!no_messages)
+ printf(" - Last merge and dumping keys\n"); /* purecov: tested */
+ if (merge_index(info,keys,sort_keys,dynamic_element(&buffpek,0,BUFFPEK *),
+ maxbuffer,&tempfile))
+ goto err; /* purecov: inspected */
+ }
+
+ if (flush_maria_ft_buf(info) || _ma_flush_pending_blocks(info))
+ goto err;
+
+ if (my_b_inited(&tempfile_for_exceptions))
+ {
+ MARIA_HA *idx=info->sort_info->info;
+ uint keyno=info->key;
+ uint key_length, ref_length=idx->s->rec_reflength;
+
+ if (!no_messages)
+ printf(" - Adding exceptions\n"); /* purecov: tested */
+ if (flush_io_cache(&tempfile_for_exceptions) ||
+ reinit_io_cache(&tempfile_for_exceptions,READ_CACHE,0L,0,0))
+ goto err;
+
+ while (!my_b_read(&tempfile_for_exceptions,(uchar*)&key_length,
+ sizeof(key_length))
+ && !my_b_read(&tempfile_for_exceptions,(uchar*)sort_keys,
+ (uint) key_length))
+ {
+ if (_ma_ck_write(idx,keyno,(uchar*) sort_keys,key_length-ref_length))
+ goto err;
+ }
+ }
+
+ error =0;
+
+err:
+ if (sort_keys)
+ my_free((uchar*) sort_keys,MYF(0));
+ delete_dynamic(&buffpek);
+ close_cached_file(&tempfile);
+ close_cached_file(&tempfile_for_exceptions);
+
+ DBUG_RETURN(error ? -1 : 0);
+} /* _ma_create_index_by_sort */
+
+
+/* Search after all keys and place them in a temp. file */
+
+static ha_rows NEAR_F find_all_keys(MARIA_SORT_PARAM *info, uint keys,
+ uchar **sort_keys, DYNAMIC_ARRAY *buffpek,
+ int *maxbuffer, IO_CACHE *tempfile,
+ IO_CACHE *tempfile_for_exceptions)
+{
+ int error;
+ uint idx;
+ DBUG_ENTER("find_all_keys");
+
+ idx=error=0;
+ sort_keys[0]= (uchar*) (sort_keys+keys);
+
+ while (!(error=(*info->key_read)(info,sort_keys[idx])))
+ {
+ if (info->real_key_length > info->key_length)
+ {
+ if (write_key(info,sort_keys[idx],tempfile_for_exceptions))
+ DBUG_RETURN(HA_POS_ERROR); /* purecov: inspected */
+ continue;
+ }
+
+ if (++idx == keys)
+ {
+ if (info->write_keys(info,sort_keys,idx-1,
+ (BUFFPEK *)alloc_dynamic(buffpek),
+ tempfile))
+ DBUG_RETURN(HA_POS_ERROR); /* purecov: inspected */
+
+ sort_keys[0]=(uchar*) (sort_keys+keys);
+ memcpy(sort_keys[0],sort_keys[idx-1],(size_t) info->key_length);
+ idx=1;
+ }
+ sort_keys[idx]=sort_keys[idx-1]+info->key_length;
+ }
+ if (error > 0)
+ DBUG_RETURN(HA_POS_ERROR); /* Aborted by get_key */ /* purecov: inspected */
+ if (buffpek->elements)
+ {
+ if (info->write_keys(info,sort_keys,idx,(BUFFPEK *)alloc_dynamic(buffpek),
+ tempfile))
+ DBUG_RETURN(HA_POS_ERROR); /* purecov: inspected */
+ *maxbuffer=buffpek->elements-1;
+ }
+ else
+ *maxbuffer=0;
+
+ DBUG_RETURN((*maxbuffer)*(keys-1)+idx);
+} /* find_all_keys */
+
+
+#ifdef THREAD
+/* Search after all keys and place them in a temp. file */
+
+pthread_handler_t _ma_thr_find_all_keys(void *arg)
+{
+ MARIA_SORT_PARAM *sort_param= (MARIA_SORT_PARAM*) arg;
+ int error;
+ uint memavl,old_memavl,keys,sort_length;
+ uint idx, maxbuffer;
+ uchar **sort_keys=0;
+
+ LINT_INIT(keys);
+
+ error=1;
+
+ if (my_thread_init())
+ goto err;
+
+ { /* Add extra block since DBUG_ENTER declare variables */
+ DBUG_ENTER("_ma_thr_find_all_keys");
+ DBUG_PRINT("enter", ("master: %d", sort_param->master));
+ if (sort_param->sort_info->got_error)
+ goto err;
+
+ if (sort_param->keyinfo->flag & HA_VAR_LENGTH_KEY)
+ {
+ sort_param->write_keys= write_keys_varlen;
+ sort_param->read_to_buffer= read_to_buffer_varlen;
+ sort_param->write_key= write_merge_key_varlen;
+ }
+ else
+ {
+ sort_param->write_keys= write_keys;
+ sort_param->read_to_buffer= read_to_buffer;
+ sort_param->write_key= write_merge_key;
+ }
+
+ my_b_clear(&sort_param->tempfile);
+ my_b_clear(&sort_param->tempfile_for_exceptions);
+ bzero((char*) &sort_param->buffpek,sizeof(sort_param->buffpek));
+ bzero((char*) &sort_param->unique, sizeof(sort_param->unique));
+
+ memavl= max(sort_param->sortbuff_size, MIN_SORT_MEMORY);
+ idx= sort_param->sort_info->max_records;
+ sort_length= sort_param->key_length;
+ maxbuffer= 1;
+
+ while (memavl >= MIN_SORT_MEMORY)
+ {
+ if ((my_off_t) (idx+1)*(sort_length+sizeof(char*)) <=
+ (my_off_t) memavl)
+ keys= idx+1;
+ else
+ {
+ uint skr;
+ do
+ {
+ skr= maxbuffer;
+ if (memavl < sizeof(BUFFPEK)*maxbuffer ||
+ (keys=(memavl-sizeof(BUFFPEK)*maxbuffer)/
+ (sort_length+sizeof(char*))) <= 1 ||
+ keys < (uint) maxbuffer)
+ {
+ _ma_check_print_error(sort_param->sort_info->param,
+ "maria_sort_buffer_size is too small");
+ goto err;
+ }
+ }
+ while ((maxbuffer= (int) (idx/(keys-1)+1)) != skr);
+ }
+ if ((sort_keys= (uchar **)
+ my_malloc(keys*(sort_length+sizeof(char*))+
+ ((sort_param->keyinfo->flag & HA_FULLTEXT) ?
+ HA_FT_MAXBYTELEN : 0), MYF(0))))
+ {
+ if (my_init_dynamic_array(&sort_param->buffpek, sizeof(BUFFPEK),
+ maxbuffer, maxbuffer/2))
+ {
+ my_free((uchar*) sort_keys,MYF(0));
+ sort_keys= (uchar **) NULL; /* for err: label */
+ }
+ else
+ break;
+ }
+ old_memavl= memavl;
+ if ((memavl= memavl/4*3) < MIN_SORT_MEMORY &&
+ old_memavl > MIN_SORT_MEMORY)
+ memavl= MIN_SORT_MEMORY;
+ }
+ if (memavl < MIN_SORT_MEMORY)
+ {
+ _ma_check_print_error(sort_param->sort_info->param,
+ "Maria sort buffer too small");
+ goto err; /* purecov: tested */
+ }
+
+ if (sort_param->sort_info->param->testflag & T_VERBOSE)
+ printf("Key %d - Allocating buffer for %d keys\n",
+ sort_param->key+1, keys);
+ sort_param->sort_keys= sort_keys;
+
+ idx= error= 0;
+ sort_keys[0]= (uchar*) (sort_keys+keys);
+
+ DBUG_PRINT("info", ("reading keys"));
+ while (!(error= sort_param->sort_info->got_error) &&
+ !(error= (*sort_param->key_read)(sort_param, sort_keys[idx])))
+ {
+ if (sort_param->real_key_length > sort_param->key_length)
+ {
+ if (write_key(sort_param,sort_keys[idx],
+ &sort_param->tempfile_for_exceptions))
+ goto err;
+ continue;
+ }
+
+ if (++idx == keys)
+ {
+ if (sort_param->write_keys(sort_param, sort_keys, idx - 1,
+ (BUFFPEK *)alloc_dynamic(&sort_param->
+ buffpek),
+ &sort_param->tempfile))
+ goto err;
+ sort_keys[0]= (uchar*) (sort_keys+keys);
+ memcpy(sort_keys[0], sort_keys[idx - 1],
+ (size_t) sort_param->key_length);
+ idx= 1;
+ }
+ sort_keys[idx]=sort_keys[idx - 1] + sort_param->key_length;
+ }
+ if (error > 0)
+ goto err;
+ if (sort_param->buffpek.elements)
+ {
+ if (sort_param->write_keys(sort_param,sort_keys, idx,
+ (BUFFPEK *) alloc_dynamic(&sort_param->
+ buffpek),
+ &sort_param->tempfile))
+ goto err;
+ sort_param->keys= (sort_param->buffpek.elements - 1) * (keys - 1) + idx;
+ }
+ else
+ sort_param->keys= idx;
+
+ sort_param->sort_keys_length= keys;
+ goto ok;
+
+err:
+ DBUG_PRINT("error", ("got some error"));
+ sort_param->sort_info->got_error= 1; /* no need to protect with a mutex */
+ my_free((uchar*) sort_keys,MYF(MY_ALLOW_ZERO_PTR));
+ sort_param->sort_keys=0;
+ delete_dynamic(& sort_param->buffpek);
+ close_cached_file(&sort_param->tempfile);
+ close_cached_file(&sort_param->tempfile_for_exceptions);
+
+ok:
+ free_root(&sort_param->wordroot, MYF(0));
+ /*
+ Detach from the share if the writer is involved. Avoid others to
+ be blocked. This includes a flush of the write buffer. This will
+ also indicate EOF to the readers.
+ */
+ if (sort_param->sort_info->info->rec_cache.share)
+ remove_io_thread(&sort_param->sort_info->info->rec_cache);
+
+ /* Readers detach from the share if any. Avoid others to be blocked. */
+ if (sort_param->read_cache.share)
+ remove_io_thread(&sort_param->read_cache);
+
+ pthread_mutex_lock(&sort_param->sort_info->mutex);
+ if (!--sort_param->sort_info->threads_running)
+ pthread_cond_signal(&sort_param->sort_info->cond);
+ pthread_mutex_unlock(&sort_param->sort_info->mutex);
+ DBUG_PRINT("exit", ("======== ending thread ========"));
+ }
+ my_thread_end();
+ return NULL;
+}
+
+
+int _ma_thr_write_keys(MARIA_SORT_PARAM *sort_param)
+{
+ MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+ HA_CHECK *param=sort_info->param;
+ ulong length, keys;
+ ulong *rec_per_key_part=param->rec_per_key_part;
+ int got_error=sort_info->got_error;
+ uint i;
+ MARIA_HA *info=sort_info->info;
+ MARIA_SHARE *share=info->s;
+ MARIA_SORT_PARAM *sinfo;
+ uchar *mergebuf=0;
+ DBUG_ENTER("_ma_thr_write_keys");
+ LINT_INIT(length);
+
+ for (i= 0, sinfo= sort_param ;
+ i < sort_info->total_keys ;
+ i++, rec_per_key_part+=sinfo->keyinfo->keysegs, sinfo++)
+ {
+ if (!sinfo->sort_keys)
+ {
+ got_error=1;
+ my_free(sinfo->rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+ continue;
+ }
+ if (!got_error)
+ {
+ maria_set_key_active(share->state.key_map, sinfo->key);
+
+ if (!sinfo->buffpek.elements)
+ {
+ if (param->testflag & T_VERBOSE)
+ {
+ printf("Key %d - Dumping %u keys\n",sinfo->key+1, sinfo->keys);
+ fflush(stdout);
+ }
+ if (write_index(sinfo, sinfo->sort_keys, sinfo->keys) ||
+ flush_maria_ft_buf(sinfo) || _ma_flush_pending_blocks(sinfo))
+ got_error=1;
+ }
+ if (!got_error && param->testflag & T_STATISTICS)
+ maria_update_key_parts(sinfo->keyinfo, rec_per_key_part, sinfo->unique,
+ param->stats_method == MI_STATS_METHOD_IGNORE_NULLS?
+ sinfo->notnull: NULL,
+ (ulonglong) info->state->records);
+ }
+ my_free((uchar*) sinfo->sort_keys,MYF(0));
+ my_free(sinfo->rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+ sinfo->sort_keys=0;
+ }
+
+ for (i= 0, sinfo= sort_param ;
+ i < sort_info->total_keys ;
+ i++,
+ delete_dynamic(&sinfo->buffpek),
+ close_cached_file(&sinfo->tempfile),
+ close_cached_file(&sinfo->tempfile_for_exceptions),
+ sinfo++)
+ {
+ if (got_error)
+ continue;
+ if (sinfo->keyinfo->flag & HA_VAR_LENGTH_KEY)
+ {
+ sinfo->write_keys=write_keys_varlen;
+ sinfo->read_to_buffer=read_to_buffer_varlen;
+ sinfo->write_key=write_merge_key_varlen;
+ }
+ else
+ {
+ sinfo->write_keys=write_keys;
+ sinfo->read_to_buffer=read_to_buffer;
+ sinfo->write_key=write_merge_key;
+ }
+ if (sinfo->buffpek.elements)
+ {
+ uint maxbuffer=sinfo->buffpek.elements-1;
+ if (!mergebuf)
+ {
+ length=param->sort_buffer_length;
+ while (length >= MIN_SORT_MEMORY && !mergebuf)
+ {
+ mergebuf=my_malloc(length, MYF(0));
+ length=length*3/4;
+ }
+ if (!mergebuf)
+ {
+ got_error=1;
+ continue;
+ }
+ }
+ keys=length/sinfo->key_length;
+ if (maxbuffer >= MERGEBUFF2)
+ {
+ if (param->testflag & T_VERBOSE)
+ printf("Key %d - Merging %u keys\n",sinfo->key+1, sinfo->keys);
+ if (merge_many_buff(sinfo, keys, (uchar **) mergebuf,
+ dynamic_element(&sinfo->buffpek, 0, BUFFPEK *),
+ (int*) &maxbuffer, &sinfo->tempfile))
+ {
+ got_error=1;
+ continue;
+ }
+ }
+ if (flush_io_cache(&sinfo->tempfile) ||
+ reinit_io_cache(&sinfo->tempfile,READ_CACHE,0L,0,0))
+ {
+ got_error=1;
+ continue;
+ }
+ if (param->testflag & T_VERBOSE)
+ printf("Key %d - Last merge and dumping keys\n", sinfo->key+1);
+ if (merge_index(sinfo, keys, (uchar**) mergebuf,
+ dynamic_element(&sinfo->buffpek,0,BUFFPEK *),
+ maxbuffer,&sinfo->tempfile) ||
+ flush_maria_ft_buf(sinfo) ||
+ _ma_flush_pending_blocks(sinfo))
+ {
+ got_error=1;
+ continue;
+ }
+ }
+ if (my_b_inited(&sinfo->tempfile_for_exceptions))
+ {
+ uint key_length;
+
+ if (param->testflag & T_VERBOSE)
+ printf("Key %d - Dumping 'long' keys\n", sinfo->key+1);
+
+ if (flush_io_cache(&sinfo->tempfile_for_exceptions) ||
+ reinit_io_cache(&sinfo->tempfile_for_exceptions,READ_CACHE,0L,0,0))
+ {
+ got_error=1;
+ continue;
+ }
+
+ while (!got_error &&
+ !my_b_read(&sinfo->tempfile_for_exceptions,(uchar*)&key_length,
+ sizeof(key_length)))
+ {
+ uchar maria_ft_buf[HA_FT_MAXBYTELEN + HA_FT_WLEN + 10];
+ if (key_length > sizeof(maria_ft_buf) ||
+ my_b_read(&sinfo->tempfile_for_exceptions, (uchar*)maria_ft_buf,
+ (uint)key_length) ||
+ _ma_ck_write(info, sinfo->key, maria_ft_buf,
+ key_length - info->s->rec_reflength))
+ got_error=1;
+ }
+ }
+ }
+ my_free((uchar*) mergebuf,MYF(MY_ALLOW_ZERO_PTR));
+ DBUG_RETURN(got_error);
+}
+#endif /* THREAD */
+
+/* Write all keys in memory to file for later merge */
+
+static int write_keys(MARIA_SORT_PARAM *info, register uchar **sort_keys,
+ uint count, BUFFPEK *buffpek, IO_CACHE *tempfile)
+{
+ uchar **end;
+ uint sort_length=info->key_length;
+ DBUG_ENTER("write_keys");
+
+ qsort2((uchar*) sort_keys,count,sizeof(uchar*),(qsort2_cmp) info->key_cmp,
+ info);
+ if (!my_b_inited(tempfile) &&
+ open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST",
+ DISK_BUFFER_SIZE, info->sort_info->param->myf_rw))
+ DBUG_RETURN(1); /* purecov: inspected */
+
+ buffpek->file_pos=my_b_tell(tempfile);
+ buffpek->count=count;
+
+ for (end=sort_keys+count ; sort_keys != end ; sort_keys++)
+ {
+ if (my_b_write(tempfile, *sort_keys, (uint) sort_length))
+ DBUG_RETURN(1); /* purecov: inspected */
+ }
+ DBUG_RETURN(0);
+} /* write_keys */
+
+
+static inline int
+my_var_write(MARIA_SORT_PARAM *info, IO_CACHE *to_file, uchar *bufs)
+{
+ int err;
+ uint16 len= _ma_keylength(info->keyinfo, bufs);
+
+ /* The following is safe as this is a local file */
+ if ((err= my_b_write(to_file, (uchar*)&len, sizeof(len))))
+ return (err);
+ if ((err= my_b_write(to_file,bufs, (uint) len)))
+ return (err);
+ return (0);
+}
+
+
+static int NEAR_F write_keys_varlen(MARIA_SORT_PARAM *info,
+ register uchar **sort_keys,
+ uint count, BUFFPEK *buffpek,
+ IO_CACHE *tempfile)
+{
+ uchar **end;
+ int err;
+ DBUG_ENTER("write_keys_varlen");
+
+ qsort2((uchar*) sort_keys,count,sizeof(uchar*),(qsort2_cmp) info->key_cmp,
+ info);
+ if (!my_b_inited(tempfile) &&
+ open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST",
+ DISK_BUFFER_SIZE, info->sort_info->param->myf_rw))
+ DBUG_RETURN(1); /* purecov: inspected */
+
+ buffpek->file_pos=my_b_tell(tempfile);
+ buffpek->count=count;
+ for (end=sort_keys+count ; sort_keys != end ; sort_keys++)
+ {
+ if ((err= my_var_write(info,tempfile, *sort_keys)))
+ DBUG_RETURN(err);
+ }
+ DBUG_RETURN(0);
+} /* write_keys_varlen */
+
+
+static int NEAR_F write_key(MARIA_SORT_PARAM *info, uchar *key,
+ IO_CACHE *tempfile)
+{
+ uint key_length=info->real_key_length;
+ DBUG_ENTER("write_key");
+
+ if (!my_b_inited(tempfile) &&
+ open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST",
+ DISK_BUFFER_SIZE, info->sort_info->param->myf_rw))
+ DBUG_RETURN(1);
+
+ if (my_b_write(tempfile, (uchar*)&key_length,sizeof(key_length)) ||
+ my_b_write(tempfile, key, (uint) key_length))
+ DBUG_RETURN(1);
+ DBUG_RETURN(0);
+} /* write_key */
+
+
+/* Write index */
+
+static int NEAR_F write_index(MARIA_SORT_PARAM *info,
+ register uchar **sort_keys,
+ register uint count)
+{
+ DBUG_ENTER("write_index");
+
+ qsort2((uchar*) sort_keys,(size_t) count,sizeof(uchar*),
+ (qsort2_cmp) info->key_cmp,info);
+ while (count--)
+ {
+ if ((*info->key_write)(info, *sort_keys++))
+ DBUG_RETURN(-1); /* purecov: inspected */
+ }
+ DBUG_RETURN(0);
+} /* write_index */
+
+
+ /* Merge buffers to make < MERGEBUFF2 buffers */
+
+static int NEAR_F merge_many_buff(MARIA_SORT_PARAM *info, uint keys,
+ uchar **sort_keys, BUFFPEK *buffpek,
+ int *maxbuffer, IO_CACHE *t_file)
+{
+ register int i;
+ IO_CACHE t_file2, *from_file, *to_file, *temp;
+ BUFFPEK *lastbuff;
+ DBUG_ENTER("merge_many_buff");
+
+ if (*maxbuffer < MERGEBUFF2)
+ DBUG_RETURN(0); /* purecov: inspected */
+ if (flush_io_cache(t_file) ||
+ open_cached_file(&t_file2,my_tmpdir(info->tmpdir),"ST",
+ DISK_BUFFER_SIZE, info->sort_info->param->myf_rw))
+ DBUG_RETURN(1); /* purecov: inspected */
+
+ from_file= t_file ; to_file= &t_file2;
+ while (*maxbuffer >= MERGEBUFF2)
+ {
+ reinit_io_cache(from_file,READ_CACHE,0L,0,0);
+ reinit_io_cache(to_file,WRITE_CACHE,0L,0,0);
+ lastbuff=buffpek;
+ for (i=0 ; i <= *maxbuffer-MERGEBUFF*3/2 ; i+=MERGEBUFF)
+ {
+ if (merge_buffers(info,keys,from_file,to_file,sort_keys,lastbuff++,
+ buffpek+i,buffpek+i+MERGEBUFF-1))
+ goto cleanup;
+ }
+ if (merge_buffers(info,keys,from_file,to_file,sort_keys,lastbuff++,
+ buffpek+i,buffpek+ *maxbuffer))
+ break; /* purecov: inspected */
+ if (flush_io_cache(to_file))
+ break; /* purecov: inspected */
+ temp=from_file; from_file=to_file; to_file=temp;
+ *maxbuffer= (int) (lastbuff-buffpek)-1;
+ }
+cleanup:
+ close_cached_file(to_file); /* This holds old result */
+ if (to_file == t_file)
+ *t_file=t_file2; /* Copy result file */
+
+ DBUG_RETURN(*maxbuffer >= MERGEBUFF2); /* Return 1 if interrupted */
+} /* merge_many_buff */
+
+
+/*
+ Read data to buffer
+
+ SYNOPSIS
+ read_to_buffer()
+ fromfile File to read from
+ buffpek Where to read from
+ sort_length max length to read
+ RESULT
+ > 0 Ammount of bytes read
+ -1 Error
+*/
+
+static uint NEAR_F read_to_buffer(IO_CACHE *fromfile, BUFFPEK *buffpek,
+ uint sort_length)
+{
+ register uint count;
+ uint length;
+
+ if ((count=(uint) min((ha_rows) buffpek->max_keys,buffpek->count)))
+ {
+ if (my_pread(fromfile->file,(uchar*) buffpek->base,
+ (length= sort_length*count),buffpek->file_pos,MYF_RW))
+ return((uint) -1); /* purecov: inspected */
+ buffpek->key=buffpek->base;
+ buffpek->file_pos+= length; /* New filepos */
+ buffpek->count-= count;
+ buffpek->mem_count= count;
+ }
+ return (count*sort_length);
+} /* read_to_buffer */
+
+static uint NEAR_F read_to_buffer_varlen(IO_CACHE *fromfile, BUFFPEK *buffpek,
+ uint sort_length)
+{
+ register uint count;
+ uint16 length_of_key = 0;
+ uint idx;
+ uchar *buffp;
+
+ if ((count=(uint) min((ha_rows) buffpek->max_keys,buffpek->count)))
+ {
+ buffp= buffpek->base;
+
+ for (idx=1;idx<=count;idx++)
+ {
+ if (my_pread(fromfile->file,(uchar*)&length_of_key,sizeof(length_of_key),
+ buffpek->file_pos,MYF_RW))
+ return((uint) -1);
+ buffpek->file_pos+=sizeof(length_of_key);
+ if (my_pread(fromfile->file,(uchar*) buffp,length_of_key,
+ buffpek->file_pos,MYF_RW))
+ return((uint) -1);
+ buffpek->file_pos+=length_of_key;
+ buffp = buffp + sort_length;
+ }
+ buffpek->key=buffpek->base;
+ buffpek->count-= count;
+ buffpek->mem_count= count;
+ }
+ return (count*sort_length);
+} /* read_to_buffer_varlen */
+
+
+static int NEAR_F write_merge_key_varlen(MARIA_SORT_PARAM *info,
+ IO_CACHE *to_file,char* key,
+ uint sort_length, uint count)
+{
+ uint idx;
+
+ char *bufs = key;
+ for (idx=1;idx<=count;idx++)
+ {
+ int err;
+ if ((err= my_var_write(info,to_file, (uchar*) bufs)))
+ return (err);
+ bufs=bufs+sort_length;
+ }
+ return(0);
+}
+
+
+static int NEAR_F write_merge_key(MARIA_SORT_PARAM *info __attribute__((unused)),
+ IO_CACHE *to_file, char* key,
+ uint sort_length, uint count)
+{
+ return my_b_write(to_file,(uchar*) key,(uint) sort_length*count);
+}
+
+/*
+ Merge buffers to one buffer
+ If to_file == 0 then use info->key_write
+*/
+
+static int NEAR_F
+merge_buffers(MARIA_SORT_PARAM *info, uint keys, IO_CACHE *from_file,
+ IO_CACHE *to_file, uchar **sort_keys, BUFFPEK *lastbuff,
+ BUFFPEK *Fb, BUFFPEK *Tb)
+{
+ int error;
+ uint sort_length,maxcount;
+ ha_rows count;
+ my_off_t to_start_filepos;
+ uchar *strpos;
+ BUFFPEK *buffpek,**refpek;
+ QUEUE queue;
+ volatile int *killed= _ma_killed_ptr(info->sort_info->param);
+ DBUG_ENTER("merge_buffers");
+
+ count=error=0;
+ maxcount=keys/((uint) (Tb-Fb) +1);
+ LINT_INIT(to_start_filepos);
+ if (to_file)
+ to_start_filepos=my_b_tell(to_file);
+ strpos= (uchar*) sort_keys;
+ sort_length=info->key_length;
+
+ if (init_queue(&queue,(uint) (Tb-Fb)+1,offsetof(BUFFPEK,key),0,
+ (int (*)(void*, uchar *,uchar*)) info->key_cmp,
+ (void*) info))
+ DBUG_RETURN(1); /* purecov: inspected */
+
+ for (buffpek= Fb ; buffpek <= Tb ; buffpek++)
+ {
+ count+= buffpek->count;
+ buffpek->base= strpos;
+ buffpek->max_keys=maxcount;
+ strpos+= (uint) (error=(int) info->read_to_buffer(from_file,buffpek,
+ sort_length));
+ if (error == -1)
+ goto err; /* purecov: inspected */
+ queue_insert(&queue,(char*) buffpek);
+ }
+
+ while (queue.elements > 1)
+ {
+ for (;;)
+ {
+ if (*killed)
+ {
+ error=1; goto err;
+ }
+ buffpek=(BUFFPEK*) queue_top(&queue);
+ if (to_file)
+ {
+ if (info->write_key(info,to_file,(uchar*) buffpek->key,
+ (uint) sort_length,1))
+ {
+ error=1; goto err; /* purecov: inspected */
+ }
+ }
+ else
+ {
+ if ((*info->key_write)(info,(void*) buffpek->key))
+ {
+ error=1; goto err; /* purecov: inspected */
+ }
+ }
+ buffpek->key+=sort_length;
+ if (! --buffpek->mem_count)
+ {
+ if (!(error=(int) info->read_to_buffer(from_file,buffpek,sort_length)))
+ {
+ uchar *base= buffpek->base;
+ uint max_keys=buffpek->max_keys;
+
+ VOID(queue_remove(&queue,0));
+
+ /* Put room used by buffer to use in other buffer */
+ for (refpek= (BUFFPEK**) &queue_top(&queue);
+ refpek <= (BUFFPEK**) &queue_end(&queue);
+ refpek++)
+ {
+ buffpek= *refpek;
+ if (buffpek->base+buffpek->max_keys*sort_length == base)
+ {
+ buffpek->max_keys+=max_keys;
+ break;
+ }
+ else if (base+max_keys*sort_length == buffpek->base)
+ {
+ buffpek->base=base;
+ buffpek->max_keys+=max_keys;
+ break;
+ }
+ }
+ break; /* One buffer have been removed */
+ }
+ }
+ else if (error == -1)
+ goto err; /* purecov: inspected */
+ queue_replaced(&queue); /* Top element has been replaced */
+ }
+ }
+ buffpek=(BUFFPEK*) queue_top(&queue);
+ buffpek->base= (uchar*) sort_keys;
+ buffpek->max_keys=keys;
+ do
+ {
+ if (to_file)
+ {
+ if (info->write_key(info,to_file,(uchar*) buffpek->key,
+ sort_length,buffpek->mem_count))
+ {
+ error=1; goto err; /* purecov: inspected */
+ }
+ }
+ else
+ {
+ register uchar *end;
+ strpos= buffpek->key;
+ for (end= strpos+buffpek->mem_count*sort_length;
+ strpos != end ;
+ strpos+=sort_length)
+ {
+ if ((*info->key_write)(info, (uchar*) strpos))
+ {
+ error=1; goto err; /* purecov: inspected */
+ }
+ }
+ }
+ }
+ while ((error=(int) info->read_to_buffer(from_file,buffpek,sort_length)) !=
+ -1 && error != 0);
+
+ lastbuff->count=count;
+ if (to_file)
+ lastbuff->file_pos=to_start_filepos;
+err:
+ delete_queue(&queue);
+ DBUG_RETURN(error);
+} /* merge_buffers */
+
+
+ /* Do a merge to output-file (save only positions) */
+
+static int NEAR_F
+merge_index(MARIA_SORT_PARAM *info, uint keys, uchar **sort_keys,
+ BUFFPEK *buffpek, int maxbuffer, IO_CACHE *tempfile)
+{
+ DBUG_ENTER("merge_index");
+ if (merge_buffers(info,keys,tempfile,(IO_CACHE*) 0,sort_keys,buffpek,buffpek,
+ buffpek+maxbuffer))
+ DBUG_RETURN(1); /* purecov: inspected */
+ DBUG_RETURN(0);
+} /* merge_index */
+
+
+static int flush_maria_ft_buf(MARIA_SORT_PARAM *info)
+{
+ int err=0;
+ if (info->sort_info->ft_buf)
+ {
+ err=_ma_sort_ft_buf_flush(info);
+ my_free((uchar*)info->sort_info->ft_buf, MYF(0));
+ info->sort_info->ft_buf=0;
+ }
+ return err;
+}
+
diff --git a/storage/maria/ma_sp_defs.h b/storage/maria/ma_sp_defs.h
new file mode 100644
index 00000000000..a70695bea3a
--- /dev/null
+++ b/storage/maria/ma_sp_defs.h
@@ -0,0 +1,47 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+ & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _SP_DEFS_H
+#define _SP_DEFS_H
+
+#define SPDIMS 2
+#define SPTYPE HA_KEYTYPE_DOUBLE
+#define SPLEN 8
+
+#ifdef HAVE_SPATIAL
+
+enum wkbType
+{
+ wkbPoint = 1,
+ wkbLineString = 2,
+ wkbPolygon = 3,
+ wkbMultiPoint = 4,
+ wkbMultiLineString = 5,
+ wkbMultiPolygon = 6,
+ wkbGeometryCollection = 7
+};
+
+enum wkbByteOrder
+{
+ wkbXDR = 0, /* Big Endian */
+ wkbNDR = 1 /* Little Endian */
+};
+
+uint _ma_sp_make_key(register MARIA_HA *info, uint keynr, uchar *key,
+ const uchar *record, my_off_t filepos);
+
+#endif /*HAVE_SPATIAL*/
+#endif /* _SP_DEFS_H */
diff --git a/storage/maria/ma_sp_key.c b/storage/maria/ma_sp_key.c
new file mode 100644
index 00000000000..1ea9b410ab6
--- /dev/null
+++ b/storage/maria/ma_sp_key.c
@@ -0,0 +1,299 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+
+#ifdef HAVE_SPATIAL
+
+#include "ma_sp_defs.h"
+
+static int sp_add_point_to_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+ uchar byte_order, double *mbr);
+static int sp_get_point_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+ uchar byte_order, double *mbr);
+static int sp_get_linestring_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+ uchar byte_order, double *mbr);
+static int sp_get_polygon_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+ uchar byte_order, double *mbr);
+static int sp_get_geometry_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+ double *mbr, int top);
+static int sp_mbr_from_wkb(uchar (*wkb), uint size, uint n_dims, double *mbr);
+
+static void get_double(double *d, const uchar *pos)
+{
+ float8get(*d, pos);
+}
+
+uint _ma_sp_make_key(register MARIA_HA *info, uint keynr, uchar *key,
+ const uchar *record, my_off_t filepos)
+{
+ HA_KEYSEG *keyseg;
+ MARIA_KEYDEF *keyinfo = &info->s->keyinfo[keynr];
+ uint len = 0;
+ uchar *pos;
+ uint dlen;
+ uchar *dptr;
+ double mbr[SPDIMS * 2];
+ uint i;
+
+ keyseg = &keyinfo->seg[-1];
+ pos = (uchar*)record + keyseg->start;
+
+ dlen = _ma_calc_blob_length(keyseg->bit_start, pos);
+ memcpy_fixed(&dptr, pos + keyseg->bit_start, sizeof(char*));
+ if (!dptr)
+ {
+ my_errno= HA_ERR_NULL_IN_SPATIAL;
+ return 0;
+ }
+ sp_mbr_from_wkb(dptr + 4, dlen - 4, SPDIMS, mbr); /* SRID */
+
+ for (i = 0, keyseg = keyinfo->seg; keyseg->type; keyseg++, i++)
+ {
+ uint length = keyseg->length;
+
+ pos = ((uchar*)mbr) + keyseg->start;
+ if (keyseg->flag & HA_SWAP_KEY)
+ {
+#ifdef HAVE_ISNAN
+ if (keyseg->type == HA_KEYTYPE_FLOAT)
+ {
+ float nr;
+ float4get(nr, pos);
+ if (isnan(nr))
+ {
+ /* Replace NAN with zero */
+ bzero(key, length);
+ key+= length;
+ continue;
+ }
+ }
+ else if (keyseg->type == HA_KEYTYPE_DOUBLE)
+ {
+ double nr;
+ get_double(&nr, pos);
+ if (isnan(nr))
+ {
+ bzero(key, length);
+ key+= length;
+ continue;
+ }
+ }
+#endif
+ pos += length;
+ while (length--)
+ {
+ *key++ = *--pos;
+ }
+ }
+ else
+ {
+ memcpy((uchar*)key, pos, length);
+ key += keyseg->length;
+ }
+ len += keyseg->length;
+ }
+ _ma_dpointer(info, key, filepos);
+ return len;
+}
+
+/*
+Calculate minimal bounding rectangle (mbr) of the spatial object
+stored in "well-known binary representation" (wkb) format.
+*/
+static int sp_mbr_from_wkb(uchar *wkb, uint size, uint n_dims, double *mbr)
+{
+ uint i;
+
+ for (i=0; i < n_dims; ++i)
+ {
+ mbr[i * 2] = DBL_MAX;
+ mbr[i * 2 + 1] = -DBL_MAX;
+ }
+
+ return sp_get_geometry_mbr(&wkb, wkb + size, n_dims, mbr, 1);
+}
+
+/*
+ Add one point stored in wkb to mbr
+*/
+
+static int sp_add_point_to_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+ uchar byte_order __attribute__((unused)),
+ double *mbr)
+{
+ double ord;
+ double *mbr_end= mbr + n_dims * 2;
+
+ while (mbr < mbr_end)
+ {
+ if ((*wkb) > end - 8)
+ return -1;
+ get_double(&ord, (const uchar*) *wkb);
+ (*wkb)+= 8;
+ if (ord < *mbr)
+ float8store((char*) mbr, ord);
+ mbr++;
+ if (ord > *mbr)
+ float8store((char*) mbr, ord);
+ mbr++;
+ }
+ return 0;
+}
+
+
+static int sp_get_point_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+ uchar byte_order, double *mbr)
+{
+ return sp_add_point_to_mbr(wkb, end, n_dims, byte_order, mbr);
+}
+
+
+static int sp_get_linestring_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+ uchar byte_order, double *mbr)
+{
+ uint n_points;
+
+ n_points = uint4korr(*wkb);
+ (*wkb) += 4;
+ for (; n_points > 0; --n_points)
+ {
+ /* Add next point to mbr */
+ if (sp_add_point_to_mbr(wkb, end, n_dims, byte_order, mbr))
+ return -1;
+ }
+ return 0;
+}
+
+
+static int sp_get_polygon_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+ uchar byte_order, double *mbr)
+{
+ uint n_linear_rings;
+ uint n_points;
+
+ n_linear_rings = uint4korr((*wkb));
+ (*wkb) += 4;
+
+ for (; n_linear_rings > 0; --n_linear_rings)
+ {
+ n_points = uint4korr((*wkb));
+ (*wkb) += 4;
+ for (; n_points > 0; --n_points)
+ {
+ /* Add next point to mbr */
+ if (sp_add_point_to_mbr(wkb, end, n_dims, byte_order, mbr))
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static int sp_get_geometry_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+ double *mbr, int top)
+{
+ int res;
+ uchar byte_order;
+ uint wkb_type;
+
+ byte_order = *(*wkb);
+ ++(*wkb);
+
+ wkb_type = uint4korr((*wkb));
+ (*wkb) += 4;
+
+ switch ((enum wkbType) wkb_type)
+ {
+ case wkbPoint:
+ res = sp_get_point_mbr(wkb, end, n_dims, byte_order, mbr);
+ break;
+ case wkbLineString:
+ res = sp_get_linestring_mbr(wkb, end, n_dims, byte_order, mbr);
+ break;
+ case wkbPolygon:
+ res = sp_get_polygon_mbr(wkb, end, n_dims, byte_order, mbr);
+ break;
+ case wkbMultiPoint:
+ {
+ uint n_items;
+ n_items = uint4korr((*wkb));
+ (*wkb) += 4;
+ for (; n_items > 0; --n_items)
+ {
+ byte_order = *(*wkb);
+ ++(*wkb);
+ (*wkb) += 4;
+ if (sp_get_point_mbr(wkb, end, n_dims, byte_order, mbr))
+ return -1;
+ }
+ res = 0;
+ break;
+ }
+ case wkbMultiLineString:
+ {
+ uint n_items;
+ n_items = uint4korr((*wkb));
+ (*wkb) += 4;
+ for (; n_items > 0; --n_items)
+ {
+ byte_order = *(*wkb);
+ ++(*wkb);
+ (*wkb) += 4;
+ if (sp_get_linestring_mbr(wkb, end, n_dims, byte_order, mbr))
+ return -1;
+ }
+ res = 0;
+ break;
+ }
+ case wkbMultiPolygon:
+ {
+ uint n_items;
+ n_items = uint4korr((*wkb));
+ (*wkb) += 4;
+ for (; n_items > 0; --n_items)
+ {
+ byte_order = *(*wkb);
+ ++(*wkb);
+ (*wkb) += 4;
+ if (sp_get_polygon_mbr(wkb, end, n_dims, byte_order, mbr))
+ return -1;
+ }
+ res = 0;
+ break;
+ }
+ case wkbGeometryCollection:
+ {
+ uint n_items;
+
+ if (!top)
+ return -1;
+
+ n_items = uint4korr((*wkb));
+ (*wkb) += 4;
+ for (; n_items > 0; --n_items)
+ {
+ if (sp_get_geometry_mbr(wkb, end, n_dims, mbr, 0))
+ return -1;
+ }
+ res = 0;
+ break;
+ }
+ default:
+ res = -1;
+ }
+ return res;
+}
+
+#endif /*HAVE_SPATIAL*/
diff --git a/storage/maria/ma_sp_test.c b/storage/maria/ma_sp_test.c
new file mode 100644
index 00000000000..7a413f68135
--- /dev/null
+++ b/storage/maria/ma_sp_test.c
@@ -0,0 +1,568 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Testing of the basic functions of a MARIA spatial table */
+/* Written by Alex Barkov, who has a shared copyright to this code */
+
+#include "maria.h"
+
+#ifdef HAVE_SPATIAL
+#include "ma_sp_defs.h"
+
+#define MAX_REC_LENGTH 1024
+#define KEYALG HA_KEY_ALG_RTREE
+
+static void create_linestring(char *record,uint rownr);
+static void print_record(char * record,my_off_t offs,const char * tail);
+
+static void create_key(char *key,uint rownr);
+static void print_key(const char *key,const char * tail);
+
+static int run_test(const char *filename);
+static int read_with_pos(MARIA_HA * file, int silent);
+
+static int maria_rtree_CreateLineStringWKB(double *ords, uint n_dims, uint n_points,
+ uchar *wkb);
+static void maria_rtree_PrintWKB(uchar *wkb, uint n_dims);
+
+static char blob_key[MAX_REC_LENGTH];
+
+
+int main(int argc __attribute__((unused)),char *argv[])
+{
+ MY_INIT(argv[0]);
+ maria_init();
+ exit(run_test("sp_test"));
+}
+
+
+int run_test(const char *filename)
+{
+ MARIA_HA *file;
+ MARIA_UNIQUEDEF uniquedef;
+ MARIA_CREATE_INFO create_info;
+ MARIA_COLUMNDEF recinfo[20];
+ MARIA_KEYDEF keyinfo[20];
+ HA_KEYSEG keyseg[20];
+ key_range min_range, max_range;
+ int silent=0;
+ int create_flag=0;
+ int null_fields=0;
+ int nrecords=30;
+ int uniques=0;
+ int i;
+ int error;
+ int row_count=0;
+ char record[MAX_REC_LENGTH];
+ char key[MAX_REC_LENGTH];
+ char read_record[MAX_REC_LENGTH];
+ int upd=10;
+ ha_rows hrows;
+
+ /* Define a column for NULLs and DEL markers*/
+
+ recinfo[0].type=FIELD_NORMAL;
+ recinfo[0].length=1; /* For NULL bits */
+
+
+ /* Define spatial column */
+
+ recinfo[1].type=FIELD_BLOB;
+ recinfo[1].length=4 + portable_sizeof_char_ptr;
+
+
+
+ /* Define a key with 1 spatial segment */
+
+ keyinfo[0].seg=keyseg;
+ keyinfo[0].keysegs=1;
+ keyinfo[0].flag=HA_SPATIAL;
+ keyinfo[0].key_alg=KEYALG;
+
+ keyinfo[0].seg[0].type= HA_KEYTYPE_BINARY;
+ keyinfo[0].seg[0].flag=0;
+ keyinfo[0].seg[0].start= 1;
+ keyinfo[0].seg[0].length=1; /* Spatial ignores it anyway */
+ keyinfo[0].seg[0].null_bit= null_fields ? 2 : 0;
+ keyinfo[0].seg[0].null_pos=0;
+ keyinfo[0].seg[0].language=default_charset_info->number;
+ keyinfo[0].seg[0].bit_start=4; /* Long BLOB */
+
+
+ if (!silent)
+ printf("- Creating isam-file\n");
+
+ bzero((char*) &create_info,sizeof(create_info));
+ create_info.max_rows=10000000;
+
+ if (maria_create(filename,
+ DYNAMIC_RECORD,
+ 1, /* keys */
+ keyinfo,
+ 2, /* columns */
+ recinfo,uniques,&uniquedef,&create_info,create_flag))
+ goto err;
+
+ if (!silent)
+ printf("- Open isam-file\n");
+
+ if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+ goto err;
+
+ if (!silent)
+ printf("- Writing key:s\n");
+
+ for (i=0; i<nrecords; i++ )
+ {
+ create_linestring(record,i);
+ error=maria_write(file,record);
+ print_record(record,maria_position(file),"\n");
+ if (!error)
+ {
+ row_count++;
+ }
+ else
+ {
+ printf("maria_write: %d\n", error);
+ goto err;
+ }
+ }
+
+ if ((error=read_with_pos(file,silent)))
+ goto err;
+
+ if (!silent)
+ printf("- Deleting rows with position\n");
+ for (i=0; i < nrecords/4; i++)
+ {
+ my_errno=0;
+ bzero((char*) read_record,MAX_REC_LENGTH);
+ error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR);
+ if (error)
+ {
+ printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno);
+ goto err;
+ }
+ print_record(read_record,maria_position(file),"\n");
+ error=maria_delete(file,read_record);
+ if (error)
+ {
+ printf("pos: %2d maria_delete: %3d errno: %3d\n",i,error,my_errno);
+ goto err;
+ }
+ }
+
+ if (!silent)
+ printf("- Updating rows with position\n");
+ for (i=0; i < nrecords/2 ; i++)
+ {
+ my_errno=0;
+ bzero((char*) read_record,MAX_REC_LENGTH);
+ error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR);
+ if (error)
+ {
+ if (error==HA_ERR_RECORD_DELETED)
+ continue;
+ printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno);
+ goto err;
+ }
+ print_record(read_record,maria_position(file),"");
+ create_linestring(record,i+nrecords*upd);
+ printf("\t-> ");
+ print_record(record,maria_position(file),"\n");
+ error=maria_update(file,read_record,record);
+ if (error)
+ {
+ printf("pos: %2d maria_update: %3d errno: %3d\n",i,error,my_errno);
+ goto err;
+ }
+ }
+
+ if ((error=read_with_pos(file,silent)))
+ goto err;
+
+ if (!silent)
+ printf("- Test maria_rkey then a sequence of maria_rnext_same\n");
+
+ create_key(key, nrecords*4/5);
+ print_key(key," search for INTERSECT\n");
+
+ if ((error=maria_rkey(file,read_record,0,key,0,HA_READ_MBR_INTERSECT)))
+ {
+ printf("maria_rkey: %3d errno: %3d\n",error,my_errno);
+ goto err;
+ }
+ print_record(read_record,maria_position(file)," maria_rkey\n");
+ row_count=1;
+
+ for (;;)
+ {
+ if ((error=maria_rnext_same(file,read_record)))
+ {
+ if (error==HA_ERR_END_OF_FILE)
+ break;
+ printf("maria_next: %3d errno: %3d\n",error,my_errno);
+ goto err;
+ }
+ print_record(read_record,maria_position(file)," maria_rnext_same\n");
+ row_count++;
+ }
+ printf(" %d rows\n",row_count);
+
+ if (!silent)
+ printf("- Test maria_rfirst then a sequence of maria_rnext\n");
+
+ error=maria_rfirst(file,read_record,0);
+ if (error)
+ {
+ printf("maria_rfirst: %3d errno: %3d\n",error,my_errno);
+ goto err;
+ }
+ row_count=1;
+ print_record(read_record,maria_position(file)," maria_frirst\n");
+
+ for(i=0;i<nrecords;i++) {
+ if ((error=maria_rnext(file,read_record,0)))
+ {
+ if (error==HA_ERR_END_OF_FILE)
+ break;
+ printf("maria_next: %3d errno: %3d\n",error,my_errno);
+ goto err;
+ }
+ print_record(read_record,maria_position(file)," maria_rnext\n");
+ row_count++;
+ }
+ printf(" %d rows\n",row_count);
+
+ if (!silent)
+ printf("- Test maria_records_in_range()\n");
+
+ create_key(key, nrecords*upd);
+ print_key(key," INTERSECT\n");
+ min_range.key= key;
+ min_range.length= 1000; /* Big enough */
+ min_range.flag= HA_READ_MBR_INTERSECT;
+ max_range.key= record+1;
+ max_range.length= 1000; /* Big enough */
+ max_range.flag= HA_READ_KEY_EXACT;
+ hrows= maria_records_in_range(file,0, &min_range, &max_range);
+ printf(" %ld rows\n", (long) hrows);
+
+ if (maria_close(file)) goto err;
+ maria_end();
+ my_end(MY_CHECK_ERROR);
+
+ return 0;
+
+err:
+ printf("got error: %3d when using maria-database\n",my_errno);
+ maria_end();
+ return 1; /* skip warning */
+}
+
+
+static int read_with_pos (MARIA_HA * file,int silent)
+{
+ int error;
+ int i;
+ char read_record[MAX_REC_LENGTH];
+ int rows=0;
+
+ if (!silent)
+ printf("- Reading rows with position\n");
+ for (i=0;;i++)
+ {
+ my_errno=0;
+ bzero((char*) read_record,MAX_REC_LENGTH);
+ error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR);
+ if (error)
+ {
+ if (error==HA_ERR_END_OF_FILE)
+ break;
+ if (error==HA_ERR_RECORD_DELETED)
+ continue;
+ printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno);
+ return error;
+ }
+ rows++;
+ print_record(read_record,maria_position(file),"\n");
+ }
+ printf(" %d rows\n",rows);
+ return 0;
+}
+
+
+#ifdef NOT_USED
+static void bprint_record(char * record,
+ my_off_t offs __attribute__((unused)),
+ const char * tail)
+{
+ int i;
+ char * pos;
+ i=(unsigned char)record[0];
+ printf("%02X ",i);
+
+ for( pos=record+1, i=0; i<32; i++,pos++)
+ {
+ int b=(unsigned char)*pos;
+ printf("%02X",b);
+ }
+ printf("%s",tail);
+}
+#endif
+
+
+static void print_record(char * record, my_off_t offs,const char * tail)
+{
+ char *pos;
+ char *ptr;
+ uint len;
+
+ printf(" rec=(%d)",(unsigned char)record[0]);
+ pos=record+1;
+ len=sint4korr(pos);
+ pos+=4;
+ printf(" len=%d ",len);
+ memcpy_fixed(&ptr,pos,sizeof(char*));
+ if (ptr)
+ maria_rtree_PrintWKB((uchar*) ptr,SPDIMS);
+ else
+ printf("<NULL> ");
+ printf(" offs=%ld ",(long int)offs);
+ printf("%s",tail);
+}
+
+
+#ifdef NOT_USED
+static void create_point(char *record,uint rownr)
+{
+ uint tmp;
+ char *ptr;
+ char *pos=record;
+ double x[200];
+ int i;
+
+ for(i=0;i<SPDIMS;i++)
+ x[i]=rownr;
+
+ bzero((char*) record,MAX_REC_LENGTH);
+ *pos=0x01; /* DEL marker */
+ pos++;
+
+ memset(blob_key,0,sizeof(blob_key));
+ tmp=maria_rtree_CreatePointWKB(x,SPDIMS,blob_key);
+
+ int4store(pos,tmp);
+ pos+=4;
+
+ ptr=blob_key;
+ memcpy_fixed(pos,&ptr,sizeof(char*));
+}
+#endif
+
+
+static void create_linestring(char *record,uint rownr)
+{
+ uint tmp;
+ char *ptr;
+ char *pos=record;
+ double x[200];
+ int i,j;
+ int npoints=2;
+
+ for(j=0;j<npoints;j++)
+ for(i=0;i<SPDIMS;i++)
+ x[i+j*SPDIMS]=rownr*j;
+
+ bzero((char*) record,MAX_REC_LENGTH);
+ *pos=0x01; /* DEL marker */
+ pos++;
+
+ memset(blob_key,0,sizeof(blob_key));
+ tmp=maria_rtree_CreateLineStringWKB(x,SPDIMS,npoints, (uchar*) blob_key);
+
+ int4store(pos,tmp);
+ pos+=4;
+
+ ptr=blob_key;
+ memcpy_fixed(pos,&ptr,sizeof(char*));
+}
+
+
+static void create_key(char *key,uint rownr)
+{
+ double c=rownr;
+ char *pos;
+ uint i;
+
+ bzero(key,MAX_REC_LENGTH);
+ for ( pos=key, i=0; i<2*SPDIMS; i++)
+ {
+ float8store(pos,c);
+ pos+=sizeof(c);
+ }
+}
+
+static void print_key(const char *key,const char * tail)
+{
+ double c;
+ uint i;
+
+ printf(" key=");
+ for (i=0; i<2*SPDIMS; i++)
+ {
+ float8get(c,key);
+ key+=sizeof(c);
+ printf("%.14g ",c);
+ }
+ printf("%s",tail);
+}
+
+
+#ifdef NOT_USED
+
+static int maria_rtree_CreatePointWKB(double *ords, uint n_dims, uchar *wkb)
+{
+ uint i;
+
+ *wkb = wkbXDR;
+ ++wkb;
+ int4store(wkb, wkbPoint);
+ wkb += 4;
+
+ for (i=0; i < n_dims; ++i)
+ {
+ float8store(wkb, ords[i]);
+ wkb += 8;
+ }
+ return 5 + n_dims * 8;
+}
+#endif
+
+
+static int maria_rtree_CreateLineStringWKB(double *ords, uint n_dims, uint n_points,
+ uchar *wkb)
+{
+ uint i;
+ uint n_ords = n_dims * n_points;
+
+ *wkb = wkbXDR;
+ ++wkb;
+ int4store(wkb, wkbLineString);
+ wkb += 4;
+ int4store(wkb, n_points);
+ wkb += 4;
+ for (i=0; i < n_ords; ++i)
+ {
+ float8store(wkb, ords[i]);
+ wkb += 8;
+ }
+ return 9 + n_points * n_dims * 8;
+}
+
+
+static void maria_rtree_PrintWKB(uchar *wkb, uint n_dims)
+{
+ uint wkb_type;
+
+ ++wkb;
+ wkb_type = uint4korr(wkb);
+ wkb += 4;
+
+ switch ((enum wkbType)wkb_type)
+ {
+ case wkbPoint:
+ {
+ uint i;
+ double ord;
+
+ printf("POINT(");
+ for (i=0; i < n_dims; ++i)
+ {
+ float8get(ord, wkb);
+ wkb += 8;
+ printf("%.14g", ord);
+ if (i < n_dims - 1)
+ printf(" ");
+ else
+ printf(")");
+ }
+ break;
+ }
+ case wkbLineString:
+ {
+ uint p, i;
+ uint n_points;
+ double ord;
+
+ printf("LineString(");
+ n_points = uint4korr(wkb);
+ wkb += 4;
+ for (p=0; p < n_points; ++p)
+ {
+ for (i=0; i < n_dims; ++i)
+ {
+ float8get(ord, wkb);
+ wkb += 8;
+ printf("%.14g", ord);
+ if (i < n_dims - 1)
+ printf(" ");
+ }
+ if (p < n_points - 1)
+ printf(", ");
+ else
+ printf(")");
+ }
+ break;
+ }
+ case wkbPolygon:
+ {
+ printf("POLYGON(...)");
+ break;
+ }
+ case wkbMultiPoint:
+ {
+ printf("MULTIPOINT(...)");
+ break;
+ }
+ case wkbMultiLineString:
+ {
+ printf("MULTILINESTRING(...)");
+ break;
+ }
+ case wkbMultiPolygon:
+ {
+ printf("MULTIPOLYGON(...)");
+ break;
+ }
+ case wkbGeometryCollection:
+ {
+ printf("GEOMETRYCOLLECTION(...)");
+ break;
+ }
+ default:
+ {
+ printf("UNKNOWN GEOMETRY TYPE");
+ break;
+ }
+ }
+}
+
+#else
+int main(int argc __attribute__((unused)),char *argv[] __attribute__((unused)))
+{
+ exit(0);
+}
+#endif /*HAVE_SPATIAL*/
diff --git a/storage/maria/ma_static.c b/storage/maria/ma_static.c
new file mode 100644
index 00000000000..41b202491a7
--- /dev/null
+++ b/storage/maria/ma_static.c
@@ -0,0 +1,79 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Static variables for MARIA library. All definied here for easy making of
+ a shared library
+*/
+
+#ifndef _global_h
+#include "maria_def.h"
+#include "trnman.h"
+#endif
+
+LIST *maria_open_list=0;
+uchar NEAR maria_file_magic[]=
+{ (uchar) 254, (uchar) 254, (uchar) 9, '\001', };
+uchar NEAR maria_pack_file_magic[]=
+{ (uchar) 254, (uchar) 254, (uchar) 10, '\001', };
+uint maria_quick_table_bits=9;
+ulong maria_block_size= MARIA_KEY_BLOCK_LENGTH;
+my_bool maria_flush= 0, maria_single_user= 0;
+my_bool maria_delay_key_write= 0;
+#if defined(THREAD) && !defined(DONT_USE_RW_LOCKS)
+ulong maria_concurrent_insert= 2;
+#else
+ulong maria_concurrent_insert= 0;
+#endif
+my_off_t maria_max_temp_length= MAX_FILE_SIZE;
+ulong maria_bulk_insert_tree_size=8192*1024;
+ulong maria_data_pointer_size= 4;
+
+PAGECACHE maria_pagecache_var;
+PAGECACHE *maria_pagecache= &maria_pagecache_var;
+
+PAGECACHE maria_log_pagecache_var;
+PAGECACHE *maria_log_pagecache= &maria_log_pagecache_var;
+
+/**
+ @brief when transactionality does not matter we can use this transaction
+
+ Used in external programs like ma_test*, and also internally inside
+ libmaria when there is no transaction around and the operation isn't
+ transactional (CREATE/DROP/RENAME/OPTIMIZE/REPAIR).
+*/
+TRN dummy_transaction_object;
+
+/* Enough for comparing if number is zero */
+uchar maria_zero_string[]= {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+/*
+ read_vec[] is used for converting between P_READ_KEY.. and SEARCH_
+ Position is , == , >= , <= , > , <
+*/
+
+uint NEAR maria_read_vec[]=
+{
+ SEARCH_FIND, SEARCH_FIND | SEARCH_BIGGER, SEARCH_FIND | SEARCH_SMALLER,
+ SEARCH_NO_FIND | SEARCH_BIGGER, SEARCH_NO_FIND | SEARCH_SMALLER,
+ SEARCH_FIND | SEARCH_PREFIX, SEARCH_LAST, SEARCH_LAST | SEARCH_SMALLER,
+ MBR_CONTAIN, MBR_INTERSECT, MBR_WITHIN, MBR_DISJOINT, MBR_EQUAL
+};
+
+uint NEAR maria_readnext_vec[]=
+{
+ SEARCH_BIGGER, SEARCH_BIGGER, SEARCH_SMALLER, SEARCH_BIGGER, SEARCH_SMALLER,
+ SEARCH_BIGGER, SEARCH_SMALLER, SEARCH_SMALLER
+};
diff --git a/storage/maria/ma_statrec.c b/storage/maria/ma_statrec.c
new file mode 100644
index 00000000000..ebfab4fad76
--- /dev/null
+++ b/storage/maria/ma_statrec.c
@@ -0,0 +1,294 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+ /* Functions to handle fixed-length-records */
+
+#include "maria_def.h"
+
+
+my_bool _ma_write_static_record(MARIA_HA *info, const uchar *record)
+{
+ uchar temp[8]; /* max pointer length */
+ if (info->s->state.dellink != HA_OFFSET_ERROR &&
+ !info->append_insert_at_end)
+ {
+ my_off_t filepos=info->s->state.dellink;
+ info->rec_cache.seek_not_done=1; /* We have done a seek */
+ if (info->s->file_read(info,(char*) &temp[0],info->s->base.rec_reflength,
+ info->s->state.dellink+1,
+ MYF(MY_NABP)))
+ goto err;
+ info->s->state.dellink= _ma_rec_pos(info->s,temp);
+ info->state->del--;
+ info->state->empty-=info->s->base.pack_reclength;
+ if (info->s->file_write(info, (char*) record, info->s->base.reclength,
+ filepos,
+ MYF(MY_NABP)))
+ goto err;
+ }
+ else
+ {
+ if (info->state->data_file_length > info->s->base.max_data_file_length-
+ info->s->base.pack_reclength)
+ {
+ my_errno=HA_ERR_RECORD_FILE_FULL;
+ return(2);
+ }
+ if (info->opt_flag & WRITE_CACHE_USED)
+ { /* Cash in use */
+ if (my_b_write(&info->rec_cache, (uchar*) record,
+ info->s->base.reclength))
+ goto err;
+ if (info->s->base.pack_reclength != info->s->base.reclength)
+ {
+ uint length=info->s->base.pack_reclength - info->s->base.reclength;
+ bzero((char*) temp,length);
+ if (my_b_write(&info->rec_cache, (uchar*) temp,length))
+ goto err;
+ }
+ }
+ else
+ {
+ info->rec_cache.seek_not_done=1; /* We have done a seek */
+ if (info->s->file_write(info,(char*) record,info->s->base.reclength,
+ info->state->data_file_length,
+ info->s->write_flag))
+ goto err;
+ if (info->s->base.pack_reclength != info->s->base.reclength)
+ {
+ uint length=info->s->base.pack_reclength - info->s->base.reclength;
+ bzero((char*) temp,length);
+ if (info->s->file_write(info, (uchar*) temp,length,
+ info->state->data_file_length+
+ info->s->base.reclength,
+ info->s->write_flag))
+ goto err;
+ }
+ }
+ info->state->data_file_length+=info->s->base.pack_reclength;
+ info->s->state.split++;
+ }
+ return 0;
+ err:
+ return 1;
+}
+
+my_bool _ma_update_static_record(MARIA_HA *info, MARIA_RECORD_POS pos,
+ const uchar *oldrec __attribute__ ((unused)),
+ const uchar *record)
+{
+ info->rec_cache.seek_not_done=1; /* We have done a seek */
+ return (info->s->file_write(info,
+ (char*) record,info->s->base.reclength,
+ pos,
+ MYF(MY_NABP)) != 0);
+}
+
+
+my_bool _ma_delete_static_record(MARIA_HA *info,
+ const uchar *record __attribute__ ((unused)))
+{
+ uchar temp[9]; /* 1+sizeof(uint32) */
+ info->state->del++;
+ info->state->empty+=info->s->base.pack_reclength;
+ temp[0]= '\0'; /* Mark that record is deleted */
+ _ma_dpointer(info,temp+1,info->s->state.dellink);
+ info->s->state.dellink= info->cur_row.lastpos;
+ info->rec_cache.seek_not_done=1;
+ return (info->s->file_write(info, temp, 1+info->s->rec_reflength,
+ info->cur_row.lastpos, MYF(MY_NABP)) != 0);
+}
+
+
+my_bool _ma_cmp_static_record(register MARIA_HA *info,
+ register const uchar *old)
+{
+ DBUG_ENTER("_ma_cmp_static_record");
+
+ /* We are going to do changes; dont let anybody disturb */
+ dont_break(); /* Dont allow SIGHUP or SIGINT */
+
+ if (info->opt_flag & WRITE_CACHE_USED)
+ {
+ if (flush_io_cache(&info->rec_cache))
+ {
+ DBUG_RETURN(1);
+ }
+ info->rec_cache.seek_not_done=1; /* We have done a seek */
+ }
+
+ if ((info->opt_flag & READ_CHECK_USED))
+ { /* If check isn't disabled */
+ info->rec_cache.seek_not_done=1; /* We have done a seek */
+ if (info->s->file_read(info, (char*) info->rec_buff,
+ info->s->base.reclength,
+ info->cur_row.lastpos,
+ MYF(MY_NABP)))
+ DBUG_RETURN(1);
+ if (memcmp((uchar*) info->rec_buff, (uchar*) old,
+ (uint) info->s->base.reclength))
+ {
+ DBUG_DUMP("read",old,info->s->base.reclength);
+ DBUG_DUMP("disk",info->rec_buff,info->s->base.reclength);
+ my_errno=HA_ERR_RECORD_CHANGED; /* Record have changed */
+ DBUG_RETURN(1);
+ }
+ }
+ DBUG_RETURN(0);
+}
+
+
+my_bool _ma_cmp_static_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+ const uchar *record, MARIA_RECORD_POS pos)
+{
+ DBUG_ENTER("_ma_cmp_static_unique");
+
+ info->rec_cache.seek_not_done=1; /* We have done a seek */
+ if (info->s->file_read(info, (char*) info->rec_buff, info->s->base.reclength,
+ pos, MYF(MY_NABP)))
+ DBUG_RETURN(1);
+ DBUG_RETURN(_ma_unique_comp(def, record, (uchar*) info->rec_buff,
+ def->null_are_equal));
+}
+
+
+/*
+ Read a fixed-length-record
+
+ RETURN
+ 0 Ok
+ 1 record delete
+ -1 on read-error or locking-error
+*/
+
+int _ma_read_static_record(register MARIA_HA *info, register uchar *record,
+ MARIA_RECORD_POS pos)
+{
+ int error;
+
+ if (pos != HA_OFFSET_ERROR)
+ {
+ if (info->opt_flag & WRITE_CACHE_USED &&
+ info->rec_cache.pos_in_file <= pos &&
+ flush_io_cache(&info->rec_cache))
+ return(my_errno);
+ info->rec_cache.seek_not_done=1; /* We have done a seek */
+
+ error=info->s->file_read(info,(char*) record,info->s->base.reclength,
+ pos, MYF(MY_NABP));
+ if (! error)
+ {
+ fast_ma_writeinfo(info);
+ if (!*record)
+ {
+ /* Record is deleted */
+ return ((my_errno=HA_ERR_RECORD_DELETED));
+ }
+ info->update|= HA_STATE_AKTIV; /* Record is read */
+ return(0);
+ }
+ }
+ fast_ma_writeinfo(info); /* No such record */
+ return(my_errno);
+}
+
+
+
+int _ma_read_rnd_static_record(MARIA_HA *info, uchar *buf,
+ MARIA_RECORD_POS filepos,
+ my_bool skip_deleted_blocks)
+{
+ int locked,error,cache_read;
+ uint cache_length;
+ MARIA_SHARE *share=info->s;
+ DBUG_ENTER("_ma_read_rnd_static_record");
+
+ cache_read=0;
+ cache_length=0;
+ if (info->opt_flag & READ_CACHE_USED)
+ { /* Cache in use */
+ if (filepos == my_b_tell(&info->rec_cache) &&
+ (skip_deleted_blocks || !filepos))
+ {
+ cache_read=1; /* Read record using cache */
+ cache_length=(uint) (info->rec_cache.read_end - info->rec_cache.read_pos);
+ }
+ else
+ info->rec_cache.seek_not_done=1; /* Filepos is changed */
+ }
+ locked=0;
+ if (info->lock_type == F_UNLCK)
+ {
+ if (filepos >= info->state->data_file_length)
+ { /* Test if new records */
+ if (_ma_readinfo(info,F_RDLCK,0))
+ DBUG_RETURN(my_errno);
+ locked=1;
+ }
+ else
+ { /* We don't nead new info */
+#ifndef UNSAFE_LOCKING
+ if ((! cache_read || share->base.reclength > cache_length) &&
+ share->tot_locks == 0)
+ { /* record not in cache */
+ locked=1;
+ }
+#else
+ info->tmp_lock_type=F_RDLCK;
+#endif
+ }
+ }
+ if (filepos >= info->state->data_file_length)
+ {
+ DBUG_PRINT("test",("filepos: %ld (%ld) records: %ld del: %ld",
+ (long) filepos/share->base.reclength, (long) filepos,
+ (long) info->state->records, (long) info->state->del));
+ fast_ma_writeinfo(info);
+ DBUG_RETURN(my_errno=HA_ERR_END_OF_FILE);
+ }
+ info->cur_row.lastpos= filepos;
+ info->cur_row.nextpos= filepos+share->base.pack_reclength;
+
+ if (! cache_read) /* No cacheing */
+ {
+ error= _ma_read_static_record(info, buf, filepos);
+ DBUG_RETURN(error);
+ }
+
+ /* Read record with cacheing */
+ error=my_b_read(&info->rec_cache,(uchar*) buf,share->base.reclength);
+ if (info->s->base.pack_reclength != info->s->base.reclength && !error)
+ {
+ char tmp[8]; /* Skill fill bytes */
+ error=my_b_read(&info->rec_cache,(uchar*) tmp,
+ info->s->base.pack_reclength - info->s->base.reclength);
+ }
+ if (locked)
+ VOID(_ma_writeinfo(info,0)); /* Unlock keyfile */
+ if (!error)
+ {
+ if (!buf[0])
+ { /* Record is removed */
+ DBUG_RETURN(my_errno=HA_ERR_RECORD_DELETED);
+ }
+ /* Found and may be updated */
+ info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED;
+ DBUG_RETURN(0);
+ }
+ /* my_errno should be set if rec_cache.error == -1 */
+ if (info->rec_cache.error != -1 || my_errno == 0)
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+ DBUG_RETURN(my_errno); /* Something wrong (EOF?) */
+}
diff --git a/storage/maria/ma_test1.c b/storage/maria/ma_test1.c
new file mode 100644
index 00000000000..80bd3c348a7
--- /dev/null
+++ b/storage/maria/ma_test1.c
@@ -0,0 +1,846 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Testing of the basic functions of a MARIA table */
+
+#include "maria_def.h"
+#include <my_getopt.h>
+#include <m_string.h>
+#include "ma_control_file.h"
+#include "ma_loghandler.h"
+#include "trnman.h"
+
+extern PAGECACHE *maria_log_pagecache;
+extern const char *maria_data_root;
+
+#define MAX_REC_LENGTH 1024
+
+static void usage();
+
+static int rec_pointer_size=0, flags[50], testflag;
+static int key_field=FIELD_SKIP_PRESPACE,extra_field=FIELD_SKIP_ENDSPACE;
+static int key_type=HA_KEYTYPE_NUM;
+static int create_flag=0;
+static enum data_file_type record_type= DYNAMIC_RECORD;
+
+static uint insert_count, update_count, remove_count;
+static uint pack_keys=0, pack_seg=0, key_length;
+static uint unique_key=HA_NOSAME;
+static uint die_in_middle_of_transaction;
+static my_bool pagecacheing, null_fields, silent, skip_update, opt_unique;
+static my_bool verbose, skip_delete, transactional;
+static MARIA_COLUMNDEF recinfo[4];
+static MARIA_KEYDEF keyinfo[10];
+static HA_KEYSEG keyseg[10];
+static HA_KEYSEG uniqueseg[10];
+
+static int run_test(const char *filename);
+static void get_options(int argc, char *argv[]);
+static void create_key(char *key,uint rownr);
+static void create_record(char *record,uint rownr);
+static void update_record(char *record);
+
+
+/*
+ These are here only for testing of recovery with undo. We are not
+ including maria_def.h here as this test is also to be an example of
+ how to use maria outside of the maria directory
+*/
+
+extern int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index,
+ enum flush_type flush_type_for_data,
+ enum flush_type flush_type_for_index);
+#define MARIA_FLUSH_DATA 1
+
+
+int main(int argc,char *argv[])
+{
+ MY_INIT(argv[0]);
+ my_init();
+ get_options(argc,argv);
+ maria_data_root= ".";
+ /* Maria requires that we always have a page cache */
+ if (maria_init() ||
+ (init_pagecache(maria_pagecache, IO_SIZE*16, 0, 0,
+ maria_block_size) == 0) ||
+ ma_control_file_create_or_open() ||
+ (init_pagecache(maria_log_pagecache,
+ TRANSLOG_PAGECACHE_SIZE, 0, 0,
+ TRANSLOG_PAGE_SIZE) == 0) ||
+ translog_init(maria_data_root, TRANSLOG_FILE_SIZE,
+ 0, 0, maria_log_pagecache,
+ TRANSLOG_DEFAULT_FLAGS) ||
+ (transactional && trnman_init(0)))
+ {
+ fprintf(stderr, "Error in initialization");
+ exit(1);
+ }
+
+ exit(run_test("test1"));
+}
+
+
+static int run_test(const char *filename)
+{
+ MARIA_HA *file;
+ int i,j,error,deleted,rec_length,uniques=0;
+ uint offset_to_key;
+ ha_rows found,row_count;
+ char record[MAX_REC_LENGTH],key[MAX_REC_LENGTH],read_record[MAX_REC_LENGTH];
+ MARIA_UNIQUEDEF uniquedef;
+ MARIA_CREATE_INFO create_info;
+
+ if (die_in_middle_of_transaction)
+ null_fields= 1;
+
+ bzero((char*) recinfo,sizeof(recinfo));
+ bzero((char*) &create_info,sizeof(create_info));
+
+ /* First define 2 columns */
+ create_info.null_bytes= 1;
+ recinfo[0].type= key_field;
+ recinfo[0].length= (key_field == FIELD_BLOB ? 4+portable_sizeof_char_ptr :
+ key_length);
+ if (key_field == FIELD_VARCHAR)
+ recinfo[0].length+= HA_VARCHAR_PACKLENGTH(key_length);
+ recinfo[1].type=extra_field;
+ recinfo[1].length= (extra_field == FIELD_BLOB ? 4 + portable_sizeof_char_ptr : 24);
+ if (extra_field == FIELD_VARCHAR)
+ recinfo[1].length+= HA_VARCHAR_PACKLENGTH(recinfo[1].length);
+ recinfo[1].null_bit= null_fields ? 2 : 0;
+
+ if (opt_unique)
+ {
+ recinfo[2].type=FIELD_CHECK;
+ recinfo[2].length=MARIA_UNIQUE_HASH_LENGTH;
+ }
+ rec_length= recinfo[0].length+recinfo[1].length+recinfo[2].length;
+
+ if (key_type == HA_KEYTYPE_VARTEXT1 &&
+ key_length > 255)
+ key_type= HA_KEYTYPE_VARTEXT2;
+
+ /* Define a key over the first column */
+ keyinfo[0].seg=keyseg;
+ keyinfo[0].keysegs=1;
+ keyinfo[0].block_length= 0; /* Default block length */
+ keyinfo[0].key_alg=HA_KEY_ALG_BTREE;
+ keyinfo[0].seg[0].type= key_type;
+ keyinfo[0].seg[0].flag= pack_seg;
+ keyinfo[0].seg[0].start=1;
+ keyinfo[0].seg[0].length=key_length;
+ keyinfo[0].seg[0].null_bit= null_fields ? 2 : 0;
+ keyinfo[0].seg[0].null_pos=0;
+ keyinfo[0].seg[0].language= default_charset_info->number;
+ if (pack_seg & HA_BLOB_PART)
+ {
+ keyinfo[0].seg[0].bit_start=4; /* Length of blob length */
+ }
+ keyinfo[0].flag = (uint8) (pack_keys | unique_key);
+
+ bzero((uchar*) flags,sizeof(flags));
+ if (opt_unique)
+ {
+ uint start;
+ uniques=1;
+ bzero((char*) &uniquedef,sizeof(uniquedef));
+ bzero((char*) uniqueseg,sizeof(uniqueseg));
+ uniquedef.seg=uniqueseg;
+ uniquedef.keysegs=2;
+
+ /* Make a unique over all columns (except first NULL fields) */
+ for (i=0, start=1 ; i < 2 ; i++)
+ {
+ uniqueseg[i].start=start;
+ start+=recinfo[i].length;
+ uniqueseg[i].length=recinfo[i].length;
+ uniqueseg[i].language= default_charset_info->number;
+ }
+ uniqueseg[0].type= key_type;
+ uniqueseg[0].null_bit= null_fields ? 2 : 0;
+ uniqueseg[1].type= HA_KEYTYPE_TEXT;
+ if (extra_field == FIELD_BLOB)
+ {
+ uniqueseg[1].length=0; /* The whole blob */
+ uniqueseg[1].bit_start=4; /* long blob */
+ uniqueseg[1].flag|= HA_BLOB_PART;
+ }
+ else if (extra_field == FIELD_VARCHAR)
+ {
+ uniqueseg[1].flag|= HA_VAR_LENGTH_PART;
+ uniqueseg[1].type= (HA_VARCHAR_PACKLENGTH(recinfo[1].length-1) == 1 ?
+ HA_KEYTYPE_VARTEXT1 : HA_KEYTYPE_VARTEXT2);
+ }
+ }
+ else
+ uniques=0;
+
+ offset_to_key= test(null_fields);
+ if (key_field == FIELD_BLOB || key_field == FIELD_VARCHAR)
+ offset_to_key+= 2;
+
+ if (!silent)
+ printf("- Creating maria file\n");
+ create_info.max_rows=(ulong) (rec_pointer_size ?
+ (1L << (rec_pointer_size*8))/40 :
+ 0);
+ create_info.transactional= transactional;
+ if (maria_create(filename, record_type, 1, keyinfo,2+opt_unique,recinfo,
+ uniques, &uniquedef, &create_info,
+ create_flag))
+ goto err;
+ if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+ goto err;
+ if (!silent)
+ printf("- Writing key:s\n");
+
+ if (maria_begin(file))
+ goto err;
+ my_errno=0;
+ row_count=deleted=0;
+ for (i=49 ; i>=1 ; i-=2 )
+ {
+ if (insert_count-- == 0) { VOID(maria_close(file)) ; exit(0) ; }
+ j=i%25 +1;
+ create_record(record,j);
+ error=maria_write(file,record);
+ if (!error)
+ row_count++;
+ flags[j]=1;
+ if (verbose || error)
+ printf("J= %2d maria_write: %d errno: %d\n", j,error,my_errno);
+ }
+
+ if (maria_commit(file) || maria_begin(file))
+ goto err;
+
+ if (testflag == 1)
+ goto end;
+
+ /* Insert 2 rows with null values */
+ if (null_fields)
+ {
+ create_record(record,0);
+ error=maria_write(file,record);
+ if (!error)
+ row_count++;
+ if (verbose || error)
+ printf("J= NULL maria_write: %d errno: %d\n", error,my_errno);
+ error=maria_write(file,record);
+ if (!error)
+ row_count++;
+ if (verbose || error)
+ printf("J= NULL maria_write: %d errno: %d\n", error,my_errno);
+ flags[0]=2;
+ }
+
+ if (testflag == 2)
+ {
+ printf("Terminating after inserts\n");
+ goto end;
+ }
+
+ if (maria_commit(file) || maria_begin(file))
+ goto err;
+
+ if (!skip_update)
+ {
+ if (opt_unique)
+ {
+ if (!silent)
+ printf("- Checking unique constraint\n");
+ create_record(record,j);
+ if (!maria_write(file,record) || my_errno != HA_ERR_FOUND_DUPP_UNIQUE)
+ {
+ printf("unique check failed\n");
+ }
+ }
+ if (!silent)
+ printf("- Updating rows\n");
+
+ /* Update first last row to force extend of file */
+ if (maria_rsame(file,read_record,-1))
+ {
+ printf("Can't find last row with maria_rsame\n");
+ }
+ else
+ {
+ memcpy(record,read_record,rec_length);
+ update_record(record);
+ if (maria_update(file,read_record,record))
+ {
+ printf("Can't update last row: %.*s\n",
+ keyinfo[0].seg[0].length,read_record+1);
+ }
+ }
+
+ /* Read through all rows and update them */
+ assert(maria_scan_init(file) == 0);
+
+ found=0;
+ while ((error= maria_scan(file,read_record)) == 0)
+ {
+ if (--update_count == 0) { VOID(maria_close(file)) ; exit(0) ; }
+ memcpy(record,read_record,rec_length);
+ update_record(record);
+ if (maria_update(file,read_record,record))
+ {
+ printf("Can't update row: %.*s, error: %d\n",
+ keyinfo[0].seg[0].length,record+1,my_errno);
+ }
+ found++;
+ }
+ if (found != row_count)
+ printf("Found %ld of %ld rows\n", (ulong) found, (ulong) row_count);
+ maria_scan_end(file);
+ }
+
+ if (testflag == 3)
+ {
+ printf("Terminating after updates\n");
+ goto end;
+ }
+ if (!silent)
+ printf("- Reopening file\n");
+ if (maria_commit(file))
+ goto err;
+ if (maria_close(file))
+ goto err;
+ if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+ goto err;
+ if (maria_begin(file))
+ goto err;
+ if (!skip_delete)
+ {
+ if (!silent)
+ printf("- Removing keys\n");
+
+ for (i=0 ; i <= 10 ; i++)
+ {
+ /*
+ If you want to debug the problem in ma_test_recovery with BLOBs
+ (see @todo there), you can break out of the loop after just one
+ delete, it is enough, like this:
+ if (i==1) break;
+ */
+ /* testing */
+ if (remove_count-- == 0)
+ {
+ fprintf(stderr,
+ "delete-rows number of rows deleted; Going down hard!\n");
+ goto end;
+ }
+ j=i*2;
+ if (!flags[j])
+ continue;
+ create_key(key,j);
+ my_errno=0;
+ if ((error = maria_rkey(file, read_record, 0, key,
+ HA_WHOLE_KEY, HA_READ_KEY_EXACT)))
+ {
+ if (verbose || (flags[j] >= 1 ||
+ (error && my_errno != HA_ERR_KEY_NOT_FOUND)))
+ printf("key: '%.*s' maria_rkey: %3d errno: %3d\n",
+ (int) key_length,key+offset_to_key,error,my_errno);
+ }
+ else
+ {
+ error=maria_delete(file,read_record);
+ if (verbose || error)
+ printf("key: '%.*s' maria_delete: %3d errno: %3d\n",
+ (int) key_length, key+offset_to_key, error, my_errno);
+ if (! error)
+ {
+ deleted++;
+ flags[j]--;
+ }
+ }
+ }
+ }
+
+ if (testflag == 4)
+ {
+ printf("Terminating after deletes\n");
+ goto end;
+ }
+
+ if (!silent)
+ printf("- Reading rows with key\n");
+ record[1]= 0; /* For nicer printf */
+ for (i=0 ; i <= 25 ; i++)
+ {
+ create_key(key,i);
+ my_errno=0;
+ error=maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT);
+ if (verbose ||
+ (error == 0 && flags[i] == 0 && unique_key) ||
+ (error && (flags[i] != 0 || my_errno != HA_ERR_KEY_NOT_FOUND)))
+ {
+ printf("key: '%.*s' maria_rkey: %3d errno: %3d record: %s\n",
+ (int) key_length,key+offset_to_key,error,my_errno,record+1);
+ }
+ }
+
+ if (!silent)
+ printf("- Reading rows with position\n");
+ if (maria_scan_init(file))
+ {
+ fprintf(stderr, "maria_scan_init failed\n");
+ goto err;
+ }
+
+ for (i=1,found=0 ; i <= 30 ; i++)
+ {
+ my_errno=0;
+ if ((error= maria_scan(file, read_record)) == HA_ERR_END_OF_FILE)
+ {
+ if (found != row_count-deleted)
+ printf("Found only %ld of %ld rows\n", (ulong) found,
+ (ulong) (row_count - deleted));
+ break;
+ }
+ if (!error)
+ found++;
+ if (verbose || (error != 0 && error != HA_ERR_RECORD_DELETED &&
+ error != HA_ERR_END_OF_FILE))
+ {
+ printf("pos: %2d maria_rrnd: %3d errno: %3d record: %s\n",
+ i-1,error,my_errno,read_record+1);
+ }
+ }
+
+end:
+ if (die_in_middle_of_transaction)
+ {
+ /* As commit record is not done, UNDO entries needs to be rolled back */
+ switch (die_in_middle_of_transaction) {
+ case 1:
+ /*
+ Flush changed pages go to disk. That will also flush log. Recovery
+ will skip REDOs and apply UNDOs.
+ */
+ _ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_RELEASE,
+ FLUSH_RELEASE);
+ break;
+ case 2:
+ /*
+ Just flush log. Pages are likely to not be on disk. Recovery will
+ then execute REDOs and UNDOs.
+ */
+ if (translog_flush(file->trn->undo_lsn))
+ goto err;
+ break;
+ case 3:
+ /*
+ Flush nothing. Pages and log are likely to not be on disk. Recovery
+ will then do nothing.
+ */
+ break;
+ }
+ printf("Dying on request without maria_commit()/maria_close()\n");
+ exit(0);
+ }
+
+ if (maria_commit(file))
+ goto err;
+ if (maria_close(file))
+ goto err;
+ maria_end();
+ my_end(MY_CHECK_ERROR);
+
+ return (0);
+err:
+ printf("got error: %3d when using maria-database\n",my_errno);
+ return 1; /* skip warning */
+}
+
+
+static void create_key_part(char *key,uint rownr)
+{
+ if (!unique_key)
+ rownr&=7; /* Some identical keys */
+ if (keyinfo[0].seg[0].type == HA_KEYTYPE_NUM)
+ {
+ sprintf(key,"%*d",keyinfo[0].seg[0].length,rownr);
+ }
+ else if (keyinfo[0].seg[0].type == HA_KEYTYPE_VARTEXT1 ||
+ keyinfo[0].seg[0].type == HA_KEYTYPE_VARTEXT2)
+ { /* Alpha record */
+ /* Create a key that may be easily packed */
+ bfill(key,keyinfo[0].seg[0].length,rownr < 10 ? 'A' : 'B');
+ sprintf(key+keyinfo[0].seg[0].length-2,"%-2d",rownr);
+ if ((rownr & 7) == 0)
+ {
+ /* Change the key to force a unpack of the next key */
+ bfill(key+3,keyinfo[0].seg[0].length-5,rownr < 10 ? 'a' : 'b');
+ }
+ }
+ else
+ { /* Alpha record */
+ if (keyinfo[0].seg[0].flag & HA_SPACE_PACK)
+ sprintf(key,"%-*d",keyinfo[0].seg[0].length,rownr);
+ else
+ {
+ /* Create a key that may be easily packed */
+ bfill(key,keyinfo[0].seg[0].length,rownr < 10 ? 'A' : 'B');
+ sprintf(key+keyinfo[0].seg[0].length-2,"%-2d",rownr);
+ if ((rownr & 7) == 0)
+ {
+ /* Change the key to force a unpack of the next key */
+ key[1]= (rownr < 10 ? 'a' : 'b');
+ }
+ }
+ }
+}
+
+
+static void create_key(char *key,uint rownr)
+{
+ if (keyinfo[0].seg[0].null_bit)
+ {
+ if (rownr == 0)
+ {
+ key[0]=1; /* null key */
+ key[1]=0; /* For easy print of key */
+ return;
+ }
+ *key++=0;
+ }
+ if (keyinfo[0].seg[0].flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART))
+ {
+ uint tmp;
+ create_key_part(key+2,rownr);
+ tmp=strlen(key+2);
+ int2store(key,tmp);
+ }
+ else
+ create_key_part(key,rownr);
+}
+
+
+static char blob_key[MAX_REC_LENGTH];
+static char blob_record[MAX_REC_LENGTH+20*20];
+
+
+static void create_record(char *record,uint rownr)
+{
+ char *pos;
+ bzero((char*) record,MAX_REC_LENGTH);
+ record[0]=1; /* delete marker */
+ if (rownr == 0 && keyinfo[0].seg[0].null_bit)
+ record[0]|=keyinfo[0].seg[0].null_bit; /* Null key */
+
+ pos=record+1;
+ if (recinfo[0].type == FIELD_BLOB)
+ {
+ uint tmp;
+ char *ptr;
+ create_key_part(blob_key,rownr);
+ tmp=strlen(blob_key);
+ int4store(pos,tmp);
+ ptr=blob_key;
+ memcpy_fixed(pos+4,&ptr,sizeof(char*));
+ pos+=recinfo[0].length;
+ }
+ else if (recinfo[0].type == FIELD_VARCHAR)
+ {
+ uint tmp, pack_length= HA_VARCHAR_PACKLENGTH(recinfo[0].length-1);
+ create_key_part(pos+pack_length,rownr);
+ tmp= strlen(pos+pack_length);
+ if (pack_length == 1)
+ *(uchar*) pos= (uchar) tmp;
+ else
+ int2store(pos,tmp);
+ pos+= recinfo[0].length;
+ }
+ else
+ {
+ create_key_part(pos,rownr);
+ pos+=recinfo[0].length;
+ }
+ if (recinfo[1].type == FIELD_BLOB)
+ {
+ uint tmp;
+ char *ptr;;
+ sprintf(blob_record,"... row: %d", rownr);
+ strappend(blob_record,max(MAX_REC_LENGTH-rownr,10),' ');
+ tmp=strlen(blob_record);
+ int4store(pos,tmp);
+ ptr=blob_record;
+ memcpy_fixed(pos+4,&ptr,sizeof(char*));
+ }
+ else if (recinfo[1].type == FIELD_VARCHAR)
+ {
+ uint tmp, pack_length= HA_VARCHAR_PACKLENGTH(recinfo[1].length-1);
+ sprintf(pos+pack_length, "... row: %d", rownr);
+ tmp= strlen(pos+pack_length);
+ if (pack_length == 1)
+ *(uchar*) pos= (uchar) tmp;
+ else
+ int2store(pos,tmp);
+ }
+ else
+ {
+ sprintf(pos,"... row: %d", rownr);
+ strappend(pos,recinfo[1].length,' ');
+ }
+}
+
+/* change row to test re-packing of rows and reallocation of keys */
+
+static void update_record(char *record)
+{
+ char *pos=record+1;
+ if (recinfo[0].type == FIELD_BLOB)
+ {
+ char *column,*ptr;
+ int length;
+ length=uint4korr(pos); /* Long blob */
+ memcpy_fixed(&column,pos+4,sizeof(char*));
+ memcpy(blob_key,column,length); /* Move old key */
+ ptr=blob_key;
+ memcpy_fixed(pos+4,&ptr,sizeof(char*)); /* Store pointer to new key */
+ if (keyinfo[0].seg[0].type != HA_KEYTYPE_NUM)
+ default_charset_info->cset->casedn(default_charset_info,
+ blob_key, length, blob_key, length);
+ pos+=recinfo[0].length;
+ }
+ else if (recinfo[0].type == FIELD_VARCHAR)
+ {
+ uint pack_length= HA_VARCHAR_PACKLENGTH(recinfo[0].length-1);
+ uint length= pack_length == 1 ? (uint) *(uchar*) pos : uint2korr(pos);
+ default_charset_info->cset->casedn(default_charset_info,
+ pos + pack_length, length,
+ pos + pack_length, length);
+ pos+=recinfo[0].length;
+ }
+ else
+ {
+ if (keyinfo[0].seg[0].type != HA_KEYTYPE_NUM)
+ default_charset_info->cset->casedn(default_charset_info,
+ pos, keyinfo[0].seg[0].length,
+ pos, keyinfo[0].seg[0].length);
+ pos+=recinfo[0].length;
+ }
+
+ if (recinfo[1].type == FIELD_BLOB)
+ {
+ char *column;
+ int length;
+ length=uint4korr(pos);
+ memcpy_fixed(&column,pos+4,sizeof(char*));
+ memcpy(blob_record,column,length);
+ bfill(blob_record+length,20,'.'); /* Make it larger */
+ length+=20;
+ int4store(pos,length);
+ column=blob_record;
+ memcpy_fixed(pos+4,&column,sizeof(char*));
+ }
+ else if (recinfo[1].type == FIELD_VARCHAR)
+ {
+ /* Second field is longer than 10 characters */
+ uint pack_length= HA_VARCHAR_PACKLENGTH(recinfo[1].length-1);
+ uint length= pack_length == 1 ? (uint) *(uchar*) pos : uint2korr(pos);
+ pos= record+ recinfo[1].offset;
+ bfill(pos+pack_length+length,recinfo[1].length-length-pack_length,'.');
+ length=recinfo[1].length-pack_length;
+ if (pack_length == 1)
+ *(uchar*) pos= (uchar) length;
+ else
+ int2store(pos,length);
+ }
+ else
+ {
+ bfill(pos+recinfo[1].length-10,10,'.');
+ }
+}
+
+
+static struct my_option my_long_options[] =
+{
+ {"checksum", 'c', "Undocumented",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#ifndef DBUG_OFF
+ {"debug", '#', "Undocumented",
+ 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+ {"delete-rows", 'd', "Abort after this many rows has been deleted",
+ (uchar**) &remove_count, (uchar**) &remove_count, 0, GET_UINT, REQUIRED_ARG,
+ 1000, 0, 0, 0, 0, 0},
+ {"help", '?', "Display help and exit",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"insert-rows", 'i', "Undocumented", (uchar**) &insert_count,
+ (uchar**) &insert_count, 0, GET_UINT, REQUIRED_ARG, 1000, 0, 0, 0, 0, 0},
+ {"key-alpha", 'a', "Use a key of type HA_KEYTYPE_TEXT",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"key-binary-pack", 'B', "Undocumented",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"key-blob", 'b', "Undocumented",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"key-cache", 'K', "Undocumented", (uchar**) &pagecacheing,
+ (uchar**) &pagecacheing, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"key-length", 'k', "Undocumented", (uchar**) &key_length,
+ (uchar**) &key_length, 0, GET_UINT, REQUIRED_ARG, 6, 0, 0, 0, 0, 0},
+ {"key-multiple", 'm', "Undocumented",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"key-prefix_pack", 'P', "Undocumented",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"key-space_pack", 'p', "Undocumented",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"key-varchar", 'w', "Test VARCHAR keys",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"null-fields", 'N', "Define fields with NULL",
+ (uchar**) &null_fields, (uchar**) &null_fields, 0, GET_BOOL, NO_ARG,
+ 0, 0, 0, 0, 0, 0},
+ {"row-fixed-size", 'S', "Fixed size records",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"rows-in-block", 'M', "Store rows in block format",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"row-pointer-size", 'R', "Undocumented", (uchar**) &rec_pointer_size,
+ (uchar**) &rec_pointer_size, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"silent", 's', "Undocumented",
+ (uchar**) &silent, (uchar**) &silent, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0,
+ 0, 0},
+ {"skip-delete", 'U', "Don't test deletes", (uchar**) &skip_delete,
+ (uchar**) &skip_delete, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"skip-update", 'D', "Don't test updates", (uchar**) &skip_update,
+ (uchar**) &skip_update, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"testflag", 't', "Stop test at specified stage", (uchar**) &testflag,
+ (uchar**) &testflag, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"test-undo", 'A',
+ "Abort hard. Used for testing recovery with undo",
+ (uchar**) &die_in_middle_of_transaction,
+ (uchar**) &die_in_middle_of_transaction,
+ 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"transactional", 'T',
+ "Test in transactional mode. (Only works with block format)",
+ (uchar**) &transactional, (uchar**) &transactional, 0, GET_BOOL, NO_ARG,
+ 0, 0, 0, 0, 0, 0},
+ {"unique", 'C', "Undocumented", (uchar**) &opt_unique,
+ (uchar**) &opt_unique, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"update-rows", 'u', "Max number of rows to update", (uchar**) &update_count,
+ (uchar**) &update_count, 0, GET_UINT, REQUIRED_ARG, 1000, 0, 0, 0, 0, 0},
+ {"verbose", 'v', "Be more verbose", (uchar**) &verbose,
+ (uchar**) &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"version", 'V', "Print version number and exit",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+ char *argument __attribute__((unused)))
+{
+ switch(optid) {
+ case 'a':
+ key_type= HA_KEYTYPE_TEXT;
+ break;
+ case 'c':
+ create_flag|= HA_CREATE_CHECKSUM;
+ break;
+ case 'R': /* Length of record pointer */
+ if (rec_pointer_size > 3)
+ rec_pointer_size=0;
+ break;
+ case 'P':
+ pack_keys= HA_PACK_KEY; /* Use prefix compression */
+ break;
+ case 'B':
+ pack_keys= HA_BINARY_PACK_KEY; /* Use binary compression */
+ break;
+ case 'M':
+ record_type= BLOCK_RECORD;
+ break;
+ case 'S':
+ if (key_field == FIELD_VARCHAR)
+ {
+ create_flag=0; /* Static sized varchar */
+ record_type= STATIC_RECORD;
+ }
+ else if (key_field != FIELD_BLOB)
+ {
+ key_field=FIELD_NORMAL; /* static-size record */
+ extra_field=FIELD_NORMAL;
+ record_type= STATIC_RECORD;
+ }
+ break;
+ case 'p':
+ pack_keys=HA_PACK_KEY; /* Use prefix + space packing */
+ pack_seg=HA_SPACE_PACK;
+ key_type=HA_KEYTYPE_TEXT;
+ break;
+ case 'm':
+ unique_key=0;
+ break;
+ case 'b':
+ key_field=FIELD_BLOB; /* blob key */
+ extra_field= FIELD_BLOB;
+ pack_seg|= HA_BLOB_PART;
+ key_type= HA_KEYTYPE_VARTEXT1;
+ if (record_type == STATIC_RECORD)
+ record_type= DYNAMIC_RECORD;
+ break;
+ case 'k':
+ if (key_length < 4 || key_length > HA_MAX_KEY_LENGTH)
+ {
+ fprintf(stderr,"Wrong key length\n");
+ exit(1);
+ }
+ break;
+ case 'w':
+ key_field=FIELD_VARCHAR; /* varchar keys */
+ extra_field= FIELD_VARCHAR;
+ key_type= HA_KEYTYPE_VARTEXT1;
+ pack_seg|= HA_VAR_LENGTH_PART;
+ if (record_type == STATIC_RECORD)
+ record_type= DYNAMIC_RECORD;
+ break;
+ case 'K': /* Use key cacheing */
+ pagecacheing=1;
+ break;
+ case 'V':
+ printf("test1 Ver 1.2 \n");
+ exit(0);
+ case '#':
+ DBUG_PUSH (argument);
+ break;
+ case '?':
+ usage();
+ exit(1);
+ }
+ return 0;
+}
+
+
+/* Read options */
+
+static void get_options(int argc, char *argv[])
+{
+ int ho_error;
+
+ if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option)))
+ exit(ho_error);
+
+ return;
+} /* get options */
+
+
+static void usage()
+{
+ printf("Usage: %s [options]\n\n", my_progname);
+ my_print_help(my_long_options);
+ my_print_variables(my_long_options);
+}
diff --git a/storage/maria/ma_test2.c b/storage/maria/ma_test2.c
new file mode 100644
index 00000000000..935be09850c
--- /dev/null
+++ b/storage/maria/ma_test2.c
@@ -0,0 +1,1180 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Test av isam-databas: stor test */
+
+#ifndef USE_MY_FUNC /* We want to be able to dbug this !! */
+#define USE_MY_FUNC
+#endif
+#ifdef DBUG_OFF
+#undef DBUG_OFF
+#endif
+#ifndef SAFEMALLOC
+#define SAFEMALLOC
+#endif
+#include "maria_def.h"
+#include "trnman.h"
+#include <m_ctype.h>
+#include <my_bit.h>
+
+
+#define STANDARD_LENGTH 37
+#define MARIA_KEYS 6
+#define MAX_PARTS 4
+#if !defined(MSDOS) && !defined(labs)
+#define labs(a) abs(a)
+#endif
+
+static void get_options(int argc, char *argv[]);
+static uint rnd(uint max_value);
+static void fix_length(uchar *record,uint length);
+static void put_blob_in_record(char *blob_pos,char **blob_buffer,
+ ulong *length);
+static void copy_key(struct st_maria_info *info,uint inx,
+ uchar *record,uchar *key);
+
+static int verbose=0,testflag=0,
+ first_key=0,async_io=0,pagecacheing=0,write_cacheing=0,locking=0,
+ rec_pointer_size=0,pack_fields=1,silent=0,
+ opt_quick_mode=0, transactional= 0, skip_update= 0,
+ die_in_middle_of_transaction= 0;
+static int pack_seg=HA_SPACE_PACK,pack_type=HA_PACK_KEY,remove_count=-1;
+static int create_flag= 0, srand_arg= 0;
+static ulong pagecache_size=IO_SIZE*16;
+static enum data_file_type record_type= DYNAMIC_RECORD;
+
+static uint keys=MARIA_KEYS,recant=1000;
+static uint use_blob=0;
+static uint16 key1[1001],key3[5000];
+static char record[300],record2[300],key[100],key2[100],
+ read_record[300],read_record2[300],read_record3[300];
+static HA_KEYSEG glob_keyseg[MARIA_KEYS][MAX_PARTS];
+
+ /* Test program */
+
+int main(int argc, char *argv[])
+{
+ uint i;
+ int j,n1,n2,n3,error,k;
+ uint write_count,update,dupp_keys,opt_delete,start,length,blob_pos,
+ reclength,ant,found_parts;
+ my_off_t lastpos;
+ ha_rows range_records,records;
+ MARIA_HA *file;
+ MARIA_KEYDEF keyinfo[10];
+ MARIA_COLUMNDEF recinfo[10];
+ MARIA_INFO info;
+ const char *filename;
+ char *blob_buffer;
+ MARIA_CREATE_INFO create_info;
+ MY_INIT(argv[0]);
+
+ filename= "test2";
+ get_options(argc,argv);
+ if (! async_io)
+ my_disable_async_io=1;
+
+ maria_data_root= ".";
+ /* Maria requires that we always have a page cache */
+ if (maria_init() ||
+ (init_pagecache(maria_pagecache, pagecache_size, 0, 0,
+ maria_block_size) == 0) ||
+ ma_control_file_create_or_open() ||
+ (init_pagecache(maria_log_pagecache,
+ TRANSLOG_PAGECACHE_SIZE, 0, 0,
+ TRANSLOG_PAGE_SIZE) == 0) ||
+ translog_init(maria_data_root, TRANSLOG_FILE_SIZE,
+ 0, 0, maria_log_pagecache,
+ TRANSLOG_DEFAULT_FLAGS) ||
+ (transactional && trnman_init(0)))
+ {
+ fprintf(stderr, "Error in initialization");
+ exit(1);
+ }
+
+ reclength=STANDARD_LENGTH+60+(use_blob ? 8 : 0);
+ blob_pos=STANDARD_LENGTH+60;
+ keyinfo[0].seg= &glob_keyseg[0][0];
+ keyinfo[0].seg[0].start=0;
+ keyinfo[0].seg[0].length=6;
+ keyinfo[0].seg[0].type=HA_KEYTYPE_TEXT;
+ keyinfo[0].seg[0].language= default_charset_info->number;
+ keyinfo[0].seg[0].flag=(uint8) pack_seg;
+ keyinfo[0].seg[0].null_bit=0;
+ keyinfo[0].seg[0].null_pos=0;
+ keyinfo[0].key_alg=HA_KEY_ALG_BTREE;
+ keyinfo[0].keysegs=1;
+ keyinfo[0].flag = pack_type;
+ keyinfo[0].block_length= 0; /* Default block length */
+ keyinfo[1].seg= &glob_keyseg[1][0];
+ keyinfo[1].seg[0].start=7;
+ keyinfo[1].seg[0].length=6;
+ keyinfo[1].seg[0].type=HA_KEYTYPE_BINARY;
+ keyinfo[1].seg[0].flag=0;
+ keyinfo[1].seg[0].null_bit=0;
+ keyinfo[1].seg[0].null_pos=0;
+ keyinfo[1].seg[1].start=0; /* two part key */
+ keyinfo[1].seg[1].length=6;
+ keyinfo[1].seg[1].type=HA_KEYTYPE_NUM;
+ keyinfo[1].seg[1].flag=HA_REVERSE_SORT;
+ keyinfo[1].seg[1].null_bit=0;
+ keyinfo[1].seg[1].null_pos=0;
+ keyinfo[1].key_alg=HA_KEY_ALG_BTREE;
+ keyinfo[1].keysegs=2;
+ keyinfo[1].flag =0;
+ keyinfo[1].block_length= MARIA_MIN_KEY_BLOCK_LENGTH; /* Diff blocklength */
+ keyinfo[2].seg= &glob_keyseg[2][0];
+ keyinfo[2].seg[0].start=12;
+ keyinfo[2].seg[0].length=8;
+ keyinfo[2].seg[0].type=HA_KEYTYPE_BINARY;
+ keyinfo[2].seg[0].flag=HA_REVERSE_SORT;
+ keyinfo[2].seg[0].null_bit=0;
+ keyinfo[2].seg[0].null_pos=0;
+ keyinfo[2].key_alg=HA_KEY_ALG_BTREE;
+ keyinfo[2].keysegs=1;
+ keyinfo[2].flag =HA_NOSAME;
+ keyinfo[2].block_length= 0; /* Default block length */
+ keyinfo[3].seg= &glob_keyseg[3][0];
+ keyinfo[3].seg[0].start=0;
+ keyinfo[3].seg[0].length=reclength-(use_blob ? 8 : 0);
+ keyinfo[3].seg[0].type=HA_KEYTYPE_TEXT;
+ keyinfo[3].seg[0].language=default_charset_info->number;
+ keyinfo[3].seg[0].flag=(uint8) pack_seg;
+ keyinfo[3].seg[0].null_bit=0;
+ keyinfo[3].seg[0].null_pos=0;
+ keyinfo[3].key_alg=HA_KEY_ALG_BTREE;
+ keyinfo[3].keysegs=1;
+ keyinfo[3].flag = pack_type;
+ keyinfo[3].block_length= 0; /* Default block length */
+ keyinfo[4].seg= &glob_keyseg[4][0];
+ keyinfo[4].seg[0].start=0;
+ keyinfo[4].seg[0].length=5;
+ keyinfo[4].seg[0].type=HA_KEYTYPE_TEXT;
+ keyinfo[4].seg[0].language=default_charset_info->number;
+ keyinfo[4].seg[0].flag=0;
+ keyinfo[4].seg[0].null_bit=0;
+ keyinfo[4].seg[0].null_pos=0;
+ keyinfo[4].key_alg=HA_KEY_ALG_BTREE;
+ keyinfo[4].keysegs=1;
+ keyinfo[4].flag = pack_type;
+ keyinfo[4].block_length= 0; /* Default block length */
+ keyinfo[5].seg= &glob_keyseg[5][0];
+ keyinfo[5].seg[0].start=0;
+ keyinfo[5].seg[0].length=4;
+ keyinfo[5].seg[0].type=HA_KEYTYPE_TEXT;
+ keyinfo[5].seg[0].language=default_charset_info->number;
+ keyinfo[5].seg[0].flag=pack_seg;
+ keyinfo[5].seg[0].null_bit=0;
+ keyinfo[5].seg[0].null_pos=0;
+ keyinfo[5].key_alg=HA_KEY_ALG_BTREE;
+ keyinfo[5].keysegs=1;
+ keyinfo[5].flag = pack_type;
+ keyinfo[5].block_length= 0; /* Default block length */
+
+ recinfo[0].type=pack_fields ? FIELD_SKIP_PRESPACE : 0;
+ recinfo[0].length=7;
+ recinfo[0].null_bit=0;
+ recinfo[0].null_pos=0;
+ recinfo[1].type=pack_fields ? FIELD_SKIP_PRESPACE : 0;
+ recinfo[1].length=5;
+ recinfo[1].null_bit=0;
+ recinfo[1].null_pos=0;
+ recinfo[2].type=pack_fields ? FIELD_SKIP_PRESPACE : 0;
+ recinfo[2].length=9;
+ recinfo[2].null_bit=0;
+ recinfo[2].null_pos=0;
+ recinfo[3].type=FIELD_NORMAL;
+ recinfo[3].length=STANDARD_LENGTH-7-5-9-4;
+ recinfo[3].null_bit=0;
+ recinfo[3].null_pos=0;
+ recinfo[4].type=pack_fields ? FIELD_SKIP_ZERO : 0;
+ recinfo[4].length=4;
+ recinfo[4].null_bit=0;
+ recinfo[4].null_pos=0;
+ recinfo[5].type=pack_fields ? FIELD_SKIP_ENDSPACE : 0;
+ recinfo[5].length=60;
+ recinfo[5].null_bit=0;
+ recinfo[5].null_pos=0;
+ if (use_blob)
+ {
+ recinfo[6].type=FIELD_BLOB;
+ recinfo[6].length=4+portable_sizeof_char_ptr;
+ recinfo[6].null_bit=0;
+ recinfo[6].null_pos=0;
+ }
+
+ write_count=update=dupp_keys=opt_delete=0;
+ blob_buffer=0;
+
+ for (i=1000 ; i>0 ; i--) key1[i]=0;
+ for (i=4999 ; i>0 ; i--) key3[i]=0;
+
+ if (!silent)
+ printf("- Creating maria-file\n");
+ file= 0;
+ bzero((char*) &create_info,sizeof(create_info));
+ create_info.max_rows=(ha_rows) (rec_pointer_size ?
+ (1L << (rec_pointer_size*8))/
+ reclength : 0);
+ create_info.reloc_rows=(ha_rows) 100;
+ create_info.transactional= transactional;
+ if (maria_create(filename, record_type, keys,&keyinfo[first_key],
+ use_blob ? 7 : 6, &recinfo[0],
+ 0,(MARIA_UNIQUEDEF*) 0,
+ &create_info,create_flag))
+ goto err;
+ if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+ goto err;
+ maria_begin(file);
+ if (testflag == 1)
+ goto end;
+ if (!silent)
+ printf("- Writing key:s\n");
+ if (locking)
+ maria_lock_database(file,F_WRLCK);
+ if (write_cacheing)
+ maria_extra(file,HA_EXTRA_WRITE_CACHE,0);
+ if (opt_quick_mode)
+ maria_extra(file,HA_EXTRA_QUICK,0);
+
+ for (i=0 ; i < recant ; i++)
+ {
+ ulong blob_length;
+#if 0
+ /*
+ Starting from i==72, there was a difference between runtime and
+ log-applying. This is now fixed, by not using non_header_data_len in
+ log-applying.
+ */
+ if (i == 72) goto end;
+#endif
+ n1=rnd(1000); n2=rnd(100); n3=rnd(5000);
+ sprintf(record,"%6d:%4d:%8d:Pos: %4d ",n1,n2,n3,write_count);
+ int4store(record+STANDARD_LENGTH-4,(long) i);
+ fix_length(record,(uint) STANDARD_LENGTH+rnd(60));
+ put_blob_in_record(record+blob_pos,&blob_buffer, &blob_length);
+ DBUG_PRINT("test",("record: %d blob_length: %lu", i, blob_length));
+
+ if (maria_write(file,record))
+ {
+ if (my_errno != HA_ERR_FOUND_DUPP_KEY || key3[n3] == 0)
+ {
+ printf("Error: %d in write at record: %d\n",my_errno,i);
+ goto err;
+ }
+ if (verbose) printf(" Double key: %d at record# %d\n", n3, i);
+ }
+ else
+ {
+ if (key3[n3] == 1 && first_key <3 && first_key+keys >= 3)
+ {
+ printf("Error: Didn't get error when writing second key: '%8d'\n",n3);
+ goto err;
+ }
+ write_count++; key1[n1]++; key3[n3]=1;
+ }
+
+ /* Check if we can find key without flushing database */
+ if (i % 10 == 0)
+ {
+ for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ;
+ if (!j)
+ for (j=999 ; j>0 && key1[j] == 0 ; j--) ;
+ sprintf(key,"%6d",j);
+ if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+ {
+ printf("Test in loop: Can't find key: \"%s\"\n",key);
+ goto err;
+ }
+ }
+ }
+ if (testflag == 2)
+ goto end;
+
+ if (write_cacheing)
+ {
+ if (maria_extra(file,HA_EXTRA_NO_CACHE,0))
+ {
+ puts("got error from maria_extra(HA_EXTRA_NO_CACHE)");
+ goto err;
+ }
+ }
+#ifdef REMOVE_WHEN_WE_HAVE_RESIZE
+ if (pagecacheing)
+ resize_pagecache(maria_pagecache, maria_block_size,
+ pagecache_size * 2, 0, 0);
+#endif
+ if (!silent)
+ printf("- Delete\n");
+ if (srand_arg)
+ srand(srand_arg);
+ for (i=0 ; i<recant/10 ; i++)
+ {
+ for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ;
+ if (j != 0)
+ {
+ sprintf(key,"%6d",j);
+ if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+ {
+ printf("can't find key1: \"%s\"\n",key);
+ goto err;
+ }
+ if (bcmp(read_record+keyinfo[0].seg[0].start,
+ key, keyinfo[0].seg[0].length))
+ {
+ printf("Found wrong record when searching for key: \"%s\"\n",key);
+ goto err;
+ }
+ if (opt_delete == (uint) remove_count) /* While testing */
+ goto end;
+ if (maria_delete(file,read_record))
+ {
+ printf("error: %d; can't delete record: \"%s\"\n", my_errno,read_record);
+ goto err;
+ }
+ opt_delete++;
+ key1[atoi(read_record+keyinfo[0].seg[0].start)]--;
+ key3[atoi(read_record+keyinfo[2].seg[0].start)]=0;
+ }
+ else
+ puts("Warning: Skipping delete test because no dupplicate keys");
+ }
+ if (testflag == 3)
+ goto end;
+
+ if (!silent)
+ printf("- Update\n");
+ if (srand_arg)
+ srand(srand_arg);
+ for (i=0 ; i<recant/10 ; i++)
+ {
+ n1=rnd(1000); n2=rnd(100); n3=rnd(5000);
+ sprintf(record2,"%6d:%4d:%8d:XXX: %4d ",n1,n2,n3,update);
+ int4store(record2+STANDARD_LENGTH-4,(long) i);
+ fix_length(record2,(uint) STANDARD_LENGTH+rnd(60));
+
+ for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ;
+ if (j != 0)
+ {
+ sprintf(key,"%6d",j);
+ if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+ {
+ printf("can't find key1: \"%s\"\n",key);
+ goto err;
+ }
+ if (bcmp(read_record+keyinfo[0].seg[0].start,
+ key, keyinfo[0].seg[0].length))
+ {
+ printf("Found wrong record when searching for key: \"%s\"; Found \"%.*s\"\n",
+ key, keyinfo[0].seg[0].length,
+ read_record+keyinfo[0].seg[0].start);
+ goto err;
+ }
+ if (use_blob)
+ {
+ ulong blob_length;
+ if (i & 1)
+ put_blob_in_record(record+blob_pos,&blob_buffer, &blob_length);
+ else
+ bmove(record+blob_pos,read_record+blob_pos,8);
+ }
+ if (skip_update)
+ continue;
+ if (maria_update(file,read_record,record2))
+ {
+ if (my_errno != HA_ERR_FOUND_DUPP_KEY || key3[n3] == 0)
+ {
+ printf("error: %d; can't update:\nFrom: \"%s\"\nTo: \"%s\"\n",
+ my_errno,read_record,record2);
+ goto err;
+ }
+ if (verbose)
+ printf("Double key when tried to update:\nFrom: \"%s\"\nTo: \"%s\"\n",record,record2);
+ }
+ else
+ {
+ key1[atoi(read_record+keyinfo[0].seg[0].start)]--;
+ key3[atoi(read_record+keyinfo[2].seg[0].start)]=0;
+ key1[n1]++; key3[n3]=1;
+ update++;
+ }
+ }
+ }
+ if (testflag == 4)
+ goto end;
+
+ for (i=999, dupp_keys=j=0 ; i>0 ; i--)
+ {
+ if (key1[i] > dupp_keys)
+ {
+ dupp_keys=key1[i]; j=i;
+ }
+ }
+ sprintf(key,"%6d",j);
+ start=keyinfo[0].seg[0].start;
+ length=keyinfo[0].seg[0].length;
+ if (dupp_keys)
+ {
+ if (!silent)
+ printf("- Same key: first - next -> last - prev -> first\n");
+ DBUG_PRINT("progpos",("first - next -> last - prev -> first"));
+ if (verbose) printf(" Using key: \"%s\" Keys: %d\n",key,dupp_keys);
+
+ if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+ goto err;
+ if (maria_rsame(file,read_record2,-1))
+ goto err;
+ if (memcmp(read_record,read_record2,reclength) != 0)
+ {
+ printf("maria_rsame didn't find same record\n");
+ goto err;
+ }
+ info.recpos=maria_position(file);
+ if (maria_rfirst(file,read_record2,0) ||
+ maria_rsame_with_pos(file,read_record2,0,info.recpos) ||
+ memcmp(read_record,read_record2,reclength) != 0)
+ {
+ printf("maria_rsame_with_pos didn't find same record\n");
+ goto err;
+ }
+ {
+ info.recpos= maria_position(file);
+ int skr=maria_rnext(file,read_record2,0);
+ if ((skr && my_errno != HA_ERR_END_OF_FILE) ||
+ maria_rprev(file,read_record2,-1) ||
+ memcmp(read_record,read_record2,reclength) != 0 ||
+ info.recpos != maria_position(file))
+ {
+ printf("maria_rsame_with_pos lost position\n");
+ goto err;
+ }
+ }
+ ant=1;
+ while (maria_rnext(file,read_record2,0) == 0 &&
+ memcmp(read_record2+start,key,length) == 0) ant++;
+ if (ant != dupp_keys)
+ {
+ printf("next: Found: %d keys of %d\n",ant,dupp_keys);
+ goto err;
+ }
+ ant=0;
+ while (maria_rprev(file,read_record3,0) == 0 &&
+ bcmp(read_record3+start,key,length) == 0) ant++;
+ if (ant != dupp_keys)
+ {
+ printf("prev: Found: %d records of %d\n",ant,dupp_keys);
+ goto err;
+ }
+
+ /* Check of maria_rnext_same */
+ if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+ goto err;
+ ant=1;
+ while (!maria_rnext_same(file,read_record3) && ant < dupp_keys+10)
+ ant++;
+ if (ant != dupp_keys || my_errno != HA_ERR_END_OF_FILE)
+ {
+ printf("maria_rnext_same: Found: %d records of %d\n",ant,dupp_keys);
+ goto err;
+ }
+ }
+
+ if (!silent)
+ printf("- All keys: first - next -> last - prev -> first\n");
+ DBUG_PRINT("progpos",("All keys: first - next -> last - prev -> first"));
+ ant=1;
+ if (maria_rfirst(file,read_record,0))
+ {
+ printf("Can't find first record\n");
+ goto err;
+ }
+ while ((error=maria_rnext(file,read_record3,0)) == 0 && ant < write_count+10)
+ ant++;
+ if (ant != write_count - opt_delete || error != HA_ERR_END_OF_FILE)
+ {
+ printf("next: I found: %d records of %d (error: %d)\n",
+ ant, write_count - opt_delete, error);
+ goto err;
+ }
+ if (maria_rlast(file,read_record2,0) ||
+ bcmp(read_record2,read_record3,reclength))
+ {
+ printf("Can't find last record\n");
+ DBUG_DUMP("record2",(uchar*) read_record2,reclength);
+ DBUG_DUMP("record3",(uchar*) read_record3,reclength);
+ goto err;
+ }
+ ant=1;
+ while (maria_rprev(file,read_record3,0) == 0 && ant < write_count+10)
+ ant++;
+ if (ant != write_count - opt_delete)
+ {
+ printf("prev: I found: %d records of %d\n",ant,write_count);
+ goto err;
+ }
+ if (bcmp(read_record,read_record3,reclength))
+ {
+ printf("Can't find first record\n");
+ goto err;
+ }
+
+ if (!silent)
+ printf("- Test if: Read first - next - prev - prev - next == first\n");
+ DBUG_PRINT("progpos",("- Read first - next - prev - prev - next == first"));
+ if (maria_rfirst(file,read_record,0) ||
+ maria_rnext(file,read_record3,0) ||
+ maria_rprev(file,read_record3,0) ||
+ maria_rprev(file,read_record3,0) == 0 ||
+ maria_rnext(file,read_record3,0))
+ goto err;
+ if (bcmp(read_record,read_record3,reclength) != 0)
+ printf("Can't find first record\n");
+
+ if (!silent)
+ printf("- Test if: Read last - prev - next - next - prev == last\n");
+ DBUG_PRINT("progpos",("Read last - prev - next - next - prev == last"));
+ if (maria_rlast(file,read_record2,0) ||
+ maria_rprev(file,read_record3,0) ||
+ maria_rnext(file,read_record3,0) ||
+ maria_rnext(file,read_record3,0) == 0 ||
+ maria_rprev(file,read_record3,0))
+ goto err;
+ if (bcmp(read_record2,read_record3,reclength))
+ printf("Can't find last record\n");
+#ifdef NOT_ANYMORE
+ if (!silent)
+ puts("- Test read key-part");
+ strmov(key2,key);
+ for(i=strlen(key2) ; i-- > 1 ;)
+ {
+ key2[i]=0;
+
+ /* The following row is just to catch some bugs in the key code */
+ bzero((char*) file->lastkey,file->s->base.max_key_length*2);
+ if (maria_rkey(file,read_record,0,key2,(uint) i,HA_READ_PREFIX))
+ goto err;
+ if (bcmp(read_record+start,key,(uint) i))
+ {
+ puts("Didn't find right record");
+ goto err;
+ }
+ }
+#endif
+ if (dupp_keys > 2)
+ {
+ if (!silent)
+ printf("- Read key (first) - next - delete - next -> last\n");
+ DBUG_PRINT("progpos",("first - next - delete - next -> last"));
+ if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+ goto err;
+ if (maria_rnext(file,read_record3,0)) goto err;
+ if (maria_delete(file,read_record3)) goto err;
+ opt_delete++;
+ ant=1;
+ while (maria_rnext(file,read_record3,0) == 0 &&
+ bcmp(read_record3+start,key,length) == 0) ant++;
+ if (ant != dupp_keys-1)
+ {
+ printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-1);
+ goto err;
+ }
+ }
+ if (dupp_keys>4)
+ {
+ if (!silent)
+ printf("- Read last of key - prev - delete - prev -> first\n");
+ DBUG_PRINT("progpos",("last - prev - delete - prev -> first"));
+ if (maria_rprev(file,read_record3,0)) goto err;
+ if (maria_rprev(file,read_record3,0)) goto err;
+ if (maria_delete(file,read_record3)) goto err;
+ opt_delete++;
+ ant=1;
+ while (maria_rprev(file,read_record3,0) == 0 &&
+ bcmp(read_record3+start,key,length) == 0) ant++;
+ if (ant != dupp_keys-2)
+ {
+ printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-2);
+ goto err;
+ }
+ }
+ if (dupp_keys > 6)
+ {
+ if (!silent)
+ printf("- Read first - delete - next -> last\n");
+ DBUG_PRINT("progpos",("first - delete - next -> last"));
+ if (maria_rkey(file,read_record3,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+ goto err;
+ if (maria_delete(file,read_record3)) goto err;
+ opt_delete++;
+ ant=1;
+ if (maria_rnext(file,read_record,0))
+ goto err; /* Skall finnas poster */
+ while (maria_rnext(file,read_record3,0) == 0 &&
+ bcmp(read_record3+start,key,length) == 0) ant++;
+ if (ant != dupp_keys-3)
+ {
+ printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-3);
+ goto err;
+ }
+
+ if (!silent)
+ printf("- Read last - delete - prev -> first\n");
+ DBUG_PRINT("progpos",("last - delete - prev -> first"));
+ if (maria_rprev(file,read_record3,0)) goto err;
+ if (maria_delete(file,read_record3)) goto err;
+ opt_delete++;
+ ant=0;
+ while (maria_rprev(file,read_record3,0) == 0 &&
+ bcmp(read_record3+start,key,length) == 0) ant++;
+ if (ant != dupp_keys-4)
+ {
+ printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-4);
+ goto err;
+ }
+ }
+
+ if (!silent)
+ puts("- Test if: Read rrnd - same");
+ DBUG_PRINT("progpos",("Read rrnd - same"));
+ assert(maria_scan_init(file) == 0);
+ for (i=0 ; i < write_count ; i++)
+ {
+ int tmp;
+ if ((tmp= maria_scan(file,read_record)) &&
+ tmp != HA_ERR_END_OF_FILE &&
+ tmp != HA_ERR_RECORD_DELETED)
+ {
+ printf("Got error %d when scanning table\n", tmp);
+ break;
+ }
+ }
+ maria_scan_end(file);
+ if (i != write_count && i != write_count - opt_delete)
+ {
+ printf("Found wrong number of rows while scanning table\n");
+ goto err;
+ }
+
+ bmove(read_record2,read_record,reclength);
+ for (i=min(2,keys) ; i-- > 0 ;)
+ {
+ if (maria_rsame(file,read_record2,(int) i)) goto err;
+ if (bcmp(read_record,read_record2,reclength) != 0)
+ {
+ printf("maria_rsame didn't find same record\n");
+ goto err;
+ }
+ }
+ if (!silent)
+ puts("- Test maria_records_in_range");
+ maria_status(file,&info,HA_STATUS_VARIABLE);
+ for (i=0 ; i < info.keys ; i++)
+ {
+ key_range min_key, max_key;
+ if (maria_rfirst(file,read_record,(int) i) ||
+ maria_rlast(file,read_record2,(int) i))
+ goto err;
+ copy_key(file,(uint) i,(uchar*) read_record,(uchar*) key);
+ copy_key(file,(uint) i,(uchar*) read_record2,(uchar*) key2);
+ min_key.key= key;
+ min_key.keypart_map= HA_WHOLE_KEY;
+ min_key.flag= HA_READ_KEY_EXACT;
+ max_key.key= key2;
+ max_key.keypart_map= HA_WHOLE_KEY;
+ max_key.flag= HA_READ_AFTER_KEY;
+
+ range_records= maria_records_in_range(file,(int) i, &min_key, &max_key);
+ if (range_records < info.records*8/10 ||
+ range_records > info.records*12/10)
+ {
+ printf("maria_records_range returned %ld; Should be about %ld\n",
+ (long) range_records,(long) info.records);
+ goto err;
+ }
+ if (verbose)
+ {
+ printf("maria_records_range returned %ld; Exact is %ld (diff: %4.2g %%)\n",
+ (long) range_records, (long) info.records,
+ labs((long) range_records - (long) info.records)*100.0/
+ info.records);
+ }
+ }
+ for (i=0 ; i < 5 ; i++)
+ {
+ for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ;
+ for (k=rnd(1000)+1 ; k>0 && key1[k] == 0 ; k--) ;
+ if (j != 0 && k != 0)
+ {
+ key_range min_key, max_key;
+ if (j > k)
+ swap_variables(int, j, k);
+ sprintf(key,"%6d",j);
+ sprintf(key2,"%6d",k);
+
+ min_key.key= key;
+ min_key.keypart_map= HA_WHOLE_KEY;
+ min_key.flag= HA_READ_AFTER_KEY;
+ max_key.key= key2;
+ max_key.keypart_map= HA_WHOLE_KEY;
+ max_key.flag= HA_READ_BEFORE_KEY;
+ range_records= maria_records_in_range(file, 0, &min_key, &max_key);
+ records=0;
+ for (j++ ; j < k ; j++)
+ records+=key1[j];
+ if ((long) range_records < (long) records*7/10-2 ||
+ (long) range_records > (long) records*14/10+2)
+ {
+ printf("maria_records_range for key: %d returned %lu; Should be about %lu\n",
+ i, (ulong) range_records, (ulong) records);
+ goto err;
+ }
+ if (verbose && records)
+ {
+ printf("maria_records_range returned %lu; Exact is %lu (diff: %4.2g %%)\n",
+ (ulong) range_records, (ulong) records,
+ labs((long) range_records-(long) records)*100.0/records);
+
+ }
+ }
+ }
+
+ if (!silent)
+ printf("- maria_info\n");
+ maria_status(file,&info,HA_STATUS_VARIABLE | HA_STATUS_CONST);
+ if (info.records != write_count-opt_delete || info.deleted > opt_delete + update
+ || info.keys != keys)
+ {
+ puts("Wrong info from maria_info");
+ printf("Got: records: %lu delete: %lu i_keys: %d\n",
+ (ulong) info.records, (ulong) info.deleted, info.keys);
+ goto err;
+ }
+ if (verbose)
+ {
+ char buff[80];
+ get_date(buff,3,info.create_time);
+ printf("info: Created %s\n",buff);
+ get_date(buff,3,info.check_time);
+ printf("info: checked %s\n",buff);
+ get_date(buff,3,info.update_time);
+ printf("info: Modified %s\n",buff);
+ }
+
+ maria_panic(HA_PANIC_WRITE);
+ maria_panic(HA_PANIC_READ);
+ if (maria_is_changed(file))
+ puts("Warning: maria_is_changed reported that datafile was changed");
+
+ if (!silent)
+ printf("- maria_extra(CACHE) + maria_rrnd.... + maria_extra(NO_CACHE)\n");
+ if (maria_reset(file) || maria_extra(file,HA_EXTRA_CACHE,0))
+ {
+ if (locking || (!use_blob && !pack_fields))
+ {
+ puts("got error from maria_extra(HA_EXTRA_CACHE)");
+ goto err;
+ }
+ }
+ ant=0;
+ assert(maria_scan_init(file) == 0);
+ while ((error= maria_scan(file,record)) != HA_ERR_END_OF_FILE &&
+ ant < write_count + 10)
+ ant+= error ? 0 : 1;
+ maria_scan_end(file);
+ if (ant != write_count-opt_delete)
+ {
+ printf("scan with cache: I can only find: %d records of %d\n",
+ ant,write_count-opt_delete);
+ goto err;
+ }
+ if (maria_extra(file,HA_EXTRA_NO_CACHE,0))
+ {
+ puts("got error from maria_extra(HA_EXTRA_NO_CACHE)");
+ goto err;
+ }
+
+ ant=0;
+ maria_scan_init(file);
+ while ((error=maria_scan(file,record)) != HA_ERR_END_OF_FILE &&
+ ant < write_count + 10)
+ ant+= error ? 0 : 1;
+ if (ant != write_count-opt_delete)
+ {
+ printf("scan with cache: I can only find: %d records of %d\n",
+ ant,write_count-opt_delete);
+ goto err;
+ }
+
+ if (testflag == 5)
+ goto end;
+
+ if (!silent)
+ printf("- Removing keys\n");
+ DBUG_PRINT("progpos",("Removing keys"));
+ lastpos = HA_OFFSET_ERROR;
+ /* DBUG_POP(); */
+ maria_reset(file);
+ found_parts=0;
+ maria_scan_init(file);
+ while ((error= maria_scan(file,read_record)) != HA_ERR_END_OF_FILE)
+ {
+ info.recpos=maria_position(file);
+ if (lastpos >= info.recpos && lastpos != HA_OFFSET_ERROR)
+ {
+ printf("maria_rrnd didn't advance filepointer; old: %ld, new: %ld\n",
+ (long) lastpos, (long) info.recpos);
+ goto err;
+ }
+ lastpos=info.recpos;
+ if (error == 0)
+ {
+ if (opt_delete == (uint) remove_count) /* While testing */
+ goto end;
+ if (rnd(2) == 1 && maria_rsame(file,read_record,-1))
+ {
+ printf("can't find record %lx\n",(long) info.recpos);
+ goto err;
+ }
+ if (use_blob)
+ {
+ ulong blob_length,pos;
+ uchar *ptr;
+ memcpy_fixed(&ptr, read_record+blob_pos+4, sizeof(ptr));
+ longget(blob_length,read_record+blob_pos);
+ for (pos=0 ; pos < blob_length ; pos++)
+ {
+ if (ptr[pos] != (uchar) (blob_length+pos))
+ {
+ printf("Found blob with wrong info at %ld\n",(long) lastpos);
+ maria_scan_end(file);
+ my_errno= 0;
+ goto err;
+ }
+ }
+ }
+ if (maria_delete(file,read_record))
+ {
+ printf("can't delete record: %6.6s, delete_count: %d\n",
+ read_record, opt_delete);
+ maria_scan_end(file);
+ goto err;
+ }
+ opt_delete++;
+#if 0
+ /
+ /*
+ 179 is ok, 180 causes a difference between runtime and log-applying.
+ This is now fixed (we zero the last directory entry during
+ log-applying, just to eliminate this irrelevant difference).
+ */
+ if (opt_delete==180) goto end;
+#endif
+ }
+ else
+ found_parts++;
+ }
+ maria_scan_end(file);
+ if (my_errno != HA_ERR_END_OF_FILE && my_errno != HA_ERR_RECORD_DELETED)
+ printf("error: %d from maria_rrnd\n",my_errno);
+ if (write_count != opt_delete)
+ {
+ printf("Deleted only %d of %d records (%d parts)\n",opt_delete,write_count,
+ found_parts);
+ goto err;
+ }
+end:
+ if (die_in_middle_of_transaction)
+ {
+ /* As commit record is not done, UNDO entries needs to be rolled back */
+ switch (die_in_middle_of_transaction) {
+ case 1:
+ /*
+ Flush changed pages go to disk. That will also flush log. Recovery
+ will skip REDOs and apply UNDOs.
+ */
+ _ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_RELEASE,
+ FLUSH_RELEASE);
+ break;
+ case 2:
+ /*
+ Just flush log. Pages are likely to not be on disk. Recovery will
+ then execute REDOs and UNDOs.
+ */
+ if (translog_flush(file->trn->undo_lsn))
+ goto err;
+ break;
+ case 3:
+ /*
+ Flush nothing. Pages and log are likely to not be on disk. Recovery
+ will then do nothing.
+ */
+ break;
+ }
+ printf("Dying on request without maria_commit()/maria_close()\n");
+ exit(0);
+ }
+ if (maria_commit(file))
+ goto err;
+ if (maria_close(file))
+ {
+ file= 0;
+ goto err;
+ }
+ file= 0;
+ maria_panic(HA_PANIC_CLOSE); /* Should close log */
+ if (!silent)
+ {
+ printf("\nFollowing test have been made:\n");
+ printf("Write records: %d\nUpdate records: %d\nSame-key-read: %d\nDelete records: %d\n", write_count,update,dupp_keys,opt_delete);
+ if (rec_pointer_size)
+ printf("Record pointer size: %d\n",rec_pointer_size);
+ printf("maria_block_size: %lu\n", maria_block_size);
+ if (write_cacheing)
+ puts("Key cache resized");
+ if (write_cacheing)
+ puts("Write cacheing used");
+ if (write_cacheing)
+ puts("quick mode");
+ if (async_io && locking)
+ puts("Asyncron io with locking used");
+ else if (locking)
+ puts("Locking used");
+ if (use_blob)
+ puts("blobs used");
+ printf("key cache status: \n\
+blocks used:%10lu\n\
+not flushed:%10lu\n\
+w_requests: %10lu\n\
+writes: %10lu\n\
+r_requests: %10lu\n\
+reads: %10lu\n",
+ maria_pagecache->blocks_used,
+ maria_pagecache->global_blocks_changed,
+ (ulong) maria_pagecache->global_cache_w_requests,
+ (ulong) maria_pagecache->global_cache_write,
+ (ulong) maria_pagecache->global_cache_r_requests,
+ (ulong) maria_pagecache->global_cache_read);
+ }
+ end_pagecache(maria_pagecache,1);
+ my_free(blob_buffer, MYF(MY_ALLOW_ZERO_PTR));
+ my_end(silent ? MY_CHECK_ERROR : MY_CHECK_ERROR | MY_GIVE_INFO);
+ return(0);
+err:
+ printf("got error: %d when using MARIA-database\n",my_errno);
+ if (file)
+ {
+ if (maria_commit(file))
+ goto err;
+ VOID(maria_close(file));
+ }
+ maria_end();
+ return(1);
+} /* main */
+
+
+/* Read options */
+
+static void get_options(int argc, char **argv)
+{
+ char *pos,*progname;
+
+ progname= argv[0];
+
+ while (--argc >0 && *(pos = *(++argv)) == '-' ) {
+ switch(*++pos) {
+ case 'B':
+ pack_type= HA_BINARY_PACK_KEY;
+ break;
+ case 'b':
+ use_blob= 1;
+ if (*++pos)
+ use_blob= atol(pos);
+ break;
+ case 'K': /* Use key cacheing */
+ pagecacheing=1;
+ if (*++pos)
+ pagecache_size=atol(pos);
+ break;
+ case 'W': /* Use write cacheing */
+ write_cacheing=1;
+ if (*++pos)
+ my_default_record_cache_size=atoi(pos);
+ break;
+ case 'd':
+ remove_count= atoi(++pos);
+ break;
+ case 'i':
+ if (*++pos)
+ srand(srand_arg= atoi(pos));
+ break;
+ case 'L':
+ locking=1;
+ break;
+ case 'A': /* use asyncron io */
+ async_io=1;
+ if (*++pos)
+ my_default_record_cache_size=atoi(pos);
+ break;
+ case 'v': /* verbose */
+ verbose=1;
+ break;
+ case 'm': /* records */
+ if ((recant=atoi(++pos)) < 10 && testflag > 2)
+ {
+ fprintf(stderr,"record count must be >= 10 (if testflag > 2)\n");
+ exit(1);
+ }
+ break;
+ case 'e': /* maria_block_length */
+ case 'E':
+ if ((maria_block_size= atoi(++pos)) < MARIA_MIN_KEY_BLOCK_LENGTH ||
+ maria_block_size > MARIA_MAX_KEY_BLOCK_LENGTH)
+ {
+ fprintf(stderr,"Wrong maria_block_length\n");
+ exit(1);
+ }
+ maria_block_size= my_round_up_to_next_power(maria_block_size);
+ break;
+ case 'f':
+ if ((first_key=atoi(++pos)) < 0 || first_key >= MARIA_KEYS)
+ first_key=0;
+ break;
+ case 'k':
+ if ((keys=(uint) atoi(++pos)) < 1 ||
+ keys > (uint) (MARIA_KEYS-first_key))
+ keys=MARIA_KEYS-first_key;
+ break;
+ case 'M':
+ record_type= BLOCK_RECORD;
+ break;
+ case 'P':
+ pack_type=0; /* Don't use DIFF_LENGTH */
+ pack_seg=0;
+ break;
+ case 'R': /* Length of record pointer */
+ rec_pointer_size=atoi(++pos);
+ if (rec_pointer_size > 7)
+ rec_pointer_size=0;
+ break;
+ case 'S':
+ pack_fields=0; /* Static-length-records */
+ record_type= STATIC_RECORD;
+ break;
+ case 's':
+ silent=1;
+ break;
+ case 't':
+ testflag=atoi(++pos); /* testmod */
+ break;
+ case 'T':
+ transactional= 1;
+ break;
+ case 'u':
+ die_in_middle_of_transaction= atoi(++pos);
+ break;
+ case 'q':
+ opt_quick_mode=1;
+ break;
+ case 'c':
+ create_flag|= HA_CREATE_CHECKSUM;
+ break;
+ case 'D':
+ create_flag|=HA_CREATE_DELAY_KEY_WRITE;
+ break;
+ case 'g':
+ skip_update= TRUE;
+ break;
+ case '?':
+ case 'I':
+ case 'V':
+ printf("%s Ver 1.0 for %s at %s\n",progname,SYSTEM_TYPE,MACHINE_TYPE);
+ puts("By Monty, for your professional use\n");
+ printf("Usage: %s [-?AbBcDIKLPRqSsTVWltv] [-k#] [-f#] [-m#] [-e#] [-E#] [-t#]\n",
+ progname);
+ exit(0);
+ case '#':
+ DBUG_PUSH (++pos);
+ break;
+ default:
+ printf("Illegal option: '%c'\n",*pos);
+ break;
+ }
+ }
+ return;
+} /* get options */
+
+ /* Get a random value 0 <= x <= n */
+
+static uint rnd(uint max_value)
+{
+ return (uint) ((rand() & 32767)/32767.0*max_value);
+} /* rnd */
+
+
+ /* Create a variable length record */
+
+static void fix_length(uchar *rec, uint length)
+{
+ bmove(rec+STANDARD_LENGTH,
+ "0123456789012345678901234567890123456789012345678901234567890",
+ length-STANDARD_LENGTH);
+ strfill(rec+length,STANDARD_LENGTH+60-length,' ');
+} /* fix_length */
+
+
+ /* Put maybe a blob in record */
+
+static void put_blob_in_record(char *blob_pos, char **blob_buffer,
+ ulong *blob_length)
+{
+ ulong i,length;
+ if (use_blob)
+ {
+ if (rnd(10) == 0)
+ {
+ if (! *blob_buffer &&
+ !(*blob_buffer=my_malloc((uint) use_blob,MYF(MY_WME))))
+ {
+ use_blob=0;
+ return;
+ }
+ length=rnd(use_blob);
+ for (i=0 ; i < length ; i++)
+ (*blob_buffer)[i]=(char) (length+i);
+ int4store(blob_pos,length);
+ memcpy_fixed(blob_pos+4,(char*) blob_buffer,sizeof(char*));
+ *blob_length= length;
+ }
+ else
+ {
+ int4store(blob_pos,0);
+ *blob_length= 0;
+ }
+ }
+ return;
+}
+
+
+static void copy_key(MARIA_HA *info,uint inx,uchar *rec,uchar *key_buff)
+{
+ HA_KEYSEG *keyseg;
+
+ for (keyseg=info->s->keyinfo[inx].seg ; keyseg->type ; keyseg++)
+ {
+ memcpy(key_buff,rec+keyseg->start,(size_t) keyseg->length);
+ key_buff+=keyseg->length;
+ }
+ return;
+}
diff --git a/storage/maria/ma_test3.c b/storage/maria/ma_test3.c
new file mode 100644
index 00000000000..c25dd5dcdc6
--- /dev/null
+++ b/storage/maria/ma_test3.c
@@ -0,0 +1,500 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Test av locking */
+
+#ifndef __NETWARE__
+
+#include "maria.h"
+#include <sys/types.h>
+#ifdef HAVE_SYS_WAIT_H
+# include <sys/wait.h>
+#endif
+#ifndef WEXITSTATUS
+# define WEXITSTATUS(stat_val) ((unsigned)(stat_val) >> 8)
+#endif
+#ifndef WIFEXITED
+# define WIFEXITED(stat_val) (((stat_val) & 255) == 0)
+#endif
+
+
+#if defined(HAVE_LRAND48)
+#define rnd(X) (lrand48() % X)
+#define rnd_init(X) srand48(X)
+#else
+#define rnd(X) (random() % X)
+#define rnd_init(X) srandom(X)
+#endif
+
+
+const char *filename= "test3";
+uint tests=10,forks=10,pagecacheing=0;
+
+static void get_options(int argc, char *argv[]);
+void start_test(int id);
+int test_read(MARIA_HA *,int),test_write(MARIA_HA *,int,int),
+ test_update(MARIA_HA *,int,int),test_rrnd(MARIA_HA *,int);
+
+struct record {
+ char id[8];
+ char nr[4];
+ char text[10];
+} record;
+
+
+int main(int argc,char **argv)
+{
+ int status,wait_ret;
+ uint i=0;
+ MARIA_KEYDEF keyinfo[10];
+ MARIA_COLUMNDEF recinfo[10];
+ HA_KEYSEG keyseg[10][2];
+ MY_INIT(argv[0]);
+ get_options(argc,argv);
+
+ fprintf(stderr, "WARNING! this program is to test 'external locking'"
+ " (when several processes share a table through file locking)"
+ " which is not supported by Maria at all; expect errors."
+ " We may soon remove this program.\n");
+ maria_init();
+ bzero((char*) keyinfo,sizeof(keyinfo));
+ bzero((char*) recinfo,sizeof(recinfo));
+ bzero((char*) keyseg,sizeof(keyseg));
+ keyinfo[0].seg= &keyseg[0][0];
+ keyinfo[0].seg[0].start=0;
+ keyinfo[0].seg[0].length=8;
+ keyinfo[0].seg[0].type=HA_KEYTYPE_TEXT;
+ keyinfo[0].seg[0].flag=HA_SPACE_PACK;
+ keyinfo[0].key_alg=HA_KEY_ALG_BTREE;
+ keyinfo[0].keysegs=1;
+ keyinfo[0].flag = (uint8) HA_PACK_KEY;
+ keyinfo[0].block_length= 0; /* Default block length */
+ keyinfo[1].seg= &keyseg[1][0];
+ keyinfo[1].seg[0].start=8;
+ keyinfo[1].seg[0].length=4; /* Long is always 4 in maria */
+ keyinfo[1].seg[0].type=HA_KEYTYPE_LONG_INT;
+ keyinfo[1].seg[0].flag=0;
+ keyinfo[1].key_alg=HA_KEY_ALG_BTREE;
+ keyinfo[1].keysegs=1;
+ keyinfo[1].flag =HA_NOSAME;
+ keyinfo[1].block_length= 0; /* Default block length */
+
+ recinfo[0].type=0;
+ recinfo[0].length=sizeof(record.id);
+ recinfo[1].type=0;
+ recinfo[1].length=sizeof(record.nr);
+ recinfo[2].type=0;
+ recinfo[2].length=sizeof(record.text);
+
+ puts("- Creating maria-file");
+ my_delete(filename,MYF(0)); /* Remove old locks under gdb */
+ if (maria_create(filename,BLOCK_RECORD, 2, &keyinfo[0],2,&recinfo[0],0,
+ (MARIA_UNIQUEDEF*) 0, (MARIA_CREATE_INFO*) 0,0))
+ exit(1);
+
+ rnd_init(0);
+ printf("- Starting %d processes\n",forks); fflush(stdout);
+ for (i=0 ; i < forks; i++)
+ {
+ if (!fork())
+ {
+ start_test(i+1);
+ sleep(1);
+ return 0;
+ }
+ VOID(rnd(1));
+ }
+
+ for (i=0 ; i < forks ; i++)
+ while ((wait_ret=wait(&status)) && wait_ret == -1);
+ maria_end();
+ return 0;
+}
+
+
+static void get_options(int argc, char **argv)
+{
+ char *pos,*progname;
+
+ progname= argv[0];
+
+ while (--argc >0 && *(pos = *(++argv)) == '-' ) {
+ switch(*++pos) {
+ case 'f':
+ forks=atoi(++pos);
+ break;
+ case 't':
+ tests=atoi(++pos);
+ break;
+ case 'K': /* Use key cacheing */
+ pagecacheing=1;
+ break;
+ case 'A': /* All flags */
+ pagecacheing=1;
+ break;
+ case '?':
+ case 'I':
+ case 'V':
+ printf("%s Ver 1.0 for %s at %s\n",progname,SYSTEM_TYPE,MACHINE_TYPE);
+ puts("By Monty, for your professional use\n");
+ puts("Test av locking with threads\n");
+ printf("Usage: %s [-?lKA] [-f#] [-t#]\n",progname);
+ exit(0);
+ case '#':
+ DBUG_PUSH (++pos);
+ break;
+ default:
+ printf("Illegal option: '%c'\n",*pos);
+ break;
+ }
+ }
+ return;
+}
+
+
+void start_test(int id)
+{
+ uint i;
+ int error,lock_type;
+ MARIA_INFO isam_info;
+ MARIA_HA *file,*file1,*file2=0,*lock;
+
+ if (!(file1=maria_open(filename,O_RDWR,HA_OPEN_WAIT_IF_LOCKED)) ||
+ !(file2=maria_open(filename,O_RDWR,HA_OPEN_WAIT_IF_LOCKED)))
+ {
+ fprintf(stderr,"Can't open isam-file: %s\n",filename);
+ exit(1);
+ }
+ if (pagecacheing && rnd(2) == 0)
+ init_pagecache(maria_pagecache, 65536L, 0, 0, MARIA_KEY_BLOCK_LENGTH);
+ printf("Process %d, pid: %d\n",id,getpid()); fflush(stdout);
+
+ for (error=i=0 ; i < tests && !error; i++)
+ {
+ file= (rnd(2) == 1) ? file1 : file2;
+ lock=0 ; lock_type=0;
+ if (rnd(10) == 0)
+ {
+ if (maria_lock_database(lock=(rnd(2) ? file1 : file2),
+ lock_type=(rnd(2) == 0 ? F_RDLCK : F_WRLCK)))
+ {
+ fprintf(stderr,"%2d: start: Can't lock table %d\n",id,my_errno);
+ error=1;
+ break;
+ }
+ }
+ switch (rnd(4)) {
+ case 0: error=test_read(file,id); break;
+ case 1: error=test_rrnd(file,id); break;
+ case 2: error=test_write(file,id,lock_type); break;
+ case 3: error=test_update(file,id,lock_type); break;
+ }
+ if (lock)
+ maria_lock_database(lock,F_UNLCK);
+ }
+ if (!error)
+ {
+ maria_status(file1,&isam_info,HA_STATUS_VARIABLE);
+ printf("%2d: End of test. Records: %ld Deleted: %ld\n",
+ id,(long) isam_info.records, (long) isam_info.deleted);
+ fflush(stdout);
+ }
+
+ maria_close(file1);
+ maria_close(file2);
+ if (error)
+ {
+ printf("%2d: Aborted\n",id); fflush(stdout);
+ exit(1);
+ }
+}
+
+
+int test_read(MARIA_HA *file,int id)
+{
+ uint i,lock,found,next,prev;
+ ulong find;
+
+ lock=0;
+ if (rnd(2) == 0)
+ {
+ lock=1;
+ if (maria_lock_database(file,F_RDLCK))
+ {
+ fprintf(stderr,"%2d: Can't lock table %d\n",id,my_errno);
+ return 1;
+ }
+ }
+
+ found=next=prev=0;
+ for (i=0 ; i < 100 ; i++)
+ {
+ find=rnd(100000);
+ if (!maria_rkey(file,record.id,1,(uchar*) &find,
+ HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+ found++;
+ else
+ {
+ if (my_errno != HA_ERR_KEY_NOT_FOUND)
+ {
+ fprintf(stderr,"%2d: Got error %d from read in read\n",id,my_errno);
+ return 1;
+ }
+ else if (!maria_rnext(file,record.id,1))
+ next++;
+ else
+ {
+ if (my_errno != HA_ERR_END_OF_FILE)
+ {
+ fprintf(stderr,"%2d: Got error %d from rnext in read\n",id,my_errno);
+ return 1;
+ }
+ else if (!maria_rprev(file,record.id,1))
+ prev++;
+ else
+ {
+ if (my_errno != HA_ERR_END_OF_FILE)
+ {
+ fprintf(stderr,"%2d: Got error %d from rnext in read\n",
+ id,my_errno);
+ return 1;
+ }
+ }
+ }
+ }
+ }
+ if (lock)
+ {
+ if (maria_lock_database(file,F_UNLCK))
+ {
+ fprintf(stderr,"%2d: Can't unlock table\n",id);
+ return 1;
+ }
+ }
+ printf("%2d: read: found: %5d next: %5d prev: %5d\n",
+ id,found,next,prev);
+ fflush(stdout);
+ return 0;
+}
+
+
+int test_rrnd(MARIA_HA *file,int id)
+{
+ uint count,lock;
+
+ lock=0;
+ if (rnd(2) == 0)
+ {
+ lock=1;
+ if (maria_lock_database(file,F_RDLCK))
+ {
+ fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno);
+ maria_close(file);
+ return 1;
+ }
+ if (rnd(2) == 0)
+ maria_extra(file,HA_EXTRA_CACHE,0);
+ }
+
+ count=0;
+ if (maria_rrnd(file,record.id,0L))
+ {
+ if (my_errno == HA_ERR_END_OF_FILE)
+ goto end;
+ fprintf(stderr,"%2d: Can't read first record (%d)\n",id,my_errno);
+ return 1;
+ }
+ for (count=1 ; !maria_rrnd(file,record.id,HA_OFFSET_ERROR) ;count++) ;
+ if (my_errno != HA_ERR_END_OF_FILE)
+ {
+ fprintf(stderr,"%2d: Got error %d from rrnd\n",id,my_errno);
+ return 1;
+ }
+
+end:
+ if (lock)
+ {
+ maria_extra(file,HA_EXTRA_NO_CACHE,0);
+ if (maria_lock_database(file,F_UNLCK))
+ {
+ fprintf(stderr,"%2d: Can't unlock table\n",id);
+ exit(0);
+ }
+ }
+ printf("%2d: rrnd: %5d\n",id,count); fflush(stdout);
+ return 0;
+}
+
+
+int test_write(MARIA_HA *file,int id,int lock_type)
+{
+ uint i,tries,count,lock;
+
+ lock=0;
+ if (rnd(2) == 0 || lock_type == F_RDLCK)
+ {
+ lock=1;
+ if (maria_lock_database(file,F_WRLCK))
+ {
+ if (lock_type == F_RDLCK && my_errno == EDEADLK)
+ {
+ printf("%2d: write: deadlock\n",id); fflush(stdout);
+ return 0;
+ }
+ fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno);
+ maria_close(file);
+ return 1;
+ }
+ if (rnd(2) == 0)
+ maria_extra(file,HA_EXTRA_WRITE_CACHE,0);
+ }
+
+ sprintf(record.id,"%7d",getpid());
+ strnmov(record.text,"Testing...", sizeof(record.text));
+
+ tries=(uint) rnd(100)+10;
+ for (i=count=0 ; i < tries ; i++)
+ {
+ uint32 tmp=rnd(80000)+20000;
+ int4store(record.nr,tmp);
+ if (!maria_write(file,record.id))
+ count++;
+ else
+ {
+ if (my_errno != HA_ERR_FOUND_DUPP_KEY)
+ {
+ fprintf(stderr,"%2d: Got error %d (errno %d) from write\n",id,my_errno,
+ errno);
+ return 1;
+ }
+ }
+ }
+ if (lock)
+ {
+ maria_extra(file,HA_EXTRA_NO_CACHE,0);
+ if (maria_lock_database(file,F_UNLCK))
+ {
+ fprintf(stderr,"%2d: Can't unlock table\n",id);
+ exit(0);
+ }
+ }
+ printf("%2d: write: %5d\n",id,count); fflush(stdout);
+ return 0;
+}
+
+
+int test_update(MARIA_HA *file,int id,int lock_type)
+{
+ uint i,lock,found,next,prev,update;
+ uint32 tmp;
+ char find[4];
+ struct record new_record;
+
+ lock=0;
+ if (rnd(2) == 0 || lock_type == F_RDLCK)
+ {
+ lock=1;
+ if (maria_lock_database(file,F_WRLCK))
+ {
+ if (lock_type == F_RDLCK && my_errno == EDEADLK)
+ {
+ printf("%2d: write: deadlock\n",id); fflush(stdout);
+ return 0;
+ }
+ fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno);
+ return 1;
+ }
+ }
+ bzero((char*) &new_record,sizeof(new_record));
+ strmov(new_record.text,"Updated");
+
+ found=next=prev=update=0;
+ for (i=0 ; i < 100 ; i++)
+ {
+ tmp=rnd(100000);
+ int4store(find,tmp);
+ if (!maria_rkey(file,record.id,1,(uchar*) find,
+ HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+ found++;
+ else
+ {
+ if (my_errno != HA_ERR_KEY_NOT_FOUND)
+ {
+ fprintf(stderr,"%2d: Got error %d from read in update\n",id,my_errno);
+ return 1;
+ }
+ else if (!maria_rnext(file,record.id,1))
+ next++;
+ else
+ {
+ if (my_errno != HA_ERR_END_OF_FILE)
+ {
+ fprintf(stderr,"%2d: Got error %d from rnext in update\n",
+ id,my_errno);
+ return 1;
+ }
+ else if (!maria_rprev(file,record.id,1))
+ prev++;
+ else
+ {
+ if (my_errno != HA_ERR_END_OF_FILE)
+ {
+ fprintf(stderr,"%2d: Got error %d from rnext in update\n",
+ id,my_errno);
+ return 1;
+ }
+ continue;
+ }
+ }
+ }
+ memcpy_fixed(new_record.id,record.id,sizeof(record.id));
+ tmp=rnd(20000)+40000;
+ int4store(new_record.nr,tmp);
+ if (!maria_update(file,record.id,new_record.id))
+ update++;
+ else
+ {
+ if (my_errno != HA_ERR_RECORD_CHANGED &&
+ my_errno != HA_ERR_RECORD_DELETED &&
+ my_errno != HA_ERR_FOUND_DUPP_KEY)
+ {
+ fprintf(stderr,"%2d: Got error %d from update\n",id,my_errno);
+ return 1;
+ }
+ }
+ }
+ if (lock)
+ {
+ if (maria_lock_database(file,F_UNLCK))
+ {
+ fprintf(stderr,"Can't unlock table,id, error%d\n",my_errno);
+ return 1;
+ }
+ }
+ printf("%2d: update: %5d\n",id,update); fflush(stdout);
+ return 0;
+}
+
+#else /* __NETWARE__ */
+
+#include <stdio.h>
+
+main()
+{
+ fprintf(stderr,"this test has not been ported to NetWare\n");
+ return 0;
+}
+
+#endif /* __NETWARE__ */
diff --git a/storage/maria/ma_test_all.res b/storage/maria/ma_test_all.res
new file mode 100644
index 00000000000..57b0feeeae8
--- /dev/null
+++ b/storage/maria/ma_test_all.res
@@ -0,0 +1,62 @@
+Running tests with dynamic row format
+Running tests with static row format
+Running tests with block row format
+ma_test2 -s -L -K -R1 -m2000 ; Should give error 135
+Error: 135 in write at record: 1099
+got error: 135 when using MARIA-database
+./maria_chk -sm test2 will warn that 'Datafile is almost full'
+maria_chk: MARIA file test2
+maria_chk: warning: Datafile is almost full, 65516 of 65534 used
+MARIA-table 'test2' is usable but should be fixed
+
+real 0m0.808s
+user 0m0.584s
+sys 0m0.212s
+
+real 0m0.780s
+user 0m0.584s
+sys 0m0.176s
+
+real 0m0.809s
+user 0m0.616s
+sys 0m0.180s
+
+real 0m1.356s
+user 0m1.140s
+sys 0m0.188s
+
+real 0m0.783s
+user 0m0.600s
+sys 0m0.176s
+
+real 0m1.390s
+user 0m1.184s
+sys 0m0.152s
+
+real 0m1.875s
+user 0m1.632s
+sys 0m0.244s
+
+real 0m1.313s
+user 0m1.148s
+sys 0m0.160s
+
+real 0m1.846s
+user 0m1.644s
+sys 0m0.188s
+
+real 0m1.875s
+user 0m1.632s
+sys 0m0.212s
+
+real 0m1.819s
+user 0m1.672s
+sys 0m0.124s
+
+real 0m2.117s
+user 0m1.816s
+sys 0m0.292s
+
+real 0m1.871s
+user 0m1.636s
+sys 0m0.196s
diff --git a/storage/maria/ma_test_all.sh b/storage/maria/ma_test_all.sh
new file mode 100755
index 00000000000..108dffd7df7
--- /dev/null
+++ b/storage/maria/ma_test_all.sh
@@ -0,0 +1,245 @@
+#!/bin/sh
+#
+# Execute some simple basic test on MyISAM libary to check if things
+# works at all.
+
+# If you want to run this in Valgrind, you should use --trace-children=yes,
+# so that it detects problems in ma_test* and not in the shell script
+
+# Running in a "shared memory" disk is 10 times faster; you can do
+# mkdir /dev/shm/test; cd /dev/shm/test; maria_path=<path_to_maria_binaries>
+
+# Remove # from following line if you need some more information
+#set -x -v -e
+
+set -e # abort at first failure
+
+valgrind="valgrind --alignment=8 --leak-check=yes"
+silent="-s"
+suffix=""
+if [ -z "$maria_path" ]
+then
+ maria_path="."
+fi
+
+# Delete temporary files
+rm -f *.TMD
+rm -f maria_log*
+
+run_tests()
+{
+ row_type=$1
+ #
+ # First some simple tests
+ #
+ $maria_path/ma_test1$suffix $silent $row_type
+ $maria_path/maria_chk$suffix -se test1
+ $maria_path/ma_test1$suffix $silent -N $row_type
+ $maria_path/maria_chk$suffix -se test1
+ $maria_path/ma_test1$suffix $silent -P --checksum $row_type
+ $maria_path/maria_chk$suffix -se test1
+ $maria_path/ma_test1$suffix $silent -P -N $row_type
+ $maria_path/maria_chk$suffix -se test1
+ $maria_path/ma_test1$suffix $silent -B -N -R2 $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -a -k 480 --unique $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -a -N -R1 $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -p $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -p -N --unique $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -p -N --key_length=127 --checksum $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -p -N --key_length=128 $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -p --key_length=480 $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -a -B $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -a -B --key_length=64 --unique $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -a -B -k 480 --checksum $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -a -B -k 480 -N --unique --checksum $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -a -m $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -a -m -P --unique --checksum $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -a -m -P --key_length=480 --key_cache $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -m -p $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -w --unique $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -a -w --key_length=64 --checksum $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -a -w -N --key_length=480 $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -a -w --key_length=480 --checksum $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -a -b -N $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -a -b --key_length=480 $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent -p -B --key_length=480 $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent --checksum --unique $row_type
+ $maria_path/maria_chk$suffix -se test1
+ $maria_path/ma_test1$suffix $silent --unique $row_type
+ $maria_path/maria_chk$suffix -se test1
+
+ $maria_path/ma_test1$suffix $silent --key_multiple -N -S $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent --key_multiple -a -p --key_length=480 $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent --key_multiple -a -B --key_length=480 $row_type
+ $maria_path/maria_chk$suffix -sm test1
+ $maria_path/ma_test1$suffix $silent --key_multiple -P -S $row_type
+ $maria_path/maria_chk$suffix -sm test1
+
+ $maria_path/maria_pack$suffix --force -s test1
+ $maria_path/maria_chk$suffix -ess test1
+
+ $maria_path/ma_test2$suffix $silent -L -K -W -P $row_type
+ $maria_path/maria_chk$suffix -sm test2
+ $maria_path/ma_test2$suffix $silent -L -K -W -P -A $row_type
+ $maria_path/maria_chk$suffix -sm test2
+ $maria_path/ma_test2$suffix $silent -L -K -P -R3 -m50 -b1000000 $row_type
+ $maria_path/maria_chk$suffix -sm test2
+ $maria_path/ma_test2$suffix $silent -L -B $row_type
+ $maria_path/maria_chk$suffix -sm test2
+ $maria_path/ma_test2$suffix $silent -D -B -c $row_type
+ $maria_path/maria_chk$suffix -sm test2
+ $maria_path/ma_test2$suffix $silent -m10000 -e4096 -K $row_type
+ $maria_path/maria_chk$suffix -sm test2
+ $maria_path/ma_test2$suffix $silent -m10000 -e8192 -K $row_type
+ $maria_path/maria_chk$suffix -sm test2
+ $maria_path/ma_test2$suffix $silent -m10000 -e16384 -E16384 -K -L $row_type
+ $maria_path/maria_chk$suffix -sm test2
+}
+
+run_repair_tests()
+{
+ row_type=$1
+ $maria_path/ma_test1$suffix $silent --checksum $row_type
+ $maria_path/maria_chk$suffix -se test1
+ $maria_path/maria_chk$suffix -rs test1
+ $maria_path/maria_chk$suffix -se test1
+ $maria_path/maria_chk$suffix -rqs test1
+ $maria_path/maria_chk$suffix -se test1
+ $maria_path/maria_chk$suffix -rs --correct-checksum test1
+ $maria_path/maria_chk$suffix -se test1
+ $maria_path/maria_chk$suffix -rqs --correct-checksum test1
+ $maria_path/maria_chk$suffix -se test1
+ $maria_path/maria_chk$suffix -ros --correct-checksum test1
+ $maria_path/maria_chk$suffix -se test1
+ $maria_path/maria_chk$suffix -rqos --correct-checksum test1
+ $maria_path/maria_chk$suffix -se test1
+ $maria_path/ma_test2$suffix $silent -c -d1 $row_type
+ $maria_path/maria_chk$suffix -s --parallel-recover test2
+ $maria_path/maria_chk$suffix -se test2
+ $maria_path/maria_chk$suffix -s --parallel-recover --quick test2
+ $maria_path/maria_chk$suffix -se test2
+ $maria_path/ma_test2$suffix $silent -c $row_type
+ $maria_path/maria_chk$suffix -se test2
+ $maria_path/maria_chk$suffix -sr test2
+ $maria_path/maria_chk$suffix -se test2
+}
+
+run_pack_tests()
+{
+ row_type=$1
+ # check of maria_pack / maria_chk
+ $maria_path/ma_test1$suffix $silent --checksum $row_type
+ $maria_path/maria_pack$suffix --force -s test1
+ $maria_path/maria_chk$suffix -ess test1
+ $maria_path/maria_chk$suffix -rqs test1
+ $maria_path/maria_chk$suffix -es test1
+ $maria_path/maria_chk$suffix -rs test1
+ $maria_path/maria_chk$suffix -es test1
+ $maria_path/maria_chk$suffix -rus test1
+ $maria_path/maria_chk$suffix -es test1
+
+ $maria_path/ma_test1$suffix $silent --checksum -S $row_type
+ $maria_path/maria_chk$suffix -se test1
+ $maria_path/maria_chk$suffix -ros test1
+ $maria_path/maria_chk$suffix -rqs test1
+ $maria_path/maria_chk$suffix -se test1
+
+ $maria_path/maria_pack$suffix --force -s test1
+ $maria_path/maria_chk$suffix -rqs test1
+ $maria_path/maria_chk$suffix -es test1
+ $maria_path/maria_chk$suffix -rus test1
+ $maria_path/maria_chk$suffix -es test1
+
+ $maria_path/ma_test2$suffix $silent -c -d1 $row_type
+ $maria_path/maria_chk$suffix -s --parallel-recover test2
+ $maria_path/maria_chk$suffix -se test2
+ $maria_path/maria_chk$suffix -s --parallel-recover --unpack test2
+ $maria_path/maria_chk$suffix -se test2
+ $maria_path/maria_pack$suffix --force -s test1
+ $maria_path/maria_chk$suffix -s --parallel-recover --unpack test2
+ $maria_path/maria_chk$suffix -se test2
+}
+
+echo "Running tests with dynamic row format"
+run_tests ""
+run_repair_tests ""
+run_pack_tests ""
+
+echo "Running tests with static row format"
+run_tests -S
+run_repair_tests -S
+run_pack_tests -S
+
+echo "Running tests with block row format"
+run_tests -M
+run_repair_tests -M
+run_pack_tests -M
+
+echo "Running tests with block row format and transactions"
+run_tests "-M -T"
+run_repair_tests "-M -T"
+run_pack_tests "-M -T"
+
+#
+# Tests that gives warnings or errors
+#
+
+$maria_path/ma_test2$suffix $silent -L -K -W -P -S -R1 -m500
+$maria_path/maria_chk$suffix -sm test2
+echo "ma_test2$suffix $silent -L -K -R1 -m2000 ; Should give error 135"
+$maria_path/ma_test2$suffix $silent -L -K -R1 -m2000 >ma_test2_message.txt 2>&1 && false # success is failure
+cat ma_test2_message.txt
+grep "Error: 135" ma_test2_message.txt > /dev/null
+echo "$maria_path/maria_chk$suffix -sm test2 will warn that 'Datafile is almost full'"
+$maria_path/maria_chk$suffix -sm test2 >ma_test2_message.txt 2>&1
+cat ma_test2_message.txt
+grep "warning: Datafile is almost full" ma_test2_message.txt >/dev/null
+rm -f ma_test2_message.txt
+$maria_path/maria_chk$suffix -ssm test2
+
+#
+# Test that removing tables and applying the log leads to identical tables
+#
+/bin/sh $maria_path/ma_test_recovery
+
+#
+# Some timing tests
+#
+time $maria_path/ma_test2$suffix $silent
+time $maria_path/ma_test2$suffix $silent -S
+time $maria_path/ma_test2$suffix $silent -M
+time $maria_path/ma_test2$suffix $silent -B
+time $maria_path/ma_test2$suffix $silent -L
+time $maria_path/ma_test2$suffix $silent -K
+time $maria_path/ma_test2$suffix $silent -K -B
+time $maria_path/ma_test2$suffix $silent -L -B
+time $maria_path/ma_test2$suffix $silent -L -K -B
+time $maria_path/ma_test2$suffix $silent -L -K -W -B
+time $maria_path/ma_test2$suffix $silent -L -K -W -B -S
+time $maria_path/ma_test2$suffix $silent -L -K -W -B -M
+time $maria_path/ma_test2$suffix $silent -D -K -W -B -S
diff --git a/storage/maria/ma_test_recovery b/storage/maria/ma_test_recovery
new file mode 100755
index 00000000000..7c45af1e206
--- /dev/null
+++ b/storage/maria/ma_test_recovery
@@ -0,0 +1,210 @@
+#!/bin/sh
+
+set -e
+silent="-s"
+if [ -z "$maria_path" ]
+then
+ maria_path="."
+fi
+
+# test data is always put in the current directory or a tmp subdirectory of it
+tmp="./tmp"
+
+if test '!' -d $tmp
+then
+ mkdir $tmp
+fi
+
+echo "MARIA RECOVERY TESTS"
+
+check_table_is_same()
+{
+ # Computes checksum of new table and compares to checksum of old table
+ # Shows any difference in table's state (info from the index's header)
+
+ $maria_path/maria_chk -dvv $table | grep -v "Creation time:" > $tmp/maria_chk_message.txt 2>&1
+
+ # save the index file (because we want to test idempotency afterwards)
+ cp $table.MAI tmp/
+ # In the repair below it's good to use -q because it will die on any
+ # incorrectness of the data file if UNDO was badly applied.
+ # QQ: Remove the following line when we also can recover the index file
+ $maria_path/maria_chk -s -rq $table
+
+ $maria_path/maria_chk -s -e $table
+ checksum2=`$maria_path/maria_chk -dss $table`
+ if test "$checksum" != "$checksum2"
+ then
+ echo "checksum differs for $table before and after recovery"
+ return 1;
+ fi
+
+ diff $tmp/maria_chk_message.good.txt $tmp/maria_chk_message.txt > $tmp/maria_chk_diff.txt || true
+ if [ -s $tmp/maria_chk_diff.txt ]
+ then
+ echo "Differences in maria_chk -dvv, recovery not yet perfect !"
+ echo "========DIFF START======="
+ cat $tmp/maria_chk_diff.txt
+ echo "========DIFF END======="
+ fi
+ mv tmp/$table.MAI .
+}
+
+apply_log()
+{
+ # applies log, can verify if applying did write to log or not
+
+ shouldchangelog=$1
+ if [ "$shouldchangelog" != "shouldnotchangelog" ] &&
+ [ "$shouldchangelog" != "shouldchangelog" ] &&
+ [ "$shouldchangelog" != "dontknow" ]
+ then
+ echo "bad argument '$shouldchangelog'"
+ return 1
+ fi
+ log_md5=`md5sum maria_log.*`
+ echo "applying log"
+ $maria_path/maria_read_log -a > $tmp/maria_read_log_$table.txt
+ log_md5_2=`md5sum maria_log.*`
+ if [ "$log_md5" != "$log_md5_2" ]
+ then
+ if [ "$shouldchangelog" == "shouldnotchangelog" ]
+ then
+ echo "maria_read_log should not have modified the log"
+ return 1
+ fi
+ else
+ if [ "$shouldchangelog" == "shouldchangelog" ]
+ then
+ echo "maria_read_log should have modified the log"
+ return 1
+ fi
+ fi
+}
+
+# To not flood the screen, we redirect all the commands below to a text file
+# and just give a final error if their output is not as expected
+
+(
+
+# this message is to remember about the problem with -b (see @todo below)
+echo "!!!!!!!! REMEMBER to FIX this BLOB issue !!!!!!!"
+
+echo "Testing the REDO PHASE ALONE"
+# runs a program inserting/deleting rows, then moves the resulting table
+# elsewhere; applies the log and checks that the data file is
+# identical to the saved original.
+# Does not test the index file as we don't have logging for it yet.
+
+set -- "ma_test1 $silent -M -T -c" "ma_test2 $silent -L -K -W -P -M -T -c" "ma_test2 $silent -M -T -c -b"
+while [ $# != 0 ]
+do
+ prog=$1
+ rm -f maria_log.* maria_log_control
+ echo "TEST WITH $prog"
+ $maria_path/$prog
+ # derive table's name from program's name
+ table=`echo $prog | sed -e 's;.*ma_\(test[0-9]\).*;\1;' `
+ $maria_path/maria_chk -dvv $table | grep -v "Creation time:"> $tmp/maria_chk_message.good.txt 2>&1
+ checksum=`$maria_path/maria_chk -dss $table`
+ mv $table.MAD $tmp/$table.MAD.good
+ rm $table.MAI
+ apply_log "shouldnotchangelog"
+ cmp $table.MAD $tmp/$table.MAD.good
+ check_table_is_same
+ echo "testing idempotency"
+ apply_log "shouldnotchangelog"
+ cmp $table.MAD $tmp/$table.MAD.good
+ check_table_is_same
+ shift
+done
+
+echo "Testing the REDO AND UNDO PHASE"
+# The test programs look like:
+# work; commit (time T1); work; exit-without-commit (time T2)
+# We first run the test program and let it exit after T1's commit.
+# Then we run it again and let it exit at T2. Then we compare
+# and expect identity.
+
+for blobs in "" "-b" # we test table without blobs and then table with blobs
+do
+ for test_undo in 1 2 3
+ do
+ # first iteration tests rollback of insert, second tests rollback of delete
+ set -- "ma_test1 $silent -M -T -c -N $blobs" "--testflag=1" "--testflag=2 --test-undo=" "ma_test1 $silent -M -T -c -N $blobs" "--testflag=3" "--testflag=4 --test-undo=" "ma_test1 $silent -M -T -c -N $blobs" "--testflag=2" "--testflag=3 --test-undo=" "ma_test2 $silent -L -K -W -P -M -T -c $blobs" "-t1" "-t2 -u"
+ # -N (create NULL fields) is needed because --test-undo adds it anyway
+ while [ $# != 0 ]
+ do
+ prog=$1
+ commit_run_args=$2
+ abort_run_args=$3;
+ rm -f maria_log.* maria_log_control
+ echo "TEST WITH $prog $commit_run_args (commit at end)"
+ $maria_path/$prog $commit_run_args
+ # derive table's name from program's name
+ table=`echo $prog | sed -e 's;.*ma_\(test[0-9]\).*;\1;' `
+ $maria_path/maria_chk -dvv $table | grep -v "Creation time:"> $tmp/maria_chk_message.good.txt 2>&1
+ checksum=`$maria_path/maria_chk -dss $table`
+ mv $table.MAD $tmp/$table.MAD.good
+ rm $table.MAI
+ rm maria_log.* maria_log_control
+ echo "TEST WITH $prog $abort_run_args$test_undo (additional aborted work)"
+ $maria_path/$prog $abort_run_args$test_undo
+ cp $table.MAD $tmp/$table.MAD.before_undo
+ if [ $test_undo -lt 3 ]
+ then
+ apply_log "shouldchangelog" # should undo aborted work
+ else
+ # probably nothing to undo went to log or data file
+ apply_log "dontknow"
+ fi
+ cp $table.MAD $tmp/$table.MAD.after_undo
+
+ # It is impossible to do a "cmp" between .good and .after_undo,
+ # because the UNDO phase generated log
+ # records whose LSN tagged pages. Another reason is that rolling back
+ # INSERT only marks the rows free, does not empty them (optimization), so
+ # traces of the INSERT+rollback remain.
+
+ check_table_is_same
+ echo "testing idempotency"
+ apply_log "shouldnotchangelog"
+ cmp $table.MAD $tmp/$table.MAD.after_undo
+ check_table_is_same
+ echo "testing applying of CLRs to recreate table"
+ rm $table.MA?
+ apply_log "shouldnotchangelog"
+ # the cmp below fails with ma_test1+blobs! @todo RECOVERY BUG why?
+ # It is probably serious; REDOs shouldn't place rows in different
+ # positions from what the run-time code did. Indeed it may lead to
+ # more or less free space...
+ # Execution of UNDO re-inserted rows at different positions than
+ # originally. This generated REDOs which do not insert at the same
+ # positions as the execution of UNDOs, but at the same positions
+ # as before the row was originally deleted.
+ if [ "$blobs" == "" ]
+ then
+ cmp $table.MAD $tmp/$table.MAD.after_undo
+ fi
+ check_table_is_same
+ shift 3
+ done
+ rm -f $table.* $tmp/$table* $tmp/maria_chk_*.txt $tmp/maria_read_log_$table.txt
+done
+done
+
+) 2>&1 > $tmp/ma_test_recovery.output
+
+# also note that maria_chk -dvv shows differences for ma_test2 in UNDO phase,
+# this is normal: removing records does not shrink the data/key file,
+# does not put back the "analyzed,optimized keys"(etc) index state.
+diff $maria_path/ma_test_recovery.expected $tmp/ma_test_recovery.output > /dev/null || diff_failed=1
+if [ "$diff_failed" == "1" ]
+ then
+ echo "UNEXPECTED OUTPUT OF TESTS, FAILED"
+ echo "For more info, do diff $maria_path/ma_test_recovery.expected $tmp/ma_test_recovery.output"
+ exit 1
+ fi
+echo "ALL RECOVERY TESTS OK"
+# this message is to remember about the problem with -b (see @todo above)
+echo "!!!!!!!! BUT REMEMBER to FIX this BLOB issue !!!!!!!"
diff --git a/storage/maria/ma_test_recovery.expected b/storage/maria/ma_test_recovery.expected
new file mode 100644
index 00000000000..926943b11b3
--- /dev/null
+++ b/storage/maria/ma_test_recovery.expected
@@ -0,0 +1,1123 @@
+!!!!!!!! REMEMBER to FIX this BLOB issue !!!!!!!
+Testing the REDO PHASE ALONE
+TEST WITH ma_test1 -s -M -T -c
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 3757530372
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number 1 8192 8192
+---
+> 1 2 6 unique number 1 8192
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 3757530372
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number 1 8192 8192
+---
+> 1 2 6 unique number 1 8192
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+11c11
+< Datafile length: 90112 Keyfile length: 204800
+---
+> Datafile length: 90112 Keyfile length: 8192
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+11c11
+< Datafile length: 90112 Keyfile length: 204800
+---
+> Datafile length: 90112 Keyfile length: 8192
+========DIFF END=======
+TEST WITH ma_test2 -s -M -T -c -b
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+11c11
+< Datafile length: 81920 Keyfile length: 172032
+---
+> Datafile length: 81920 Keyfile length: 8192
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+11c11
+< Datafile length: 81920 Keyfile length: 172032
+---
+> Datafile length: 81920 Keyfile length: 8192
+========DIFF END=======
+Testing the REDO AND UNDO PHASE
+TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=1 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 221293111
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 221293111
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 221293111
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 3697324514
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=1 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 2428948025
+---
+> Checksum: 3026590807
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 2428948025
+---
+> Checksum: 3026590807
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 2428948025
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -u1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 90112 Keyfile length: 204800
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 90112 Keyfile length: 204800
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 90112 Keyfile length: 8192
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=2 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 221293111
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 221293111
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 221293111
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 3697324514
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=2 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 2428948025
+---
+> Checksum: 3026590807
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 2428948025
+---
+> Checksum: 3026590807
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 2428948025
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -u2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 90112 Keyfile length: 204800
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 90112 Keyfile length: 204800
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 90112 Keyfile length: 8192
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=3 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 221293111
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 221293111
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 221293111
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 3697324514
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=3 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 2428948025
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 2428948025
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 2428948025
+---
+> Checksum: 0
+11c11
+< Datafile length: 16384 Keyfile length: 16384
+---
+> Datafile length: 16384 Keyfile length: 8192
+18c18
+< 1 2 6 unique number NULL 0 8192 8192
+---
+> 1 2 6 unique number NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -u3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 90112 Keyfile length: 204800
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 90112 Keyfile length: 204800
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 90112 Keyfile length: 8192
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=2 --test-undo=1 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 411409161
+---
+> Checksum: 0
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 411409161
+---
+> Checksum: 0
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 411409161
+---
+> Checksum: 0
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 4024695312
+---
+> Checksum: 0
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=3 --test-undo=1 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 529753687
+---
+> Checksum: 800025671
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 529753687
+---
+> Checksum: 800025671
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 529753687
+---
+> Checksum: 0
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b -t2 -u1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 81920 Keyfile length: 212992
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 81920 Keyfile length: 212992
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 81920 Keyfile length: 8192
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=2 --test-undo=2 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 411409161
+---
+> Checksum: 0
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 411409161
+---
+> Checksum: 0
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 411409161
+---
+> Checksum: 0
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 4024695312
+---
+> Checksum: 0
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=3 --test-undo=2 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 529753687
+---
+> Checksum: 800025671
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 529753687
+---
+> Checksum: 800025671
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 529753687
+---
+> Checksum: 0
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b -t2 -u2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 81920 Keyfile length: 212992
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 81920 Keyfile length: 212992
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 81920 Keyfile length: 8192
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=2 --test-undo=3 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 411409161
+---
+> Checksum: 0
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 411409161
+---
+> Checksum: 0
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 411409161
+---
+> Checksum: 0
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 4024695312
+---
+> Checksum: 0
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b --testflag=3 --test-undo=3 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 529753687
+---
+> Checksum: 0
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 529753687
+---
+> Checksum: 0
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+7c7
+< Checksum: 529753687
+---
+> Checksum: 0
+11c11
+< Datafile length: 49152 Keyfile length: 16384
+---
+> Datafile length: 49152 Keyfile length: 8192
+18c18
+< 1 2 6 unique varchar BLOB NULL 0 8192 8192
+---
+> 1 2 6 unique varchar BLOB NULL 0 8192
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b -t2 -u3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 81920 Keyfile length: 212992
+========DIFF END=======
+testing idempotency
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 81920 Keyfile length: 212992
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in maria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages
+---
+> Status: changed
+11c11
+< Datafile length: 8192 Keyfile length: 8192
+---
+> Datafile length: 81920 Keyfile length: 8192
+========DIFF END=======
diff --git a/storage/maria/ma_unique.c b/storage/maria/ma_unique.c
new file mode 100644
index 00000000000..3ab717887c7
--- /dev/null
+++ b/storage/maria/ma_unique.c
@@ -0,0 +1,235 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Functions to check if a row is unique */
+
+#include "maria_def.h"
+#include <m_ctype.h>
+
+my_bool _ma_check_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, uchar *record,
+ ha_checksum unique_hash, my_off_t disk_pos)
+{
+ my_off_t lastpos=info->cur_row.lastpos;
+ MARIA_KEYDEF *key= &info->s->keyinfo[def->key];
+ uchar *key_buff= info->lastkey2;
+ DBUG_ENTER("_ma_check_unique");
+ DBUG_PRINT("enter",("unique_hash: %lu", (ulong) unique_hash));
+
+ maria_unique_store(record+key->seg->start, unique_hash);
+ _ma_make_key(info,def->key,key_buff,record,0);
+
+ /* The above changed info->lastkey2. Inform maria_rnext_same(). */
+ info->update&= ~HA_STATE_RNEXT_SAME;
+
+ if (_ma_search(info,info->s->keyinfo+def->key,key_buff,
+ MARIA_UNIQUE_HASH_LENGTH,
+ SEARCH_FIND,info->s->state.key_root[def->key]))
+ {
+ info->page_changed=1; /* Can't optimize read next */
+ info->cur_row.lastpos= lastpos;
+ DBUG_RETURN(0); /* No matching rows */
+ }
+
+ for (;;)
+ {
+ if (info->cur_row.lastpos != disk_pos &&
+ !(*info->s->compare_unique)(info,def,record,info->cur_row.lastpos))
+ {
+ my_errno=HA_ERR_FOUND_DUPP_UNIQUE;
+ info->errkey= (int) def->key;
+ info->dup_key_pos= info->cur_row.lastpos;
+ info->page_changed= 1; /* Can't optimize read next */
+ info->cur_row.lastpos= lastpos;
+ DBUG_PRINT("info",("Found duplicate"));
+ DBUG_RETURN(1); /* Found identical */
+ }
+ if (_ma_search_next(info,info->s->keyinfo+def->key, info->lastkey,
+ MARIA_UNIQUE_HASH_LENGTH, SEARCH_BIGGER,
+ info->s->state.key_root[def->key]) ||
+ bcmp((char*) info->lastkey, (char*) key_buff,
+ MARIA_UNIQUE_HASH_LENGTH))
+ {
+ info->page_changed= 1; /* Can't optimize read next */
+ info->cur_row.lastpos= lastpos;
+ DBUG_RETURN(0); /* end of tree */
+ }
+ }
+}
+
+
+/*
+ Calculate a hash for a row
+
+ TODO
+ Add support for bit fields
+*/
+
+ha_checksum _ma_unique_hash(MARIA_UNIQUEDEF *def, const uchar *record)
+{
+ const uchar *pos, *end;
+ ha_checksum crc= 0;
+ ulong seed1=0, seed2= 4;
+ HA_KEYSEG *keyseg;
+
+ for (keyseg=def->seg ; keyseg < def->end ; keyseg++)
+ {
+ enum ha_base_keytype type=(enum ha_base_keytype) keyseg->type;
+ uint length=keyseg->length;
+
+ if (keyseg->null_bit)
+ {
+ if (record[keyseg->null_pos] & keyseg->null_bit)
+ {
+ /*
+ Change crc in a way different from an empty string or 0.
+ (This is an optimisation; The code will work even if this isn't
+ done)
+ */
+ crc=((crc << 8) + 511+
+ (crc >> (8*sizeof(ha_checksum)-8)));
+ continue;
+ }
+ }
+ pos= record+keyseg->start;
+ if (keyseg->flag & HA_VAR_LENGTH_PART)
+ {
+ uint pack_length= keyseg->bit_start;
+ uint tmp_length= (pack_length == 1 ? (uint) *(uchar*) pos :
+ uint2korr(pos));
+ pos+= pack_length; /* Skip VARCHAR length */
+ set_if_smaller(length,tmp_length);
+ }
+ else if (keyseg->flag & HA_BLOB_PART)
+ {
+ uint tmp_length= _ma_calc_blob_length(keyseg->bit_start,pos);
+ memcpy_fixed((uchar*) &pos,pos+keyseg->bit_start,sizeof(char*));
+ if (!length || length > tmp_length)
+ length=tmp_length; /* The whole blob */
+ }
+ end= pos+length;
+ if (type == HA_KEYTYPE_TEXT || type == HA_KEYTYPE_VARTEXT1 ||
+ type == HA_KEYTYPE_VARTEXT2)
+ {
+ keyseg->charset->coll->hash_sort(keyseg->charset,
+ (const uchar*) pos, length, &seed1,
+ &seed2);
+ crc^= seed1;
+ }
+ else
+ while (pos != end)
+ crc=((crc << 8) +
+ (((uchar) *(uchar*) pos++))) +
+ (crc >> (8*sizeof(ha_checksum)-8));
+ }
+ return crc;
+}
+
+
+/*
+ compare unique key for two rows
+
+ TODO
+ Add support for bit fields
+
+ RETURN
+ 0 if both rows have equal unique value
+ 1 Rows are different
+*/
+
+my_bool _ma_unique_comp(MARIA_UNIQUEDEF *def, const uchar *a, const uchar *b,
+ my_bool null_are_equal)
+{
+ const uchar *pos_a, *pos_b, *end;
+ HA_KEYSEG *keyseg;
+
+ for (keyseg=def->seg ; keyseg < def->end ; keyseg++)
+ {
+ enum ha_base_keytype type=(enum ha_base_keytype) keyseg->type;
+ uint a_length, b_length;
+ a_length= b_length= keyseg->length;
+
+ /* If part is NULL it's regarded as different */
+ if (keyseg->null_bit)
+ {
+ uint tmp;
+ if ((tmp=(a[keyseg->null_pos] & keyseg->null_bit)) !=
+ (uint) (b[keyseg->null_pos] & keyseg->null_bit))
+ return 1;
+ if (tmp)
+ {
+ if (!null_are_equal)
+ return 1;
+ continue;
+ }
+ }
+ pos_a= a+keyseg->start;
+ pos_b= b+keyseg->start;
+ if (keyseg->flag & HA_VAR_LENGTH_PART)
+ {
+ uint pack_length= keyseg->bit_start;
+ if (pack_length == 1)
+ {
+ a_length= (uint) *(uchar*) pos_a++;
+ b_length= (uint) *(uchar*) pos_b++;
+ }
+ else
+ {
+ a_length= uint2korr(pos_a);
+ b_length= uint2korr(pos_b);
+ pos_a+= 2; /* Skip VARCHAR length */
+ pos_b+= 2;
+ }
+ set_if_smaller(a_length, keyseg->length); /* Safety */
+ set_if_smaller(b_length, keyseg->length); /* safety */
+ }
+ else if (keyseg->flag & HA_BLOB_PART)
+ {
+ /* Only compare 'length' characters if length != 0 */
+ a_length= _ma_calc_blob_length(keyseg->bit_start,pos_a);
+ b_length= _ma_calc_blob_length(keyseg->bit_start,pos_b);
+ /* Check that a and b are of equal length */
+ if (keyseg->length)
+ {
+ /*
+ This is used in some cases when we are not interested in comparing
+ the whole length of the blob.
+ */
+ set_if_smaller(a_length, keyseg->length);
+ set_if_smaller(b_length, keyseg->length);
+ }
+ memcpy_fixed((uchar*) &pos_a,pos_a+keyseg->bit_start,sizeof(char*));
+ memcpy_fixed((uchar*) &pos_b,pos_b+keyseg->bit_start,sizeof(char*));
+ }
+ if (type == HA_KEYTYPE_TEXT || type == HA_KEYTYPE_VARTEXT1 ||
+ type == HA_KEYTYPE_VARTEXT2)
+ {
+ if (ha_compare_text(keyseg->charset, (uchar *) pos_a, a_length,
+ (uchar *) pos_b, b_length, 0, 1))
+ return 1;
+ }
+ else
+ {
+ if (a_length != b_length)
+ return 1;
+ end= pos_a+a_length;
+ while (pos_a != end)
+ {
+ if (*pos_a++ != *pos_b++)
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
diff --git a/storage/maria/ma_update.c b/storage/maria/ma_update.c
new file mode 100644
index 00000000000..0cb2e2b648b
--- /dev/null
+++ b/storage/maria/ma_update.c
@@ -0,0 +1,250 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Update an old row in a MARIA table */
+
+#include "ma_fulltext.h"
+#include "ma_rt_index.h"
+
+int maria_update(register MARIA_HA *info, const uchar *oldrec, uchar *newrec)
+{
+ int flag,key_changed,save_errno;
+ reg3 my_off_t pos;
+ uint i;
+ uchar old_key[HA_MAX_KEY_BUFF],*new_key;
+ bool auto_key_changed=0;
+ ulonglong changed;
+ MARIA_SHARE *share=info->s;
+ ha_checksum old_checksum;
+ DBUG_ENTER("maria_update");
+ LINT_INIT(new_key);
+ LINT_INIT(changed);
+ LINT_INIT(old_checksum);
+
+ DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage",
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ DBUG_RETURN(my_errno= HA_ERR_CRASHED););
+ if (!(info->update & HA_STATE_AKTIV))
+ {
+ DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND);
+ }
+ if (share->options & HA_OPTION_READ_ONLY_DATA)
+ {
+ DBUG_RETURN(my_errno=EACCES);
+ }
+ if (info->state->key_file_length >= share->base.margin_key_file_length)
+ {
+ DBUG_RETURN(my_errno=HA_ERR_INDEX_FILE_FULL);
+ }
+ pos= info->cur_row.lastpos;
+ if (_ma_readinfo(info,F_WRLCK,1))
+ DBUG_RETURN(my_errno);
+
+ if ((*share->compare_record)(info,oldrec))
+ {
+ save_errno= my_errno;
+ DBUG_PRINT("warning", ("Got error from compare record"));
+ goto err_end; /* Record has changed */
+ }
+
+ if (share->calc_checksum)
+ {
+ /*
+ We can't use the row based checksum as this doesn't have enough
+ precision.
+ */
+ if (info->s->calc_checksum)
+ old_checksum= (*info->s->calc_checksum)(info, oldrec);
+ }
+
+ /* Calculate and check all unique constraints */
+ key_changed=0;
+ for (i=0 ; i < share->state.header.uniques ; i++)
+ {
+ MARIA_UNIQUEDEF *def=share->uniqueinfo+i;
+ if (_ma_unique_comp(def, newrec, oldrec,1) &&
+ _ma_check_unique(info, def, newrec, _ma_unique_hash(def, newrec),
+ pos))
+ {
+ save_errno=my_errno;
+ goto err_end;
+ }
+ }
+ if (_ma_mark_file_changed(info))
+ {
+ save_errno=my_errno;
+ goto err_end;
+ }
+
+ /* Check which keys changed from the original row */
+
+ new_key= info->lastkey2;
+ changed=0;
+ for (i=0 ; i < share->base.keys ; i++)
+ {
+ if (maria_is_key_active(share->state.key_map, i))
+ {
+ if (share->keyinfo[i].flag & HA_FULLTEXT )
+ {
+ if (_ma_ft_cmp(info,i,oldrec, newrec))
+ {
+ if ((int) i == info->lastinx)
+ {
+ /*
+ We are changeing the index we are reading on. Mark that
+ the index data has changed and we need to do a full search
+ when doing read-next
+ */
+ key_changed|=HA_STATE_WRITTEN;
+ }
+ changed|=((ulonglong) 1 << i);
+ if (_ma_ft_update(info,i,(char*) old_key,oldrec,newrec,pos))
+ goto err;
+ }
+ }
+ else
+ {
+ uint new_length= _ma_make_key(info,i,new_key,newrec,pos);
+ uint old_length= _ma_make_key(info,i,old_key,oldrec,pos);
+
+ /* The above changed info->lastkey2. Inform maria_rnext_same(). */
+ info->update&= ~HA_STATE_RNEXT_SAME;
+
+ if (new_length != old_length ||
+ memcmp(old_key, new_key, new_length))
+ {
+ if ((int) i == info->lastinx)
+ key_changed|=HA_STATE_WRITTEN; /* Mark that keyfile changed */
+ changed|=((ulonglong) 1 << i);
+ share->keyinfo[i].version++;
+ if (share->keyinfo[i].ck_delete(info,i,old_key,old_length)) goto err;
+ if (share->keyinfo[i].ck_insert(info,i,new_key,new_length)) goto err;
+ if (share->base.auto_key == i+1)
+ auto_key_changed=1;
+ }
+ }
+ }
+ }
+ /*
+ If we are running with external locking, we must update the index file
+ that something has changed.
+ */
+ if (changed || !my_disable_locking)
+ key_changed|= HA_STATE_CHANGED;
+
+ if (share->calc_checksum)
+ {
+ info->cur_row.checksum= (*share->calc_checksum)(info,newrec);
+ info->state->checksum+= (info->cur_row.checksum - old_checksum);
+ /* Store new checksum in index file header */
+ key_changed|= HA_STATE_CHANGED;
+ }
+ {
+ /*
+ Don't update index file if data file is not extended and no status
+ information changed
+ */
+ MARIA_STATUS_INFO state;
+ ha_rows org_split;
+ my_off_t org_delete_link;
+
+ memcpy((char*) &state, (char*) info->state, sizeof(state));
+ org_split= share->state.split;
+ org_delete_link= share->state.dellink;
+ if ((*share->update_record)(info, pos, oldrec, newrec))
+ goto err;
+ if (!key_changed &&
+ (memcmp((char*) &state, (char*) info->state, sizeof(state)) ||
+ org_split != share->state.split ||
+ org_delete_link != share->state.dellink))
+ key_changed|= HA_STATE_CHANGED; /* Must update index file */
+ }
+ if (auto_key_changed)
+ set_if_bigger(info->s->state.auto_increment,
+ ma_retrieve_auto_increment(info, newrec));
+
+ /*
+ We can't yet have HA_STATE_AKTIV here, as block_record dosn't support
+ it
+ */
+ info->update= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED | key_changed);
+
+ /*
+ Every Maria function that updates Maria table must end with
+ call to _ma_writeinfo(). If operation (second param of
+ _ma_writeinfo()) is not 0 it sets share->changed to 1, that is
+ flags that data has changed. If operation is 0, this function
+ equals to no-op in this case.
+
+ ma_update() must always pass !0 value as operation, since even if
+ there is no index change there could be data change.
+ */
+ VOID(_ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE));
+ allow_break(); /* Allow SIGHUP & SIGINT */
+ if (info->invalidator != 0)
+ {
+ DBUG_PRINT("info", ("invalidator... '%s' (update)", info->s->open_file_name));
+ (*info->invalidator)(info->s->open_file_name);
+ info->invalidator=0;
+ }
+ DBUG_RETURN(0);
+
+err:
+ DBUG_PRINT("error",("key: %d errno: %d",i,my_errno));
+ save_errno=my_errno;
+ if (my_errno == HA_ERR_FOUND_DUPP_KEY || my_errno == HA_ERR_OUT_OF_MEM ||
+ my_errno == HA_ERR_RECORD_FILE_FULL)
+ {
+ info->errkey= (int) i;
+ flag=0;
+ do
+ {
+ if (((ulonglong) 1 << i) & changed)
+ {
+ if (share->keyinfo[i].flag & HA_FULLTEXT)
+ {
+ if ((flag++ && _ma_ft_del(info,i,(char*) new_key,newrec,pos)) ||
+ _ma_ft_add(info,i,(char*) old_key,oldrec,pos))
+ break;
+ }
+ else
+ {
+ uint new_length= _ma_make_key(info,i,new_key,newrec,pos);
+ uint old_length= _ma_make_key(info,i,old_key,oldrec,pos);
+ if ((flag++ && _ma_ck_delete(info,i,new_key,new_length)) ||
+ _ma_ck_write(info,i,old_key,old_length))
+ break;
+ }
+ }
+ } while (i-- != 0);
+ }
+ else
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ maria_mark_crashed(info);
+ }
+ info->update= (HA_STATE_CHANGED | HA_STATE_AKTIV | HA_STATE_ROW_CHANGED |
+ key_changed);
+
+ err_end:
+ VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+ allow_break(); /* Allow SIGHUP & SIGINT */
+ if (save_errno == HA_ERR_KEY_NOT_FOUND)
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ save_errno=HA_ERR_CRASHED;
+ }
+ DBUG_RETURN(my_errno=save_errno);
+} /* maria_update */
diff --git a/storage/maria/ma_write.c b/storage/maria/ma_write.c
new file mode 100644
index 00000000000..b034d71ef9d
--- /dev/null
+++ b/storage/maria/ma_write.c
@@ -0,0 +1,1102 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Write a row to a MARIA table */
+
+#include "ma_fulltext.h"
+#include "ma_rt_index.h"
+
+#define MAX_POINTER_LENGTH 8
+
+ /* Functions declared in this file */
+
+static int w_search(MARIA_HA *info,MARIA_KEYDEF *keyinfo,
+ uint comp_flag, uchar *key,
+ uint key_length, my_off_t pos, uchar *father_buff,
+ uchar *father_keypos, my_off_t father_page,
+ my_bool insert_last);
+static int _ma_balance_page(MARIA_HA *info,MARIA_KEYDEF *keyinfo,uchar *key,
+ uchar *curr_buff,uchar *father_buff,
+ uchar *father_keypos,my_off_t father_page);
+static uchar *_ma_find_last_pos(MARIA_KEYDEF *keyinfo, uchar *page,
+ uchar *key, uint *return_key_length,
+ uchar **after_key);
+int _ma_ck_write_tree(register MARIA_HA *info, uint keynr,uchar *key,
+ uint key_length);
+int _ma_ck_write_btree(register MARIA_HA *info, uint keynr,uchar *key,
+ uint key_length);
+
+
+MARIA_RECORD_POS _ma_write_init_default(MARIA_HA *info,
+ const uchar *record
+ __attribute__((unused)))
+{
+ return ((info->s->state.dellink != HA_OFFSET_ERROR &&
+ !info->append_insert_at_end) ?
+ info->s->state.dellink :
+ info->state->data_file_length);
+}
+
+my_bool _ma_write_abort_default(MARIA_HA *info __attribute__((unused)))
+{
+ return 0;
+}
+
+
+/* Write new record to a table */
+
+int maria_write(MARIA_HA *info, uchar *record)
+{
+ MARIA_SHARE *share=info->s;
+ uint i;
+ int save_errno;
+ MARIA_RECORD_POS filepos;
+ uchar *buff;
+ my_bool lock_tree= share->concurrent_insert;
+ my_bool fatal_error;
+ DBUG_ENTER("maria_write");
+ DBUG_PRINT("enter",("index_file: %d data_file: %d",
+ info->s->kfile.file, info->dfile.file));
+
+ DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage",
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ DBUG_RETURN(my_errno= HA_ERR_CRASHED););
+ if (share->options & HA_OPTION_READ_ONLY_DATA)
+ {
+ DBUG_RETURN(my_errno=EACCES);
+ }
+ if (_ma_readinfo(info,F_WRLCK,1))
+ DBUG_RETURN(my_errno);
+ dont_break(); /* Dont allow SIGHUP or SIGINT */
+
+ if (share->base.reloc == (ha_rows) 1 &&
+ share->base.records == (ha_rows) 1 &&
+ info->state->records == (ha_rows) 1)
+ { /* System file */
+ my_errno=HA_ERR_RECORD_FILE_FULL;
+ goto err2;
+ }
+ if (info->state->key_file_length >= share->base.margin_key_file_length)
+ {
+ my_errno=HA_ERR_INDEX_FILE_FULL;
+ goto err2;
+ }
+ if (_ma_mark_file_changed(info))
+ goto err2;
+
+ /* Calculate and check all unique constraints */
+ for (i=0 ; i < share->state.header.uniques ; i++)
+ {
+ if (_ma_check_unique(info,share->uniqueinfo+i,record,
+ _ma_unique_hash(share->uniqueinfo+i,record),
+ HA_OFFSET_ERROR))
+ goto err2;
+ }
+
+ if ((info->opt_flag & OPT_NO_ROWS))
+ filepos= HA_OFFSET_ERROR;
+ else
+ {
+ /*
+ This may either calculate a record or, or write the record and return
+ the record id
+ */
+ if ((filepos= (*share->write_record_init)(info, record)) ==
+ HA_OFFSET_ERROR)
+ goto err2;
+ }
+
+ /* Write all keys to indextree */
+ buff= info->lastkey2;
+ for (i=0 ; i < share->base.keys ; i++)
+ {
+ if (maria_is_key_active(share->state.key_map, i))
+ {
+ bool local_lock_tree= (lock_tree &&
+ !(info->bulk_insert &&
+ is_tree_inited(&info->bulk_insert[i])));
+ if (local_lock_tree)
+ {
+ rw_wrlock(&share->key_root_lock[i]);
+ share->keyinfo[i].version++;
+ }
+ if (share->keyinfo[i].flag & HA_FULLTEXT )
+ {
+ if (_ma_ft_add(info,i,(char*) buff,record,filepos))
+ {
+ if (local_lock_tree)
+ rw_unlock(&share->key_root_lock[i]);
+ DBUG_PRINT("error",("Got error: %d on write",my_errno));
+ goto err;
+ }
+ }
+ else
+ {
+ if (share->keyinfo[i].ck_insert(info,i,buff,
+ _ma_make_key(info,i,buff,record,
+ filepos)))
+ {
+ if (local_lock_tree)
+ rw_unlock(&share->key_root_lock[i]);
+ DBUG_PRINT("error",("Got error: %d on write",my_errno));
+ goto err;
+ }
+ }
+
+ /* The above changed info->lastkey2. Inform maria_rnext_same(). */
+ info->update&= ~HA_STATE_RNEXT_SAME;
+
+ if (local_lock_tree)
+ rw_unlock(&share->key_root_lock[i]);
+ }
+ }
+ /**
+ @todo RECOVERY BUG
+ this += must happen under log's mutex when writing the UNDO
+ */
+ if (share->calc_write_checksum)
+ info->cur_row.checksum= (*share->calc_write_checksum)(info,record);
+ if (filepos != HA_OFFSET_ERROR)
+ {
+ if ((*share->write_record)(info,record))
+ goto err;
+ /**
+ @todo when we enable multiple writers, we will have to protect
+ 'records' and 'checksum' somehow.
+ */
+ info->state->checksum+= info->cur_row.checksum;
+ }
+ if (share->base.auto_key)
+ set_if_bigger(info->s->state.auto_increment,
+ ma_retrieve_auto_increment(info, record));
+ info->update= (HA_STATE_CHANGED | HA_STATE_AKTIV | HA_STATE_WRITTEN |
+ HA_STATE_ROW_CHANGED);
+ info->state->records+= !share->now_transactional; /*otherwise already done*/
+ info->cur_row.lastpos= filepos;
+ VOID(_ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE));
+ if (info->invalidator != 0)
+ {
+ DBUG_PRINT("info", ("invalidator... '%s' (update)", info->s->open_file_name));
+ (*info->invalidator)(info->s->open_file_name);
+ info->invalidator=0;
+ }
+
+ /*
+ Update status of the table. We need to do so after each row write
+ for the log tables, as we want the new row to become visible to
+ other threads as soon as possible. We don't lock mutex here
+ (as it is required by pthread memory visibility rules) as (1) it's
+ not critical to use outdated share->is_log_table value (2) locking
+ mutex here for every write is too expensive.
+ */
+ if (share->is_log_table)
+ _ma_update_status((void*) info);
+
+ allow_break(); /* Allow SIGHUP & SIGINT */
+ DBUG_RETURN(0);
+
+err:
+ save_errno= my_errno;
+ fatal_error= 0;
+ if (my_errno == HA_ERR_FOUND_DUPP_KEY ||
+ my_errno == HA_ERR_RECORD_FILE_FULL ||
+ my_errno == HA_ERR_NULL_IN_SPATIAL ||
+ my_errno == HA_ERR_OUT_OF_MEM)
+ {
+ if (info->bulk_insert)
+ {
+ uint j;
+ for (j=0 ; j < share->base.keys ; j++)
+ maria_flush_bulk_insert(info, j);
+ }
+ info->errkey= (int) i;
+ /*
+ We delete keys in the reverse order of insertion. This is the order that
+ a rollback would do and is important for CLR_ENDs generated by
+ _ma_ft|ck_delete() and write_record_abort() to work (with any other
+ order they would cause wrong jumps in the chain).
+ */
+ while ( i-- > 0)
+ {
+ if (maria_is_key_active(share->state.key_map, i))
+ {
+ bool local_lock_tree= (lock_tree &&
+ !(info->bulk_insert &&
+ is_tree_inited(&info->bulk_insert[i])));
+ if (local_lock_tree)
+ rw_wrlock(&share->key_root_lock[i]);
+ /**
+ @todo RECOVERY BUG
+ The key deletes below should generate CLR_ENDs
+ */
+ if (share->keyinfo[i].flag & HA_FULLTEXT)
+ {
+ if (_ma_ft_del(info,i,(char*) buff,record,filepos))
+ {
+ if (local_lock_tree)
+ rw_unlock(&share->key_root_lock[i]);
+ break;
+ }
+ }
+ else
+ {
+ uint key_length= _ma_make_key(info,i,buff,record,filepos);
+ if (_ma_ck_delete(info,i,buff,key_length))
+ {
+ if (local_lock_tree)
+ rw_unlock(&share->key_root_lock[i]);
+ break;
+ }
+ }
+ if (local_lock_tree)
+ rw_unlock(&share->key_root_lock[i]);
+ }
+ }
+ }
+ else
+ fatal_error= 1;
+
+ if ((*share->write_record_abort)(info))
+ fatal_error= 1;
+ if (fatal_error)
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ maria_mark_crashed(info);
+ }
+
+ info->update= (HA_STATE_CHANGED | HA_STATE_WRITTEN | HA_STATE_ROW_CHANGED);
+ my_errno=save_errno;
+err2:
+ save_errno=my_errno;
+ DBUG_PRINT("error", ("got error: %d", save_errno));
+ VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+ allow_break(); /* Allow SIGHUP & SIGINT */
+ DBUG_RETURN(my_errno=save_errno);
+} /* maria_write */
+
+
+ /* Write one key to btree */
+
+int _ma_ck_write(MARIA_HA *info, uint keynr, uchar *key, uint key_length)
+{
+ DBUG_ENTER("_ma_ck_write");
+
+ if (info->bulk_insert && is_tree_inited(&info->bulk_insert[keynr]))
+ {
+ DBUG_RETURN(_ma_ck_write_tree(info, keynr, key, key_length));
+ }
+ else
+ {
+ DBUG_RETURN(_ma_ck_write_btree(info, keynr, key, key_length));
+ }
+} /* _ma_ck_write */
+
+
+/**********************************************************************
+ * Normal insert code *
+ **********************************************************************/
+
+int _ma_ck_write_btree(register MARIA_HA *info, uint keynr, uchar *key,
+ uint key_length)
+{
+ int error;
+ uint comp_flag;
+ MARIA_KEYDEF *keyinfo=info->s->keyinfo+keynr;
+ my_off_t *root=&info->s->state.key_root[keynr];
+ DBUG_ENTER("_ma_ck_write_btree");
+
+ if (keyinfo->flag & HA_SORT_ALLOWS_SAME)
+ comp_flag=SEARCH_BIGGER; /* Put after same key */
+ else if (keyinfo->flag & (HA_NOSAME|HA_FULLTEXT))
+ {
+ comp_flag=SEARCH_FIND | SEARCH_UPDATE; /* No duplicates */
+ if (keyinfo->flag & HA_NULL_ARE_EQUAL)
+ comp_flag|= SEARCH_NULL_ARE_EQUAL;
+ }
+ else
+ comp_flag=SEARCH_SAME; /* Keys in rec-pos order */
+
+ error= _ma_ck_real_write_btree(info, keyinfo, key, key_length,
+ root, comp_flag);
+ if (info->ft1_to_ft2)
+ {
+ if (!error)
+ error= _ma_ft_convert_to_ft2(info, keynr, key);
+ delete_dynamic(info->ft1_to_ft2);
+ my_free((uchar*)info->ft1_to_ft2, MYF(0));
+ info->ft1_to_ft2=0;
+ }
+ DBUG_RETURN(error);
+} /* _ma_ck_write_btree */
+
+
+int _ma_ck_real_write_btree(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *key, uint key_length, my_off_t *root,
+ uint comp_flag)
+{
+ int error;
+ DBUG_ENTER("_ma_ck_real_write_btree");
+ /* key_length parameter is used only if comp_flag is SEARCH_FIND */
+ if (*root == HA_OFFSET_ERROR ||
+ (error=w_search(info, keyinfo, comp_flag, key, key_length,
+ *root, (uchar*) 0, (uchar*) 0,
+ (my_off_t) 0, 1)) > 0)
+ error= _ma_enlarge_root(info,keyinfo,key,root);
+ DBUG_RETURN(error);
+} /* _ma_ck_real_write_btree */
+
+
+ /* Make a new root with key as only pointer */
+
+int _ma_enlarge_root(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key,
+ my_off_t *root)
+{
+ uint t_length,nod_flag;
+ MARIA_KEY_PARAM s_temp;
+ MARIA_SHARE *share=info->s;
+ DBUG_ENTER("_ma_enlarge_root");
+
+ nod_flag= (*root != HA_OFFSET_ERROR) ? share->base.key_reflength : 0;
+ _ma_kpointer(info,info->buff+2,*root); /* if nod */
+ t_length=(*keyinfo->pack_key)(keyinfo,nod_flag,(uchar*) 0,
+ (uchar*) 0, (uchar*) 0, key,&s_temp);
+ maria_putint(info->buff,t_length+2+nod_flag,nod_flag);
+ (*keyinfo->store_key)(keyinfo,info->buff+2+nod_flag,&s_temp);
+ info->keyread_buff_used=info->page_changed=1; /* info->buff is used */
+ if ((*root= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR ||
+ _ma_write_keypage(info,keyinfo,*root,DFLT_INIT_HITS,info->buff))
+ DBUG_RETURN(-1);
+ DBUG_RETURN(0);
+} /* _ma_enlarge_root */
+
+
+ /*
+ Search after a position for a key and store it there
+ Returns -1 = error
+ 0 = ok
+ 1 = key should be stored in higher tree
+ */
+
+static int w_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo,
+ uint comp_flag, uchar *key, uint key_length, my_off_t page,
+ uchar *father_buff, uchar *father_keypos,
+ my_off_t father_page, my_bool insert_last)
+{
+ int error,flag;
+ uint nod_flag, search_key_length;
+ uchar *temp_buff,*keypos;
+ uchar keybuff[HA_MAX_KEY_BUFF];
+ my_bool was_last_key;
+ my_off_t next_page, dup_key_pos;
+ DBUG_ENTER("w_search");
+ DBUG_PRINT("enter",("page: %ld", (long) page));
+
+ search_key_length= (comp_flag & SEARCH_FIND) ? key_length : USE_WHOLE_KEY;
+ if (!(temp_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
+ HA_MAX_KEY_BUFF*2)))
+ DBUG_RETURN(-1);
+ if (!_ma_fetch_keypage(info,keyinfo,page,DFLT_INIT_HITS,temp_buff,0))
+ goto err;
+
+ flag=(*keyinfo->bin_search)(info,keyinfo,temp_buff,key,search_key_length,
+ comp_flag, &keypos, keybuff, &was_last_key);
+ nod_flag= _ma_test_if_nod(temp_buff);
+ if (flag == 0)
+ {
+ uint tmp_key_length;
+ /* get position to record with duplicated key */
+ tmp_key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&keypos,keybuff);
+ if (tmp_key_length)
+ dup_key_pos= _ma_dpos(info,0,keybuff+tmp_key_length);
+ else
+ dup_key_pos= HA_OFFSET_ERROR;
+
+ if (keyinfo->flag & HA_FULLTEXT)
+ {
+ uint off;
+ int subkeys;
+
+ get_key_full_length_rdonly(off, keybuff);
+ subkeys=ft_sintXkorr(keybuff+off);
+ comp_flag=SEARCH_SAME;
+ if (subkeys >= 0)
+ {
+ /* normal word, one-level tree structure */
+ flag=(*keyinfo->bin_search)(info, keyinfo, temp_buff, key,
+ USE_WHOLE_KEY, comp_flag,
+ &keypos, keybuff, &was_last_key);
+ }
+ else
+ {
+ /* popular word. two-level tree. going down */
+ my_off_t root=dup_key_pos;
+ keyinfo=&info->s->ft2_keyinfo;
+ get_key_full_length_rdonly(off, key);
+ key+=off;
+ keypos-=keyinfo->keylength+nod_flag; /* we'll modify key entry 'in vivo' */
+ error= _ma_ck_real_write_btree(info, keyinfo, key, 0,
+ &root, comp_flag);
+ _ma_dpointer(info, keypos+HA_FT_WLEN, root);
+ subkeys--; /* should there be underflow protection ? */
+ DBUG_ASSERT(subkeys < 0);
+ ft_intXstore(keypos, subkeys);
+ if (!error)
+ error= _ma_write_keypage(info,keyinfo,page,DFLT_INIT_HITS,temp_buff);
+ my_afree((uchar*) temp_buff);
+ DBUG_RETURN(error);
+ }
+ }
+ else /* not HA_FULLTEXT, normal HA_NOSAME key */
+ {
+ info->dup_key_pos= dup_key_pos;
+ my_afree((uchar*) temp_buff);
+ my_errno=HA_ERR_FOUND_DUPP_KEY;
+ DBUG_RETURN(-1);
+ }
+ }
+ if (flag == MARIA_FOUND_WRONG_KEY)
+ DBUG_RETURN(-1);
+ if (!was_last_key)
+ insert_last=0;
+ next_page= _ma_kpos(nod_flag,keypos);
+ if (next_page == HA_OFFSET_ERROR ||
+ (error=w_search(info, keyinfo, comp_flag, key, key_length, next_page,
+ temp_buff, keypos, page, insert_last)) >0)
+ {
+ error= _ma_insert(info,keyinfo,key,temp_buff,keypos,keybuff,father_buff,
+ father_keypos,father_page, insert_last);
+ if (_ma_write_keypage(info,keyinfo,page,DFLT_INIT_HITS,temp_buff))
+ goto err;
+ }
+ my_afree((uchar*) temp_buff);
+ DBUG_RETURN(error);
+err:
+ my_afree((uchar*) temp_buff);
+ DBUG_PRINT("exit",("Error: %d",my_errno));
+ DBUG_RETURN (-1);
+} /* w_search */
+
+
+/*
+ Insert new key.
+
+ SYNOPSIS
+ _ma_insert()
+ info Open table information.
+ keyinfo Key definition information.
+ key New key.
+ anc_buff Key page (beginning).
+ key_pos Position in key page where to insert.
+ key_buff Copy of previous key.
+ father_buff parent key page for balancing.
+ father_key_pos position in parent key page for balancing.
+ father_page position of parent key page in file.
+ insert_last If to append at end of page.
+
+ DESCRIPTION
+ Insert new key at right of key_pos.
+
+ RETURN
+ 2 if key contains key to upper level.
+ 0 OK.
+ < 0 Error.
+*/
+
+int _ma_insert(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo,
+ uchar *key, uchar *anc_buff, uchar *key_pos, uchar *key_buff,
+ uchar *father_buff, uchar *father_key_pos, my_off_t father_page,
+ my_bool insert_last)
+{
+ uint a_length,nod_flag;
+ int t_length;
+ uchar *endpos, *prev_key;
+ MARIA_KEY_PARAM s_temp;
+ DBUG_ENTER("_ma_insert");
+ DBUG_PRINT("enter",("key_pos: 0x%lx", (ulong) key_pos));
+ DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE,keyinfo->seg,key,
+ USE_WHOLE_KEY););
+
+ nod_flag=_ma_test_if_nod(anc_buff);
+ a_length= maria_data_on_page(anc_buff);
+ endpos= anc_buff+ a_length;
+ prev_key=(key_pos == anc_buff+2+nod_flag ? (uchar*) 0 : key_buff);
+ t_length=(*keyinfo->pack_key)(keyinfo,nod_flag,
+ (key_pos == endpos ? (uchar*) 0 : key_pos),
+ prev_key, prev_key,
+ key,&s_temp);
+#ifndef DBUG_OFF
+ if (key_pos != anc_buff+2+nod_flag && (keyinfo->flag &
+ (HA_BINARY_PACK_KEY | HA_PACK_KEY)))
+ {
+ DBUG_DUMP("prev_key",(uchar*) key_buff, _ma_keylength(keyinfo,key_buff));
+ }
+ if (keyinfo->flag & HA_PACK_KEY)
+ {
+ DBUG_PRINT("test",("t_length: %d ref_len: %d",
+ t_length,s_temp.ref_length));
+ DBUG_PRINT("test",("n_ref_len: %d n_length: %d key_pos: 0x%lx",
+ s_temp.n_ref_length, s_temp.n_length, (long) s_temp.key));
+ }
+#endif
+ if (t_length > 0)
+ {
+ if (t_length >= keyinfo->maxlength*2+MAX_POINTER_LENGTH)
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ DBUG_RETURN(-1);
+ }
+ bmove_upp((uchar*) endpos+t_length,(uchar*) endpos,(uint) (endpos-key_pos));
+ }
+ else
+ {
+ if (-t_length >= keyinfo->maxlength*2+MAX_POINTER_LENGTH)
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ DBUG_RETURN(-1);
+ }
+ bmove(key_pos,key_pos-t_length,(uint) (endpos-key_pos)+t_length);
+ }
+ (*keyinfo->store_key)(keyinfo,key_pos,&s_temp);
+ a_length+=t_length;
+ maria_putint(anc_buff,a_length,nod_flag);
+ if (a_length <= keyinfo->block_length)
+ {
+ if (keyinfo->block_length - a_length < 32 &&
+ keyinfo->flag & HA_FULLTEXT && key_pos == endpos &&
+ info->s->base.key_reflength <= info->s->base.rec_reflength &&
+ info->s->options & (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD))
+ {
+ /*
+ Normal word. One-level tree. Page is almost full.
+ Let's consider converting.
+ We'll compare 'key' and the first key at anc_buff
+ */
+ uchar *a=key, *b=anc_buff+2+nod_flag;
+ uint alen, blen, ft2len=info->s->ft2_keyinfo.keylength;
+ /* the very first key on the page is always unpacked */
+ DBUG_ASSERT((*b & 128) == 0);
+#if HA_FT_MAXLEN >= 127
+ blen= mi_uint2korr(b); b+=2;
+#else
+ blen= *(uchar*) b++;
+#endif
+ get_key_length(alen,a);
+ DBUG_ASSERT(info->ft1_to_ft2==0);
+ if (alen == blen &&
+ ha_compare_text(keyinfo->seg->charset, (uchar*) a, alen,
+ (uchar*) b, blen, 0, 0) == 0)
+ {
+ /* yup. converting */
+ info->ft1_to_ft2=(DYNAMIC_ARRAY *)
+ my_malloc(sizeof(DYNAMIC_ARRAY), MYF(MY_WME));
+ my_init_dynamic_array(info->ft1_to_ft2, ft2len, 300, 50);
+
+ /*
+ now, adding all keys from the page to dynarray
+ if the page is a leaf (if not keys will be deleted later)
+ */
+ if (!nod_flag)
+ {
+ /* let's leave the first key on the page, though, because
+ we cannot easily dispatch an empty page here */
+ b+=blen+ft2len+2;
+ for (a=anc_buff+a_length ; b < a ; b+=ft2len+2)
+ insert_dynamic(info->ft1_to_ft2, (char*) b);
+
+ /* fixing the page's length - it contains only one key now */
+ maria_putint(anc_buff,2+blen+ft2len+2,0);
+ }
+ /* the rest will be done when we're back from recursion */
+ }
+ }
+ DBUG_RETURN(0); /* There is room on page */
+ }
+ /* Page is full */
+ if (nod_flag)
+ insert_last=0;
+ if (!(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) &&
+ father_buff && !insert_last)
+ DBUG_RETURN(_ma_balance_page(info,keyinfo,key,anc_buff,father_buff,
+ father_key_pos,father_page));
+ DBUG_RETURN(_ma_split_page(info,keyinfo,key,anc_buff,key_buff, insert_last));
+} /* _ma_insert */
+
+
+ /* split a full page in two and assign emerging item to key */
+
+int _ma_split_page(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo,
+ uchar *key, uchar *buff, uchar *key_buff,
+ my_bool insert_last_key)
+{
+ uint length,a_length,key_ref_length,t_length,nod_flag,key_length;
+ uchar *key_pos,*pos, *after_key;
+ my_off_t new_pos;
+ MARIA_KEY_PARAM s_temp;
+ DBUG_ENTER("maria_split_page");
+ LINT_INIT(after_key);
+ DBUG_DUMP("buff",(uchar*) buff,maria_data_on_page(buff));
+
+ if (info->s->keyinfo+info->lastinx == keyinfo)
+ info->page_changed=1; /* Info->buff is used */
+ info->keyread_buff_used=1;
+ nod_flag=_ma_test_if_nod(buff);
+ key_ref_length=2+nod_flag;
+ if (insert_last_key)
+ key_pos= _ma_find_last_pos(keyinfo,buff,key_buff, &key_length, &after_key);
+ else
+ key_pos= _ma_find_half_pos(nod_flag,keyinfo,buff,key_buff, &key_length,
+ &after_key);
+ if (!key_pos)
+ DBUG_RETURN(-1);
+
+ length=(uint) (key_pos-buff);
+ a_length= maria_data_on_page(buff);
+ maria_putint(buff,length,nod_flag);
+
+ key_pos=after_key;
+ if (nod_flag)
+ {
+ DBUG_PRINT("test",("Splitting nod"));
+ pos=key_pos-nod_flag;
+ memcpy((uchar*) info->buff+2,(uchar*) pos,(size_t) nod_flag);
+ }
+
+ /* Move middle item to key and pointer to new page */
+ if ((new_pos= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR)
+ DBUG_RETURN(-1);
+ _ma_kpointer(info, _ma_move_key(keyinfo,key,key_buff),new_pos);
+
+ /* Store new page */
+ if (!(*keyinfo->get_key)(keyinfo,nod_flag,&key_pos,key_buff))
+ DBUG_RETURN(-1);
+
+ t_length=(*keyinfo->pack_key)(keyinfo,nod_flag,(uchar *) 0,
+ (uchar*) 0, (uchar*) 0,
+ key_buff, &s_temp);
+ length=(uint) ((buff+a_length)-key_pos);
+ memcpy((uchar*) info->buff+key_ref_length+t_length,(uchar*) key_pos,
+ (size_t) length);
+ (*keyinfo->store_key)(keyinfo,info->buff+key_ref_length,&s_temp);
+ maria_putint(info->buff,length+t_length+key_ref_length,nod_flag);
+
+ if (_ma_write_keypage(info,keyinfo,new_pos,DFLT_INIT_HITS,info->buff))
+ DBUG_RETURN(-1);
+ DBUG_DUMP("key",(uchar*) key, _ma_keylength(keyinfo,key));
+ DBUG_RETURN(2); /* Middle key up */
+} /* _ma_split_page */
+
+
+ /*
+ Calculate how to much to move to split a page in two
+ Returns pointer to start of key.
+ key will contain the key.
+ return_key_length will contain the length of key
+ after_key will contain the position to where the next key starts
+ */
+
+uchar *_ma_find_half_pos(uint nod_flag, MARIA_KEYDEF *keyinfo, uchar *page,
+ uchar *key, uint *return_key_length,
+ uchar **after_key)
+{
+ uint keys,length,key_ref_length;
+ uchar *end,*lastpos;
+ DBUG_ENTER("_ma_find_half_pos");
+
+ key_ref_length=2+nod_flag;
+ length= maria_data_on_page(page)-key_ref_length;
+ page+=key_ref_length;
+ if (!(keyinfo->flag &
+ (HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY |
+ HA_BINARY_PACK_KEY)))
+ {
+ key_ref_length=keyinfo->keylength+nod_flag;
+ keys=length/(key_ref_length*2);
+ *return_key_length=keyinfo->keylength;
+ end=page+keys*key_ref_length;
+ *after_key=end+key_ref_length;
+ memcpy(key,end,key_ref_length);
+ DBUG_RETURN(end);
+ }
+
+ end=page+length/2-key_ref_length; /* This is aprox. half */
+ *key='\0';
+ do
+ {
+ lastpos=page;
+ if (!(length=(*keyinfo->get_key)(keyinfo,nod_flag,&page,key)))
+ DBUG_RETURN(0);
+ } while (page < end);
+ *return_key_length=length;
+ *after_key=page;
+ DBUG_PRINT("exit",("returns: 0x%lx page: 0x%lx half: 0x%lx",
+ (long) lastpos, (long) page, (long) end));
+ DBUG_RETURN(lastpos);
+} /* _ma_find_half_pos */
+
+
+/*
+ Split buffer at last key
+ Returns pointer to the start of the key before the last key
+ key will contain the last key
+*/
+
+static uchar *_ma_find_last_pos(MARIA_KEYDEF *keyinfo, uchar *page,
+ uchar *key, uint *return_key_length,
+ uchar **after_key)
+{
+ uint keys,length,last_length,key_ref_length;
+ uchar *end,*lastpos,*prevpos;
+ uchar key_buff[HA_MAX_KEY_BUFF];
+ DBUG_ENTER("_ma_find_last_pos");
+
+ key_ref_length=2;
+ length= maria_data_on_page(page)-key_ref_length;
+ page+=key_ref_length;
+ if (!(keyinfo->flag &
+ (HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY |
+ HA_BINARY_PACK_KEY)))
+ {
+ keys=length/keyinfo->keylength-2;
+ *return_key_length=length=keyinfo->keylength;
+ end=page+keys*length;
+ *after_key=end+length;
+ memcpy(key,end,length);
+ DBUG_RETURN(end);
+ }
+
+ LINT_INIT(prevpos);
+ LINT_INIT(last_length);
+ end=page+length-key_ref_length;
+ *key='\0';
+ length=0;
+ lastpos=page;
+ while (page < end)
+ {
+ prevpos=lastpos; lastpos=page;
+ last_length=length;
+ memcpy(key, key_buff, length); /* previous key */
+ if (!(length=(*keyinfo->get_key)(keyinfo,0,&page,key_buff)))
+ {
+ maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ DBUG_RETURN(0);
+ }
+ }
+ *return_key_length=last_length;
+ *after_key=lastpos;
+ DBUG_PRINT("exit",("returns: 0x%lx page: 0x%lx end: 0x%lx",
+ (long) prevpos,(long) page,(long) end));
+ DBUG_RETURN(prevpos);
+} /* _ma_find_last_pos */
+
+
+ /* Balance page with not packed keys with page on right/left */
+ /* returns 0 if balance was done */
+
+static int _ma_balance_page(register MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *key, uchar *curr_buff, uchar *father_buff,
+ uchar *father_key_pos, my_off_t father_page)
+{
+ my_bool right;
+ uint k_length,father_length,father_keylength,nod_flag,curr_keylength,
+ right_length,left_length,new_right_length,new_left_length,extra_length,
+ length,keys;
+ uchar *pos,*buff,*extra_buff;
+ my_off_t next_page,new_pos;
+ uchar tmp_part_key[HA_MAX_KEY_BUFF];
+ DBUG_ENTER("_ma_balance_page");
+
+ k_length=keyinfo->keylength;
+ father_length= maria_data_on_page(father_buff);
+ father_keylength=k_length+info->s->base.key_reflength;
+ nod_flag=_ma_test_if_nod(curr_buff);
+ curr_keylength=k_length+nod_flag;
+ info->page_changed=1;
+
+ if ((father_key_pos != father_buff+father_length &&
+ (info->state->records & 1)) ||
+ father_key_pos == father_buff+2+info->s->base.key_reflength)
+ {
+ right=1;
+ next_page= _ma_kpos(info->s->base.key_reflength,
+ father_key_pos+father_keylength);
+ buff=info->buff;
+ DBUG_PRINT("test",("use right page: %lu", (ulong) next_page));
+ }
+ else
+ {
+ right=0;
+ father_key_pos-=father_keylength;
+ next_page= _ma_kpos(info->s->base.key_reflength,father_key_pos);
+ /* Fix that curr_buff is to left */
+ buff=curr_buff; curr_buff=info->buff;
+ DBUG_PRINT("test",("use left page: %lu", (ulong) next_page));
+ } /* father_key_pos ptr to parting key */
+
+ if (!_ma_fetch_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,info->buff,0))
+ goto err;
+ DBUG_DUMP("next",(uchar*) info->buff,maria_data_on_page(info->buff));
+
+ /* Test if there is room to share keys */
+
+ left_length= maria_data_on_page(curr_buff);
+ right_length= maria_data_on_page(buff);
+ keys=(left_length+right_length-4-nod_flag*2)/curr_keylength;
+
+ if ((right ? right_length : left_length) + curr_keylength <=
+ keyinfo->block_length)
+ { /* Merge buffs */
+ new_left_length=2+nod_flag+(keys/2)*curr_keylength;
+ new_right_length=2+nod_flag+((keys+1)/2)*curr_keylength;
+ maria_putint(curr_buff,new_left_length,nod_flag);
+ maria_putint(buff,new_right_length,nod_flag);
+
+ if (left_length < new_left_length)
+ { /* Move keys buff -> leaf */
+ pos=curr_buff+left_length;
+ memcpy((uchar*) pos,(uchar*) father_key_pos, (size_t) k_length);
+ memcpy((uchar*) pos+k_length, (uchar*) buff+2,
+ (size_t) (length=new_left_length - left_length - k_length));
+ pos=buff+2+length;
+ memcpy((uchar*) father_key_pos,(uchar*) pos,(size_t) k_length);
+ bmove((uchar*) buff+2,(uchar*) pos+k_length,new_right_length);
+ }
+ else
+ { /* Move keys -> buff */
+
+ bmove_upp((uchar*) buff+new_right_length,(uchar*) buff+right_length,
+ right_length-2);
+ length=new_right_length-right_length-k_length;
+ memcpy((uchar*) buff+2+length,father_key_pos,(size_t) k_length);
+ pos=curr_buff+new_left_length;
+ memcpy((uchar*) father_key_pos,(uchar*) pos,(size_t) k_length);
+ memcpy((uchar*) buff+2,(uchar*) pos+k_length,(size_t) length);
+ }
+
+ if (_ma_write_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,info->buff) ||
+ _ma_write_keypage(info,keyinfo,father_page,DFLT_INIT_HITS,father_buff))
+ goto err;
+ DBUG_RETURN(0);
+ }
+
+ /* curr_buff[] and buff[] are full, lets split and make new nod */
+
+ extra_buff=info->buff+info->s->base.max_key_block_length;
+ new_left_length=new_right_length=2+nod_flag+(keys+1)/3*curr_keylength;
+ if (keys == 5) /* Too few keys to balance */
+ new_left_length-=curr_keylength;
+ extra_length=nod_flag+left_length+right_length-
+ new_left_length-new_right_length-curr_keylength;
+ DBUG_PRINT("info",("left_length: %d right_length: %d new_left_length: %d new_right_length: %d extra_length: %d",
+ left_length, right_length,
+ new_left_length, new_right_length,
+ extra_length));
+ maria_putint(curr_buff,new_left_length,nod_flag);
+ maria_putint(buff,new_right_length,nod_flag);
+ maria_putint(extra_buff,extra_length+2,nod_flag);
+
+ /* move first largest keys to new page */
+ pos=buff+right_length-extra_length;
+ memcpy((uchar*) extra_buff+2,pos,(size_t) extra_length);
+ /* Save new parting key */
+ memcpy(tmp_part_key, pos-k_length,k_length);
+ /* Make place for new keys */
+ bmove_upp((uchar*) buff+new_right_length,(uchar*) pos-k_length,
+ right_length-extra_length-k_length-2);
+ /* Copy keys from left page */
+ pos= curr_buff+new_left_length;
+ memcpy((uchar*) buff+2,(uchar*) pos+k_length,
+ (size_t) (length=left_length-new_left_length-k_length));
+ /* Copy old parting key */
+ memcpy((uchar*) buff+2+length,father_key_pos,(size_t) k_length);
+
+ /* Move new parting keys up to caller */
+ memcpy((uchar*) (right ? key : father_key_pos),pos,(size_t) k_length);
+ memcpy((uchar*) (right ? father_key_pos : key),tmp_part_key, k_length);
+
+ if ((new_pos= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR)
+ goto err;
+ _ma_kpointer(info,key+k_length,new_pos);
+ if (_ma_write_keypage(info,keyinfo,(right ? new_pos : next_page),
+ DFLT_INIT_HITS,info->buff) ||
+ _ma_write_keypage(info,keyinfo,(right ? next_page : new_pos),
+ DFLT_INIT_HITS,extra_buff))
+ goto err;
+
+ DBUG_RETURN(1); /* Middle key up */
+
+err:
+ DBUG_RETURN(-1);
+} /* _ma_balance_page */
+
+/**********************************************************************
+ * Bulk insert code *
+ **********************************************************************/
+
+typedef struct {
+ MARIA_HA *info;
+ uint keynr;
+} bulk_insert_param;
+
+
+int _ma_ck_write_tree(register MARIA_HA *info, uint keynr, uchar *key,
+ uint key_length)
+{
+ int error;
+ DBUG_ENTER("_ma_ck_write_tree");
+
+ error= tree_insert(&info->bulk_insert[keynr], key,
+ key_length + info->s->rec_reflength,
+ info->bulk_insert[keynr].custom_arg) ? 0 : HA_ERR_OUT_OF_MEM ;
+
+ DBUG_RETURN(error);
+} /* _ma_ck_write_tree */
+
+
+/* typeof(_ma_keys_compare)=qsort_cmp2 */
+
+static int keys_compare(bulk_insert_param *param, uchar *key1, uchar *key2)
+{
+ uint not_used[2];
+ return ha_key_cmp(param->info->s->keyinfo[param->keynr].seg,
+ (uchar*) key1, (uchar*) key2, USE_WHOLE_KEY, SEARCH_SAME,
+ not_used);
+}
+
+
+static int keys_free(uchar *key, TREE_FREE mode, bulk_insert_param *param)
+{
+ /*
+ Probably I can use info->lastkey here, but I'm not sure,
+ and to be safe I'd better use local lastkey.
+ */
+ uchar lastkey[HA_MAX_KEY_BUFF];
+ uint keylen;
+ MARIA_KEYDEF *keyinfo;
+
+ switch (mode) {
+ case free_init:
+ if (param->info->s->concurrent_insert)
+ {
+ rw_wrlock(&param->info->s->key_root_lock[param->keynr]);
+ param->info->s->keyinfo[param->keynr].version++;
+ }
+ return 0;
+ case free_free:
+ keyinfo=param->info->s->keyinfo+param->keynr;
+ keylen= _ma_keylength(keyinfo, key);
+ memcpy(lastkey, key, keylen);
+ return _ma_ck_write_btree(param->info,param->keynr,lastkey,
+ keylen - param->info->s->rec_reflength);
+ case free_end:
+ if (param->info->s->concurrent_insert)
+ rw_unlock(&param->info->s->key_root_lock[param->keynr]);
+ return 0;
+ }
+ return -1;
+}
+
+
+int maria_init_bulk_insert(MARIA_HA *info, ulong cache_size, ha_rows rows)
+{
+ MARIA_SHARE *share=info->s;
+ MARIA_KEYDEF *key=share->keyinfo;
+ bulk_insert_param *params;
+ uint i, num_keys, total_keylength;
+ ulonglong key_map;
+ DBUG_ENTER("_ma_init_bulk_insert");
+ DBUG_PRINT("enter",("cache_size: %lu", cache_size));
+
+ DBUG_ASSERT(!info->bulk_insert &&
+ (!rows || rows >= MARIA_MIN_ROWS_TO_USE_BULK_INSERT));
+
+ maria_clear_all_keys_active(key_map);
+ for (i=total_keylength=num_keys=0 ; i < share->base.keys ; i++)
+ {
+ if (! (key[i].flag & HA_NOSAME) && (share->base.auto_key != i + 1) &&
+ maria_is_key_active(share->state.key_map, i))
+ {
+ num_keys++;
+ maria_set_key_active(key_map, i);
+ total_keylength+=key[i].maxlength+TREE_ELEMENT_EXTRA_SIZE;
+ }
+ }
+
+ if (num_keys==0 ||
+ num_keys * MARIA_MIN_SIZE_BULK_INSERT_TREE > cache_size)
+ DBUG_RETURN(0);
+
+ if (rows && rows*total_keylength < cache_size)
+ cache_size=rows;
+ else
+ cache_size/=total_keylength*16;
+
+ info->bulk_insert=(TREE *)
+ my_malloc((sizeof(TREE)*share->base.keys+
+ sizeof(bulk_insert_param)*num_keys),MYF(0));
+
+ if (!info->bulk_insert)
+ DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+
+ params=(bulk_insert_param *)(info->bulk_insert+share->base.keys);
+ for (i=0 ; i < share->base.keys ; i++)
+ {
+ if (maria_is_key_active(key_map, i))
+ {
+ params->info=info;
+ params->keynr=i;
+ /* Only allocate a 16'th of the buffer at a time */
+ init_tree(&info->bulk_insert[i],
+ cache_size * key[i].maxlength,
+ cache_size * key[i].maxlength, 0,
+ (qsort_cmp2)keys_compare, 0,
+ (tree_element_free) keys_free, (void *)params++);
+ }
+ else
+ info->bulk_insert[i].root=0;
+ }
+
+ DBUG_RETURN(0);
+}
+
+void maria_flush_bulk_insert(MARIA_HA *info, uint inx)
+{
+ if (info->bulk_insert)
+ {
+ if (is_tree_inited(&info->bulk_insert[inx]))
+ reset_tree(&info->bulk_insert[inx]);
+ }
+}
+
+void maria_end_bulk_insert(MARIA_HA *info)
+{
+ DBUG_ENTER("maria_end_bulk_insert");
+ if (info->bulk_insert)
+ {
+ uint i;
+ for (i=0 ; i < info->s->base.keys ; i++)
+ {
+ if (is_tree_inited(& info->bulk_insert[i]))
+ {
+ delete_tree(& info->bulk_insert[i]);
+ }
+ }
+ my_free((void *)info->bulk_insert, MYF(0));
+ info->bulk_insert=0;
+ }
+ DBUG_VOID_RETURN;
+}
diff --git a/storage/maria/maria_chk.c b/storage/maria/maria_chk.c
new file mode 100644
index 00000000000..f9ed249817e
--- /dev/null
+++ b/storage/maria/maria_chk.c
@@ -0,0 +1,1841 @@
+/* Copyright (C) 2006-2003 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Describe, check and repair of MARIA tables */
+
+#include "ma_fulltext.h"
+#include <myisamchk.h>
+#include <my_bit.h>
+#include <m_ctype.h>
+#include <stdarg.h>
+#include <my_getopt.h>
+#ifdef HAVE_SYS_VADVICE_H
+#include <sys/vadvise.h>
+#endif
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+SET_STACK_SIZE(9000) /* Minimum stack size for program */
+
+#ifndef USE_RAID
+#define my_raid_create(A,B,C,D,E,F,G) my_create(A,B,C,G)
+#define my_raid_delete(A,B,C) my_delete(A,B)
+#endif
+
+static uint decode_bits;
+static char **default_argv;
+static const char *load_default_groups[]= { "maria_chk", 0 };
+static const char *set_collation_name, *opt_tmpdir;
+static CHARSET_INFO *set_collation;
+static const char *my_progname_short;
+static int stopwords_inited= 0;
+static MY_TMPDIR maria_chk_tmpdir;
+
+static const char *type_names[]=
+{
+ "impossible","char","binary", "short", "long", "float",
+ "double","number","unsigned short",
+ "unsigned long","longlong","ulonglong","int24",
+ "uint24","int8","varchar", "varbin", "varchar2", "varbin2", "bit",
+ "?","?"
+};
+
+static const char *prefix_packed_txt="packed ",
+ *bin_packed_txt="prefix ",
+ *diff_txt="stripped ",
+ *null_txt="NULL",
+ *blob_txt="BLOB ";
+
+static const char *field_pack[]=
+{
+ "","no endspace", "no prespace",
+ "no zeros", "blob", "constant", "table-lockup",
+ "always zero","varchar","unique-hash","?","?"
+};
+
+static const char *record_formats[]=
+{
+ "Fixed length", "Packed", "Compressed", "Block", "?"
+};
+
+static const char *maria_stats_method_str="nulls_unequal";
+
+static void get_options(int *argc,char * * *argv);
+static void print_version(void);
+static void usage(void);
+static int maria_chk(HA_CHECK *param, char *filename);
+static void descript(HA_CHECK *param, register MARIA_HA *info, char *name);
+static int maria_sort_records(HA_CHECK *param, register MARIA_HA *info,
+ char *name, uint sort_key,
+ my_bool write_info, my_bool update_index);
+static int sort_record_index(MARIA_SORT_PARAM *sort_param, MARIA_HA *info,
+ MARIA_KEYDEF *keyinfo,
+ my_off_t page, uchar *buff,uint sortkey,
+ File new_file, my_bool update_index);
+
+HA_CHECK check_param;
+
+ /* Main program */
+
+int main(int argc, char **argv)
+{
+ int error;
+ MY_INIT(argv[0]);
+ my_progname_short= my_progname+dirname_length(my_progname);
+
+ maria_chk_init(&check_param);
+ check_param.opt_lock_memory= 1; /* Lock memory if possible */
+ check_param.using_global_keycache = 0;
+ get_options(&argc,(char***) &argv);
+ maria_quick_table_bits=decode_bits;
+ error=0;
+ maria_init();
+
+ while (--argc >= 0)
+ {
+ int new_error=maria_chk(&check_param, *(argv++));
+ if ((check_param.testflag & T_REP_ANY) != T_REP)
+ check_param.testflag&= ~T_REP;
+ VOID(fflush(stdout));
+ VOID(fflush(stderr));
+ if ((check_param.error_printed | check_param.warning_printed) &&
+ (check_param.testflag & T_FORCE_CREATE) &&
+ (!(check_param.testflag & (T_REP | T_REP_BY_SORT | T_SORT_RECORDS |
+ T_SORT_INDEX))))
+ {
+ ulonglong old_testflag=check_param.testflag;
+ if (!(check_param.testflag & T_REP))
+ check_param.testflag|= T_REP_BY_SORT;
+ check_param.testflag&= ~T_EXTEND; /* Don't needed */
+ error|=maria_chk(&check_param, argv[-1]);
+ check_param.testflag= old_testflag;
+ VOID(fflush(stdout));
+ VOID(fflush(stderr));
+ }
+ else
+ error|=new_error;
+ if (argc && (!(check_param.testflag & T_SILENT) ||
+ check_param.testflag & T_INFO))
+ {
+ puts("\n---------\n");
+ VOID(fflush(stdout));
+ }
+ }
+ if (check_param.total_files > 1)
+ { /* Only if descript */
+ char buff[22],buff2[22];
+ if (!(check_param.testflag & T_SILENT) || check_param.testflag & T_INFO)
+ puts("\n---------\n");
+ printf("\nTotal of all %d MARIA-files:\nData records: %9s Deleted blocks: %9s\n",check_param.total_files,llstr(check_param.total_records,buff),
+ llstr(check_param.total_deleted,buff2));
+ }
+ free_defaults(default_argv);
+ free_tmpdir(&maria_chk_tmpdir);
+ maria_end();
+ my_end(check_param.testflag & T_INFO ?
+ MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
+ exit(error);
+#ifndef _lint
+ return 0; /* No compiler warning */
+#endif
+} /* main */
+
+enum options_mc {
+ OPT_CHARSETS_DIR=256, OPT_SET_COLLATION,OPT_START_CHECK_POS,
+ OPT_CORRECT_CHECKSUM, OPT_KEY_BUFFER_SIZE,
+ OPT_KEY_CACHE_BLOCK_SIZE, OPT_MARIA_BLOCK_SIZE,
+ OPT_READ_BUFFER_SIZE, OPT_WRITE_BUFFER_SIZE, OPT_SORT_BUFFER_SIZE,
+ OPT_SORT_KEY_BLOCKS, OPT_DECODE_BITS, OPT_FT_MIN_WORD_LEN,
+ OPT_FT_MAX_WORD_LEN, OPT_FT_STOPWORD_FILE,
+ OPT_MAX_RECORD_LENGTH, OPT_AUTO_CLOSE, OPT_STATS_METHOD
+};
+
+static struct my_option my_long_options[] =
+{
+ {"analyze", 'a',
+ "Analyze distribution of keys. Will make some joins in MySQL faster. You can check the calculated distribution.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#ifdef __NETWARE__
+ {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+ {"block-search", 'b',
+ "No help available.",
+ 0, 0, 0, GET_ULONG, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"backup", 'B',
+ "Make a backup of the .MYD file as 'filename-time.BAK'.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"character-sets-dir", OPT_CHARSETS_DIR,
+ "Directory where character sets are.",
+ (uchar**) &charsets_dir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"check", 'c',
+ "Check table for errors.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"check-only-changed", 'C',
+ "Check only tables that have changed since last check. It also applies to other requested actions (e.g. --analyze will be ignored if the table is already analyzed).",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"correct-checksum", OPT_CORRECT_CHECKSUM,
+ "Correct checksum information for table.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#ifndef DBUG_OFF
+ {"debug", '#',
+ "Output debug log. Often this is 'd:t:o,filename'.",
+ 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+ {"description", 'd',
+ "Prints some information about table.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"data-file-length", 'D',
+ "Max length of data file (when recreating data-file when it's full).",
+ (uchar**) &check_param.max_data_file_length,
+ (uchar**) &check_param.max_data_file_length,
+ 0, GET_LL, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"extend-check", 'e',
+ "If used when checking a table, ensure that the table is 100 percent consistent, which will take a long time. If used when repairing a table, try to recover every possible row from the data file. Normally this will also find a lot of garbage rows; Don't use this option with repair if you are not totally desperate.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"fast", 'F',
+ "Check only tables that haven't been closed properly. It also applies to other requested actions (e.g. --analyze will be ignored if the table is already analyzed).",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"force", 'f',
+ "Restart with -r if there are any errors in the table. States will be updated as with --update-state.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"HELP", 'H',
+ "Display this help and exit.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"help", '?',
+ "Display this help and exit.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"information", 'i',
+ "Print statistics information about table that is checked.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"keys-used", 'k',
+ "Tell MARIA to update only some specific keys. # is a bit mask of which keys to use. This can be used to get faster inserts.",
+ (uchar**) &check_param.keys_in_use,
+ (uchar**) &check_param.keys_in_use,
+ 0, GET_ULL, REQUIRED_ARG, -1, 0, 0, 0, 0, 0},
+ {"max-record-length", OPT_MAX_RECORD_LENGTH,
+ "Skip rows bigger than this if maria_chk can't allocate memory to hold it",
+ (uchar**) &check_param.max_record_length,
+ (uchar**) &check_param.max_record_length,
+ 0, GET_ULL, REQUIRED_ARG, LONGLONG_MAX, 0, LONGLONG_MAX, 0, 0, 0},
+ {"medium-check", 'm',
+ "Faster than extend-check, but only finds 99.99% of all errors. Should be good enough for most cases.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"quick", 'q', "Faster repair by not modifying the data file.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"read-only", 'T',
+ "Don't mark table as checked.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"recover", 'r',
+ "Can fix almost anything except unique keys that aren't unique.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"parallel-recover", 'p',
+ "Same as '-r' but creates all the keys in parallel.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"safe-recover", 'o',
+ "Uses old recovery method; Slower than '-r' but can handle a couple of cases where '-r' reports that it can't fix the data file.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"sort-recover", 'n',
+ "Force recovering with sorting even if the temporary file was very big.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#ifdef DEBUG
+ {"start-check-pos", OPT_START_CHECK_POS,
+ "No help available.",
+ 0, 0, 0, GET_ULL, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+ {"set-auto-increment", 'A',
+ "Force auto_increment to start at this or higher value. If no value is given, then sets the next auto_increment value to the highest used value for the auto key + 1.",
+ (uchar**) &check_param.auto_increment_value,
+ (uchar**) &check_param.auto_increment_value,
+ 0, GET_ULL, OPT_ARG, 0, 0, 0, 0, 0, 0},
+ {"set-collation", OPT_SET_COLLATION,
+ "Change the collation used by the index",
+ (uchar**) &set_collation_name, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"set-variable", 'O',
+ "Change the value of a variable. Please note that this option is deprecated; you can set variables directly with --variable-name=value.",
+ 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"silent", 's',
+ "Only print errors. One can use two -s to make maria_chk very silent.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"sort-index", 'S',
+ "Sort index blocks. This speeds up 'read-next' in applications.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"sort-records", 'R',
+ "Sort records according to an index. This makes your data much more localized and may speed up things. (It may be VERY slow to do a sort the first time!)",
+ (uchar**) &check_param.opt_sort_key,
+ (uchar**) &check_param.opt_sort_key,
+ 0, GET_UINT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"tmpdir", 't',
+ "Path for temporary files.",
+ (uchar**) &opt_tmpdir,
+ 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"update-state", 'U',
+ "Mark tables as crashed if any errors were found.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"unpack", 'u',
+ "Unpack file packed with mariapack.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"verbose", 'v',
+ "Print more information. This can be used with --description and --check. Use many -v for more verbosity!",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"version", 'V',
+ "Print version and exit.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"wait", 'w',
+ "Wait if table is locked.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ { "key_buffer_size", OPT_KEY_BUFFER_SIZE, "",
+ (uchar**) &check_param.use_buffers, (uchar**) &check_param.use_buffers, 0,
+ GET_ULONG, REQUIRED_ARG, (long) USE_BUFFER_INIT, (long) MALLOC_OVERHEAD,
+ (long) ~0L, (long) MALLOC_OVERHEAD, (long) IO_SIZE, 0},
+ { "read_buffer_size", OPT_READ_BUFFER_SIZE, "",
+ (uchar**) &check_param.read_buffer_length,
+ (uchar**) &check_param.read_buffer_length, 0, GET_ULONG, REQUIRED_ARG,
+ (long) READ_BUFFER_INIT, (long) MALLOC_OVERHEAD,
+ (long) ~0L, (long) MALLOC_OVERHEAD, (long) 1L, 0},
+ { "write_buffer_size", OPT_WRITE_BUFFER_SIZE, "",
+ (uchar**) &check_param.write_buffer_length,
+ (uchar**) &check_param.write_buffer_length, 0, GET_ULONG, REQUIRED_ARG,
+ (long) READ_BUFFER_INIT, (long) MALLOC_OVERHEAD,
+ (long) ~0L, (long) MALLOC_OVERHEAD, (long) 1L, 0},
+ { "sort_buffer_size", OPT_SORT_BUFFER_SIZE, "",
+ (uchar**) &check_param.sort_buffer_length,
+ (uchar**) &check_param.sort_buffer_length, 0, GET_ULONG, REQUIRED_ARG,
+ (long) SORT_BUFFER_INIT, (long) (MIN_SORT_BUFFER + MALLOC_OVERHEAD),
+ (long) ~0L, (long) MALLOC_OVERHEAD, (long) 1L, 0},
+ { "sort_key_blocks", OPT_SORT_KEY_BLOCKS, "",
+ (uchar**) &check_param.sort_key_blocks,
+ (uchar**) &check_param.sort_key_blocks, 0, GET_ULONG, REQUIRED_ARG,
+ BUFFERS_WHEN_SORTING, 4L, 100L, 0L, 1L, 0},
+ { "decode_bits", OPT_DECODE_BITS, "", (uchar**) &decode_bits,
+ (uchar**) &decode_bits, 0, GET_UINT, REQUIRED_ARG, 9L, 4L, 17L, 0L, 1L, 0},
+ { "ft_min_word_len", OPT_FT_MIN_WORD_LEN, "", (uchar**) &ft_min_word_len,
+ (uchar**) &ft_min_word_len, 0, GET_ULONG, REQUIRED_ARG, 4, 1, HA_FT_MAXCHARLEN,
+ 0, 1, 0},
+ { "ft_max_word_len", OPT_FT_MAX_WORD_LEN, "", (uchar**) &ft_max_word_len,
+ (uchar**) &ft_max_word_len, 0, GET_ULONG, REQUIRED_ARG, HA_FT_MAXCHARLEN, 10,
+ HA_FT_MAXCHARLEN, 0, 1, 0},
+ { "maria_ft_stopword_file", OPT_FT_STOPWORD_FILE,
+ "Use stopwords from this file instead of built-in list.",
+ (uchar**) &ft_stopword_file, (uchar**) &ft_stopword_file, 0, GET_STR,
+ REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"stats_method", OPT_STATS_METHOD,
+ "Specifies how index statistics collection code should threat NULLs. "
+ "Possible values of name are \"nulls_unequal\" (default behavior for 4.1/5.0), "
+ "\"nulls_equal\" (emulate 4.0 behavior), and \"nulls_ignored\".",
+ (uchar**) &maria_stats_method_str, (uchar**) &maria_stats_method_str, 0,
+ GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+
+#include <help_start.h>
+
+static void print_version(void)
+{
+ printf("%s Ver 1.0 for %s at %s\n", my_progname, SYSTEM_TYPE,
+ MACHINE_TYPE);
+ NETWARE_SET_SCREEN_MODE(1);
+}
+
+
+static void usage(void)
+{
+ print_version();
+ puts("By Monty, for your professional use");
+ puts("This software comes with NO WARRANTY: see the PUBLIC for details.\n");
+ puts("Description, check and repair of MARIA tables.");
+ puts("Used without options all tables on the command will be checked for errors");
+ printf("Usage: %s [OPTIONS] tables[.MYI]\n", my_progname_short);
+ printf("\nGlobal options:\n");
+#ifndef DBUG_OFF
+ printf("\
+ -#, --debug=... Output debug log. Often this is 'd:t:o,filename'.\n");
+#endif
+ printf("\
+ -?, --help Display this help and exit.\n\
+ -O, --set-variable var=option.\n\
+ Change the value of a variable. Please note that\n\
+ this option is deprecated; you can set variables\n\
+ directly with '--variable-name=value'.\n\
+ -t, --tmpdir=path Path for temporary files. Multiple paths can be\n\
+ specified, separated by ");
+#if defined( __WIN__) || defined(__NETWARE__)
+ printf("semicolon (;)");
+#else
+ printf("colon (:)");
+#endif
+ printf(", they will be used\n\
+ in a round-robin fashion.\n\
+ -s, --silent Only print errors. One can use two -s to make\n\
+ maria_chk very silent.\n\
+ -v, --verbose Print more information. This can be used with\n\
+ --description and --check. Use many -v for more verbosity.\n\
+ -V, --version Print version and exit.\n\
+ -w, --wait Wait if table is locked.\n\n");
+#ifdef DEBUG
+ puts(" --start-check-pos=# Start reading file at given offset.\n");
+#endif
+
+ puts("Check options (check is the default action for maria_chk):\n\
+ -c, --check Check table for errors.\n\
+ -e, --extend-check Check the table VERY throughly. Only use this in\n\
+ extreme cases as maria_chk should normally be able to\n\
+ find out if the table is ok even without this switch.\n\
+ -F, --fast Check only tables that haven't been closed properly.\n\
+ -C, --check-only-changed\n\
+ Check only tables that have changed since last check.\n\
+ -f, --force Restart with '-r' if there are any errors in the table.\n\
+ States will be updated as with '--update-state'.\n\
+ -i, --information Print statistics information about table that is checked.\n\
+ -m, --medium-check Faster than extend-check, but only finds 99.99% of\n\
+ all errors. Should be good enough for most cases.\n\
+ -U --update-state Mark tables as crashed if you find any errors.\n\
+ -T, --read-only Don't mark table as checked.\n");
+
+ puts("Repair options (When using '-r' or '-o'):\n\
+ -B, --backup Make a backup of the .MYD file as 'filename-time.BAK'.\n\
+ --correct-checksum Correct checksum information for table.\n\
+ -D, --data-file-length=# Max length of data file (when recreating data\n\
+ file when it's full).\n\
+ -e, --extend-check Try to recover every possible row from the data file\n\
+ Normally this will also find a lot of garbage rows;\n\
+ Don't use this option if you are not totally desperate.\n\
+ -f, --force Overwrite old temporary files.\n\
+ -k, --keys-used=# Tell MARIA to update only some specific keys. # is a\n\
+ bit mask of which keys to use. This can be used to\n\
+ get faster inserts.\n\
+ --max-record-length=#\n\
+ Skip rows bigger than this if maria_chk can't allocate\n\
+ memory to hold it.\n\
+ -r, --recover Can fix almost anything except unique keys that aren't\n\
+ unique.\n\
+ -n, --sort-recover Forces recovering with sorting even if the temporary\n\
+ file would be very big.\n\
+ -p, --parallel-recover\n\
+ Uses the same technique as '-r' and '-n', but creates\n\
+ all the keys in parallel, in different threads.\n\
+ -o, --safe-recover Uses old recovery method; Slower than '-r' but can\n\
+ handle a couple of cases where '-r' reports that it\n\
+ can't fix the data file.\n\
+ --character-sets-dir=...\n\
+ Directory where character sets are.\n\
+ --set-collation=name\n\
+ Change the collation used by the index.\n\
+ -q, --quick Faster repair by not modifying the data file.\n\
+ One can give a second '-q' to force maria_chk to\n\
+ modify the original datafile in case of duplicate keys.\n\
+ NOTE: Tables where the data file is currupted can't be\n\
+ fixed with this option.\n\
+ -u, --unpack Unpack file packed with mariapack.\n\
+");
+
+ puts("Other actions:\n\
+ -a, --analyze Analyze distribution of keys. Will make some joins in\n\
+ MySQL faster. You can check the calculated distribution\n\
+ by using '--description --verbose table_name'.\n\
+ --stats_method=name Specifies how index statistics collection code should\n\
+ threat NULLs. Possible values of name are \"nulls_unequal\"\n\
+ (default for 4.1/5.0), \"nulls_equal\" (emulate 4.0), and \n\
+ \"nulls_ignored\".\n\
+ -d, --description Prints some information about table.\n\
+ -A, --set-auto-increment[=value]\n\
+ Force auto_increment to start at this or higher value\n\
+ If no value is given, then sets the next auto_increment\n\
+ value to the highest used value for the auto key + 1.\n\
+ -S, --sort-index Sort index blocks. This speeds up 'read-next' in\n\
+ applications.\n\
+ -R, --sort-records=#\n\
+ Sort records according to an index. This makes your\n\
+ data much more localized and may speed up things\n\
+ (It may be VERY slow to do a sort the first time!).\n\
+ -b, --block-search=#\n\
+ Find a record, a block at given offset belongs to.");
+
+ print_defaults("my", load_default_groups);
+ my_print_variables(my_long_options);
+}
+
+#include <help_end.h>
+
+const char *maria_stats_method_names[] = {"nulls_unequal", "nulls_equal",
+ "nulls_ignored", NullS};
+TYPELIB maria_stats_method_typelib= {
+ array_elements(maria_stats_method_names) - 1, "",
+ maria_stats_method_names, NULL};
+
+ /* Read options */
+
+static my_bool
+get_one_option(int optid,
+ const struct my_option *opt __attribute__((unused)),
+ char *argument)
+{
+ switch (optid) {
+#ifdef __NETWARE__
+ case OPT_AUTO_CLOSE:
+ setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
+ break;
+#endif
+ case 'a':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_STATISTICS;
+ else
+ check_param.testflag|= T_STATISTICS;
+ break;
+ case 'A':
+ if (argument)
+ check_param.auto_increment_value= strtoull(argument, NULL, 0);
+ else
+ check_param.auto_increment_value= 0; /* Set to max used value */
+ check_param.testflag|= T_AUTO_INC;
+ break;
+ case 'b':
+ check_param.search_after_block= strtoul(argument, NULL, 10);
+ break;
+ case 'B':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_BACKUP_DATA;
+ else
+ check_param.testflag|= T_BACKUP_DATA;
+ break;
+ case 'c':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_CHECK;
+ else
+ check_param.testflag|= T_CHECK;
+ break;
+ case 'C':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~(T_CHECK | T_CHECK_ONLY_CHANGED);
+ else
+ check_param.testflag|= T_CHECK | T_CHECK_ONLY_CHANGED;
+ break;
+ case 'D':
+ check_param.max_data_file_length=strtoll(argument, NULL, 10);
+ break;
+ case 's': /* silent */
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~(T_SILENT | T_VERY_SILENT);
+ else
+ {
+ if (check_param.testflag & T_SILENT)
+ check_param.testflag|= T_VERY_SILENT;
+ check_param.testflag|= T_SILENT;
+ check_param.testflag&= ~T_WRITE_LOOP;
+ }
+ break;
+ case 'w':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_WAIT_FOREVER;
+ else
+ check_param.testflag|= T_WAIT_FOREVER;
+ break;
+ case 'd': /* description if isam-file */
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_DESCRIPT;
+ else
+ check_param.testflag|= T_DESCRIPT;
+ break;
+ case 'e': /* extend check */
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_EXTEND;
+ else
+ check_param.testflag|= T_EXTEND;
+ break;
+ case 'i':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_INFO;
+ else
+ check_param.testflag|= T_INFO;
+ break;
+ case 'f':
+ if (argument == disabled_my_option)
+ {
+ check_param.tmpfile_createflag= O_RDWR | O_TRUNC | O_EXCL;
+ check_param.testflag&= ~(T_FORCE_CREATE | T_UPDATE_STATE);
+ }
+ else
+ {
+ check_param.tmpfile_createflag= O_RDWR | O_TRUNC;
+ check_param.testflag|= T_FORCE_CREATE | T_UPDATE_STATE;
+ }
+ break;
+ case 'F':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_FAST;
+ else
+ check_param.testflag|= T_FAST;
+ break;
+ case 'k':
+ check_param.keys_in_use= (ulonglong) strtoll(argument, NULL, 10);
+ break;
+ case 'm':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_MEDIUM;
+ else
+ check_param.testflag|= T_MEDIUM; /* Medium check */
+ break;
+ case 'r': /* Repair table */
+ check_param.testflag&= ~T_REP_ANY;
+ if (argument != disabled_my_option)
+ check_param.testflag|= T_REP_BY_SORT;
+ break;
+ case 'p':
+ check_param.testflag&= ~T_REP_ANY;
+ if (argument != disabled_my_option)
+ check_param.testflag|= T_REP_PARALLEL;
+ break;
+ case 'o':
+ check_param.testflag&= ~T_REP_ANY;
+ check_param.force_sort= 0;
+ if (argument != disabled_my_option)
+ {
+ check_param.testflag|= T_REP;
+ my_disable_async_io= 1; /* More safety */
+ }
+ break;
+ case 'n':
+ check_param.testflag&= ~T_REP_ANY;
+ if (argument == disabled_my_option)
+ check_param.force_sort= 0;
+ else
+ {
+ check_param.testflag|= T_REP_BY_SORT;
+ check_param.force_sort= 1;
+ }
+ break;
+ case 'q':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~(T_QUICK | T_FORCE_UNIQUENESS);
+ else
+ check_param.testflag|=
+ (check_param.testflag & T_QUICK) ? T_FORCE_UNIQUENESS : T_QUICK;
+ break;
+ case 'u':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~(T_UNPACK | T_REP_BY_SORT);
+ else
+ check_param.testflag|= T_UNPACK | T_REP_BY_SORT;
+ break;
+ case 'v': /* Verbose */
+ if (argument == disabled_my_option)
+ {
+ check_param.testflag&= ~T_VERBOSE;
+ check_param.verbose=0;
+ }
+ else
+ {
+ check_param.testflag|= T_VERBOSE;
+ check_param.verbose++;
+ }
+ break;
+ case 'R': /* Sort records */
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_SORT_RECORDS;
+ else
+ {
+ check_param.testflag|= T_SORT_RECORDS;
+ check_param.opt_sort_key= (uint) atoi(argument) - 1;
+ if (check_param.opt_sort_key >= MARIA_MAX_KEY)
+ {
+ fprintf(stderr,
+ "The value of the sort key is bigger than max key: %d.\n",
+ MARIA_MAX_KEY);
+ exit(1);
+ }
+ }
+ break;
+ case 'S': /* Sort index */
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_SORT_INDEX;
+ else
+ check_param.testflag|= T_SORT_INDEX;
+ break;
+ case 'T':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_READONLY;
+ else
+ check_param.testflag|= T_READONLY;
+ break;
+ case 'U':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_UPDATE_STATE;
+ else
+ check_param.testflag|= T_UPDATE_STATE;
+ break;
+ case '#':
+ DBUG_SET_INITIAL(argument ? argument : "d:t:o,/tmp/maria_chk.trace");
+ break;
+ case 'V':
+ print_version();
+ exit(0);
+ case OPT_CORRECT_CHECKSUM:
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_CALC_CHECKSUM;
+ else
+ check_param.testflag|= T_CALC_CHECKSUM;
+ break;
+ case OPT_STATS_METHOD:
+ {
+ int method;
+ enum_handler_stats_method method_conv;
+ LINT_INIT(method_conv);
+ maria_stats_method_str= argument;
+ if ((method=find_type(argument, &maria_stats_method_typelib, 2)) <= 0)
+ {
+ fprintf(stderr, "Invalid value of stats_method: %s.\n", argument);
+ exit(1);
+ }
+ switch (method-1) {
+ case 0:
+ method_conv= MI_STATS_METHOD_NULLS_EQUAL;
+ break;
+ case 1:
+ method_conv= MI_STATS_METHOD_NULLS_NOT_EQUAL;
+ break;
+ case 2:
+ method_conv= MI_STATS_METHOD_IGNORE_NULLS;
+ break;
+ default: assert(0); /* Impossible */
+ }
+ check_param.stats_method= method_conv;
+ break;
+ }
+#ifdef DEBUG /* Only useful if debugging */
+ case OPT_START_CHECK_POS:
+ check_param.start_check_pos= strtoull(argument, NULL, 0);
+ break;
+#endif
+ case 'H':
+ my_print_help(my_long_options);
+ exit(0);
+ case '?':
+ usage();
+ exit(0);
+ }
+ return 0;
+}
+
+
+static void get_options(register int *argc,register char ***argv)
+{
+ int ho_error;
+
+ load_defaults("my", load_default_groups, argc, argv);
+ default_argv= *argv;
+ if (isatty(fileno(stdout)))
+ check_param.testflag|=T_WRITE_LOOP;
+
+ if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
+ exit(ho_error);
+
+ /* If using repair, then update checksum if one uses --update-state */
+ if ((check_param.testflag & T_UPDATE_STATE) &&
+ (check_param.testflag & T_REP_ANY))
+ check_param.testflag|= T_CALC_CHECKSUM;
+
+ if (*argc == 0)
+ {
+ usage();
+ exit(-1);
+ }
+
+ if ((check_param.testflag & T_UNPACK) &&
+ (check_param.testflag & (T_QUICK | T_SORT_RECORDS)))
+ {
+ VOID(fprintf(stderr,
+ "%s: --unpack can't be used with --quick or --sort-records\n",
+ my_progname_short));
+ exit(1);
+ }
+ if ((check_param.testflag & T_READONLY) &&
+ (check_param.testflag &
+ (T_REP_ANY | T_STATISTICS | T_AUTO_INC |
+ T_SORT_RECORDS | T_SORT_INDEX | T_FORCE_CREATE)))
+ {
+ VOID(fprintf(stderr,
+ "%s: Can't use --readonly when repairing or sorting\n",
+ my_progname_short));
+ exit(1);
+ }
+
+ if (init_tmpdir(&maria_chk_tmpdir, opt_tmpdir))
+ exit(1);
+
+ check_param.tmpdir=&maria_chk_tmpdir;
+
+ if (set_collation_name)
+ if (!(set_collation= get_charset_by_name(set_collation_name,
+ MYF(MY_WME))))
+ exit(1);
+
+ return;
+} /* get options */
+
+
+ /* Check table */
+
+static int maria_chk(HA_CHECK *param, char *filename)
+{
+ int error,lock_type,recreate;
+ int rep_quick= param->testflag & (T_QUICK | T_FORCE_UNIQUENESS);
+ MARIA_HA *info;
+ File datafile;
+ char llbuff[22],llbuff2[22];
+ my_bool state_updated=0;
+ MARIA_SHARE *share;
+ DBUG_ENTER("maria_chk");
+
+ param->out_flag=error=param->warning_printed=param->error_printed=
+ recreate=0;
+ datafile=0;
+ param->isam_file_name=filename; /* For error messages */
+ if (!(info=maria_open(filename,
+ (param->testflag & (T_DESCRIPT | T_READONLY)) ?
+ O_RDONLY : O_RDWR,
+ HA_OPEN_FOR_REPAIR |
+ ((param->testflag & T_WAIT_FOREVER) ?
+ HA_OPEN_WAIT_IF_LOCKED :
+ (param->testflag & T_DESCRIPT) ?
+ HA_OPEN_IGNORE_IF_LOCKED : HA_OPEN_ABORT_IF_LOCKED))))
+ {
+ /* Avoid twice printing of isam file name */
+ param->error_printed=1;
+ switch (my_errno) {
+ case HA_ERR_CRASHED:
+ _ma_check_print_error(param,"'%s' doesn't have a correct index definition. You need to recreate it before you can do a repair",filename);
+ break;
+ case HA_ERR_NOT_A_TABLE:
+ _ma_check_print_error(param,"'%s' is not a MARIA-table",filename);
+ break;
+ case HA_ERR_CRASHED_ON_USAGE:
+ _ma_check_print_error(param,"'%s' is marked as crashed",filename);
+ break;
+ case HA_ERR_CRASHED_ON_REPAIR:
+ _ma_check_print_error(param,"'%s' is marked as crashed after last repair",filename);
+ break;
+ case HA_ERR_OLD_FILE:
+ _ma_check_print_error(param,"'%s' is a old type of MARIA-table", filename);
+ break;
+ case HA_ERR_END_OF_FILE:
+ _ma_check_print_error(param,"Couldn't read complete header from '%s'", filename);
+ break;
+ case EAGAIN:
+ _ma_check_print_error(param,"'%s' is locked. Use -w to wait until unlocked",filename);
+ break;
+ case ENOENT:
+ _ma_check_print_error(param,"File '%s' doesn't exist",filename);
+ break;
+ case EACCES:
+ _ma_check_print_error(param,"You don't have permission to use '%s'",
+ filename);
+ break;
+ default:
+ _ma_check_print_error(param,"%d when opening MARIA-table '%s'",
+ my_errno,filename);
+ break;
+ }
+ DBUG_RETURN(1);
+ }
+ share=info->s;
+ share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
+ share->tot_locks-= share->r_locks;
+ share->r_locks=0;
+ maria_block_size= share->base.block_size;
+
+ if (share->data_file_type == BLOCK_RECORD ||
+ ((param->testflag & T_UNPACK) &&
+ share->state.header.org_data_file_type == BLOCK_RECORD))
+ {
+ if (param->testflag & T_SORT_RECORDS)
+ {
+ _ma_check_print_error(param,
+ "Record format used by '%s' is is not yet supported with repair/check",
+ filename);
+ param->error_printed= 0;
+ error= 1;
+ goto end2;
+ }
+ /* We can't do parallell repair with BLOCK_RECORD yet */
+ if (param->testflag & (T_REP_BY_SORT | T_REP_PARALLEL))
+ {
+ param->testflag&= ~(T_REP_BY_SORT | T_REP_PARALLEL);
+ param->testflag|= T_REP;
+ }
+ }
+
+ /*
+ Skip the checking of the file if:
+ We are using --fast and the table is closed properly
+ We are using --check-only-changed-tables and the table hasn't changed
+ */
+ if (param->testflag & (T_FAST | T_CHECK_ONLY_CHANGED))
+ {
+ my_bool need_to_check= (maria_is_crashed(info) ||
+ share->state.open_count != 0);
+
+ if ((param->testflag & (T_REP_ANY | T_SORT_RECORDS)) &&
+ ((share->state.changed & (STATE_CHANGED | STATE_CRASHED |
+ STATE_CRASHED_ON_REPAIR) ||
+ !(param->testflag & T_CHECK_ONLY_CHANGED))))
+ need_to_check=1;
+
+ if (info->s->base.keys && info->state->records)
+ {
+ if ((param->testflag & T_STATISTICS) &&
+ (share->state.changed & STATE_NOT_ANALYZED))
+ need_to_check=1;
+ if ((param->testflag & T_SORT_INDEX) &&
+ (share->state.changed & STATE_NOT_SORTED_PAGES))
+ need_to_check=1;
+ if ((param->testflag & T_REP_BY_SORT) &&
+ (share->state.changed & STATE_NOT_OPTIMIZED_KEYS))
+ need_to_check=1;
+ }
+ if ((param->testflag & T_CHECK_ONLY_CHANGED) &&
+ (share->state.changed & (STATE_CHANGED | STATE_CRASHED |
+ STATE_CRASHED_ON_REPAIR)))
+ need_to_check=1;
+ if (!need_to_check)
+ {
+ if (!(param->testflag & T_SILENT) || param->testflag & T_INFO)
+ printf("MARIA file: %s is already checked\n",filename);
+ if (maria_close(info))
+ {
+ _ma_check_print_error(param,"%d when closing MARIA-table '%s'",
+ my_errno,filename);
+ DBUG_RETURN(1);
+ }
+ DBUG_RETURN(0);
+ }
+ }
+ if ((param->testflag & (T_REP_ANY | T_STATISTICS |
+ T_SORT_RECORDS | T_SORT_INDEX)) &&
+ (((param->testflag & T_UNPACK) &&
+ share->data_file_type == COMPRESSED_RECORD) ||
+ mi_uint2korr(share->state.header.state_info_length) !=
+ MARIA_STATE_INFO_SIZE ||
+ mi_uint2korr(share->state.header.base_info_length) !=
+ MARIA_BASE_INFO_SIZE ||
+ maria_is_any_intersect_keys_active(param->keys_in_use, share->base.keys,
+ ~share->state.key_map) ||
+ maria_test_if_almost_full(info) ||
+ info->s->state.header.file_version[3] != maria_file_magic[3] ||
+ (set_collation &&
+ set_collation->number != share->state.header.language)))
+ {
+ if (set_collation)
+ param->language= set_collation->number;
+ if (maria_recreate_table(param, &info,filename))
+ {
+ VOID(fprintf(stderr,
+ "MARIA-table '%s' is not fixed because of errors\n",
+ filename));
+ return(-1);
+ }
+ recreate=1;
+ if (!(param->testflag & T_REP_ANY))
+ {
+ param->testflag|=T_REP_BY_SORT; /* if only STATISTICS */
+ if (!(param->testflag & T_SILENT))
+ printf("- '%s' has old table-format. Recreating index\n",filename);
+ rep_quick|=T_QUICK;
+ }
+ share=info->s;
+ share->tot_locks-= share->r_locks;
+ share->r_locks=0;
+ }
+
+ if (param->testflag & T_DESCRIPT)
+ {
+ param->total_files++;
+ param->total_records+=info->state->records;
+ param->total_deleted+=info->state->del;
+ descript(param, info, filename);
+ maria_close(info); /* Should always succeed */
+ return(0);
+ }
+
+ if (!stopwords_inited++)
+ ft_init_stopwords();
+
+ if (!(param->testflag & T_READONLY))
+ lock_type = F_WRLCK; /* table is changed */
+ else
+ lock_type= F_RDLCK;
+ if (info->lock_type == F_RDLCK)
+ info->lock_type=F_UNLCK; /* Read only table */
+ if (_ma_readinfo(info,lock_type,0))
+ {
+ _ma_check_print_error(param,"Can't lock indexfile of '%s', error: %d",
+ filename,my_errno);
+ param->error_printed=0;
+ error= 1;
+ goto end2;
+ }
+ /*
+ _ma_readinfo() has locked the table.
+ We mark the table as locked (without doing file locks) to be able to
+ use functions that only works on locked tables (like row caching).
+ */
+ maria_lock_database(info, F_EXTRA_LCK);
+ datafile= info->dfile.file;
+ if (init_pagecache(maria_pagecache, param->use_buffers, 0, 0,
+ maria_block_size) == 0)
+ {
+ _ma_check_print_error(param, "Can't initialize page cache with %lu memory",
+ (ulong) param->use_buffers);
+ error= 1;
+ goto end2;
+ }
+
+ if (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX))
+ {
+ if (param->testflag & T_REP_ANY)
+ {
+ ulonglong tmp=share->state.key_map;
+ maria_copy_keys_active(share->state.key_map, share->base.keys,
+ param->keys_in_use);
+ if (tmp != share->state.key_map)
+ info->update|=HA_STATE_CHANGED;
+ }
+ if (rep_quick &&
+ maria_chk_del(param, info, param->testflag & ~T_VERBOSE))
+ {
+ if (param->testflag & T_FORCE_CREATE)
+ {
+ rep_quick=0;
+ _ma_check_print_info(param,"Creating new data file\n");
+ }
+ else
+ {
+ error=1;
+ _ma_check_print_error(param,
+ "Quick-recover aborted; Run recovery without switch 'q'");
+ }
+ }
+ if (!error)
+ {
+ /*
+ Tell the server's Recovery to ignore old REDOs on this table; we don't
+ know what the log's end LSN is now, so we just let the server know
+ that it will have to find and store it.
+ This is the only case where create_rename_lsn can be a horizon and not
+ a LSN.
+ */
+ if (share->base.born_transactional)
+ share->state.create_rename_lsn= share->state.is_of_horizon=
+ LSN_REPAIRED_BY_MARIA_CHK;
+ if ((param->testflag & (T_REP_BY_SORT | T_REP_PARALLEL)) &&
+ (maria_is_any_key_active(share->state.key_map) ||
+ (rep_quick && !param->keys_in_use && !recreate)) &&
+ maria_test_if_sort_rep(info, info->state->records,
+ info->s->state.key_map,
+ param->force_sort))
+ {
+ if (param->testflag & T_REP_BY_SORT)
+ error=maria_repair_by_sort(param,info,filename,rep_quick);
+ else
+ error=maria_repair_parallel(param,info,filename,rep_quick);
+ state_updated=1;
+ }
+ else if (param->testflag & T_REP_ANY)
+ error=maria_repair(param, info,filename,rep_quick);
+ }
+ if (!error && param->testflag & T_SORT_RECORDS)
+ {
+ /*
+ The data file is nowadays reopened in the repair code so we should
+ soon remove the following reopen-code
+ */
+#ifndef TO_BE_REMOVED
+ if (param->out_flag & O_NEW_DATA)
+ { /* Change temp file to org file */
+ VOID(my_close(info->dfile.file, MYF(MY_WME))); /* Close new file */
+ error|=maria_change_to_newfile(filename,MARIA_NAME_DEXT,DATA_TMP_EXT,
+ MYF(0));
+ if (_ma_open_datafile(info,info->s, -1))
+ error=1;
+ param->out_flag&= ~O_NEW_DATA; /* We are using new datafile */
+ param->read_cache.file= info->dfile.file;
+ }
+#endif
+ if (! error)
+ {
+ uint key;
+ /*
+ We can't update the index in maria_sort_records if we have a
+ prefix compressed or fulltext index
+ */
+ my_bool update_index=1;
+ for (key=0 ; key < share->base.keys; key++)
+ if (share->keyinfo[key].flag & (HA_BINARY_PACK_KEY|HA_FULLTEXT))
+ update_index=0;
+
+ error=maria_sort_records(param,info,filename,param->opt_sort_key,
+ /* what is the following parameter for ? */
+ (my_bool) !(param->testflag & T_REP),
+ update_index);
+ datafile= info->dfile.file; /* This is now locked */
+ if (!error && !update_index)
+ {
+ if (param->verbose)
+ puts("Table had a compressed index; We must now recreate the index");
+ error=maria_repair_by_sort(param,info,filename,1);
+ }
+ }
+ }
+ if (!error && param->testflag & T_SORT_INDEX)
+ error=maria_sort_index(param,info,filename);
+ if (!error)
+ share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED |
+ STATE_CRASHED_ON_REPAIR);
+ else
+ maria_mark_crashed(info);
+ }
+ else if ((param->testflag & T_CHECK) || !(param->testflag & T_AUTO_INC))
+ {
+ if (!(param->testflag & T_SILENT) || param->testflag & T_INFO)
+ printf("Checking MARIA file: %s\n",filename);
+ if (!(param->testflag & T_SILENT))
+ printf("Data records: %7s Deleted blocks: %7s\n",
+ llstr(info->state->records,llbuff),
+ llstr(info->state->del,llbuff2));
+ error =maria_chk_status(param,info);
+ maria_intersect_keys_active(share->state.key_map, param->keys_in_use);
+ error =maria_chk_size(param,info);
+ if (!error || !(param->testflag & (T_FAST | T_FORCE_CREATE)))
+ error|=maria_chk_del(param, info,param->testflag);
+ if ((!error || (!(param->testflag & (T_FAST | T_FORCE_CREATE)) &&
+ !param->start_check_pos)))
+ {
+ error|=maria_chk_key(param, info);
+ if (!error && (param->testflag & (T_STATISTICS | T_AUTO_INC)))
+ error=maria_update_state_info(param, info,
+ ((param->testflag & T_STATISTICS) ?
+ UPDATE_STAT : 0) |
+ ((param->testflag & T_AUTO_INC) ?
+ UPDATE_AUTO_INC : 0));
+ }
+ if ((!rep_quick && !error) ||
+ !(param->testflag & (T_FAST | T_FORCE_CREATE)))
+ {
+ VOID(init_io_cache(&param->read_cache,datafile,
+ (uint) param->read_buffer_length,
+ READ_CACHE,
+ (param->start_check_pos ?
+ param->start_check_pos :
+ share->pack.header_length),
+ 1,
+ MYF(MY_WME)));
+ maria_lock_memory(param);
+ if ((info->s->data_file_type != STATIC_RECORD) ||
+ (param->testflag & (T_EXTEND | T_MEDIUM)))
+ error|=maria_chk_data_link(param, info, param->testflag & T_EXTEND);
+ error|= _ma_flush_table_files_after_repair(param, info);
+ VOID(end_io_cache(&param->read_cache));
+ }
+ if (!error)
+ {
+ if ((share->state.changed & STATE_CHANGED) &&
+ (param->testflag & T_UPDATE_STATE))
+ info->update|=HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+ share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED |
+ STATE_CRASHED_ON_REPAIR);
+ }
+ else if (!maria_is_crashed(info) &&
+ (param->testflag & T_UPDATE_STATE))
+ { /* Mark crashed */
+ maria_mark_crashed(info);
+ info->update|=HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+ }
+ }
+
+ if ((param->testflag & T_AUTO_INC) ||
+ ((param->testflag & T_REP_ANY) && info->s->base.auto_key))
+ _ma_update_auto_increment_key(param, info,
+ (my_bool) !test(param->testflag & T_AUTO_INC));
+
+ if (info->update & HA_STATE_CHANGED && ! (param->testflag & T_READONLY))
+ error|=maria_update_state_info(param, info,
+ UPDATE_OPEN_COUNT |
+ (((param->testflag & T_REP_ANY) ?
+ UPDATE_TIME : 0) |
+ (state_updated ? UPDATE_STAT : 0) |
+ ((param->testflag & T_SORT_RECORDS) ?
+ UPDATE_SORT : 0)));
+ info->update&= ~HA_STATE_CHANGED;
+ maria_lock_database(info, F_UNLCK);
+
+end2:
+ end_pagecache(maria_pagecache, 1);
+ if (maria_close(info))
+ {
+ _ma_check_print_error(param,"%d when closing MARIA-table '%s'",
+ my_errno,filename);
+ DBUG_RETURN(1);
+ }
+ if (error == 0)
+ {
+ if (param->out_flag & O_NEW_DATA)
+ error|=maria_change_to_newfile(filename,MARIA_NAME_DEXT,DATA_TMP_EXT,
+ ((param->testflag & T_BACKUP_DATA) ?
+ MYF(MY_REDEL_MAKE_BACKUP) : MYF(0)));
+ if (param->out_flag & O_NEW_INDEX)
+ error|=maria_change_to_newfile(filename,MARIA_NAME_IEXT,INDEX_TMP_EXT,
+ MYF(0));
+ }
+ VOID(fflush(stdout)); VOID(fflush(stderr));
+ if (param->error_printed)
+ {
+ if (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX))
+ {
+ VOID(fprintf(stderr,
+ "MARIA-table '%s' is not fixed because of errors\n",
+ filename));
+ if (param->testflag & T_REP_ANY)
+ VOID(fprintf(stderr,
+ "Try fixing it by using the --safe-recover (-o), the --force (-f) option or by not using the --quick (-q) flag\n"));
+ }
+ else if (!(param->error_printed & 2) &&
+ !(param->testflag & T_FORCE_CREATE))
+ VOID(fprintf(stderr,
+ "MARIA-table '%s' is corrupted\nFix it using switch \"-r\" or \"-o\"\n",
+ filename));
+ }
+ else if (param->warning_printed &&
+ ! (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX |
+ T_FORCE_CREATE)))
+ VOID(fprintf(stderr, "MARIA-table '%s' is usable but should be fixed\n",
+ filename));
+ VOID(fflush(stderr));
+ DBUG_RETURN(error);
+} /* maria_chk */
+
+
+/* Write info about table */
+
+static void descript(HA_CHECK *param, register MARIA_HA *info, char *name)
+{
+ uint key,keyseg_nr,field;
+ reg3 MARIA_KEYDEF *keyinfo;
+ reg2 HA_KEYSEG *keyseg;
+ reg4 const char *text;
+ char buff[160],length[10],*pos,*end;
+ enum en_fieldtype type;
+ MARIA_SHARE *share=info->s;
+ char llbuff[22],llbuff2[22];
+ DBUG_ENTER("describe");
+
+ if (param->testflag & T_VERY_SILENT)
+ {
+ longlong checksum= info->state->checksum;
+ if (!(share->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD)))
+ checksum= 0;
+ printf("%s %s %s\n", name, llstr(info->state->records,llbuff),
+ llstr(checksum, llbuff2));
+ DBUG_VOID_RETURN;
+ }
+
+ printf("\nMARIA file: %s\n",name);
+ printf("Record format: %s\n", record_formats[share->data_file_type]);
+ printf("Character set: %s (%d)\n",
+ get_charset_name(share->state.header.language),
+ share->state.header.language);
+
+ if (param->testflag & T_VERBOSE)
+ {
+ printf("File-version: %d\n",
+ (int) share->state.header.file_version[3]);
+ if (share->state.create_time)
+ {
+ get_date(buff,1,share->state.create_time);
+ printf("Creation time: %s\n",buff);
+ }
+ if (share->state.check_time)
+ {
+ get_date(buff,1,share->state.check_time);
+ printf("Recover time: %s\n",buff);
+ }
+ pos=buff;
+ if (share->state.changed & STATE_CRASHED)
+ strmov(buff,"crashed");
+ else
+ {
+ if (share->state.open_count)
+ pos=strmov(pos,"open,");
+ if (share->state.changed & STATE_CHANGED)
+ pos=strmov(pos,"changed,");
+ else
+ pos=strmov(pos,"checked,");
+ if (!(share->state.changed & STATE_NOT_ANALYZED))
+ pos=strmov(pos,"analyzed,");
+ if (!(share->state.changed & STATE_NOT_OPTIMIZED_KEYS))
+ pos=strmov(pos,"optimized keys,");
+ if (!(share->state.changed & STATE_NOT_SORTED_PAGES))
+ pos=strmov(pos,"sorted index pages,");
+ pos[-1]=0; /* Remove extra ',' */
+ }
+ printf("Status: %s\n",buff);
+ if (share->base.auto_key)
+ {
+ printf("Auto increment key: %16d Last value: %18s\n",
+ share->base.auto_key,
+ llstr(share->state.auto_increment,llbuff));
+ }
+ if (share->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))
+ printf("Checksum: %26s\n",llstr(info->state->checksum,llbuff));
+;
+ if (share->options & HA_OPTION_DELAY_KEY_WRITE)
+ printf("Keys are only flushed at close\n");
+
+ }
+ printf("Data records: %16s Deleted blocks: %18s\n",
+ llstr(info->state->records,llbuff),llstr(info->state->del,llbuff2));
+ if (param->testflag & T_SILENT)
+ DBUG_VOID_RETURN; /* This is enough */
+
+ if (param->testflag & T_VERBOSE)
+ {
+#ifdef USE_RELOC
+ printf("Init-relocation: %16s\n",llstr(share->base.reloc,llbuff));
+#endif
+ printf("Datafile parts: %16s Deleted data: %18s\n",
+ llstr(share->state.split,llbuff),
+ llstr(info->state->empty,llbuff2));
+ printf("Datafile pointer (bytes): %11d Keyfile pointer (bytes): %13d\n",
+ share->rec_reflength,share->base.key_reflength);
+ printf("Datafile length: %16s Keyfile length: %18s\n",
+ llstr(info->state->data_file_length,llbuff),
+ llstr(info->state->key_file_length,llbuff2));
+
+ if (info->s->base.reloc == 1L && info->s->base.records == 1L)
+ puts("This is a one-record table");
+ else
+ {
+ if (share->base.max_data_file_length != HA_OFFSET_ERROR ||
+ share->base.max_key_file_length != HA_OFFSET_ERROR)
+ printf("Max datafile length: %16s Max keyfile length: %18s\n",
+ llstr(share->base.max_data_file_length-1,llbuff),
+ llstr(share->base.max_key_file_length-1,llbuff2));
+ }
+ }
+ printf("Block_size: %16d\n",(int) share->block_size);
+ printf("Recordlength: %16d\n",(int) share->base.pack_reclength);
+ if (! maria_is_all_keys_active(share->state.key_map, share->base.keys))
+ {
+ longlong2str(share->state.key_map,buff,2);
+ printf("Using only keys '%s' of %d possibly keys\n",
+ buff, share->base.keys);
+ }
+ puts("\ntable description:");
+ printf("Key Start Len Index Type");
+ if (param->testflag & T_VERBOSE)
+ printf(" Rec/key Root Blocksize");
+ VOID(putchar('\n'));
+
+ for (key=keyseg_nr=0, keyinfo= &share->keyinfo[0] ;
+ key < share->base.keys;
+ key++,keyinfo++)
+ {
+ keyseg=keyinfo->seg;
+ if (keyinfo->flag & HA_NOSAME) text="unique ";
+ else if (keyinfo->flag & HA_FULLTEXT) text="fulltext ";
+ else text="multip.";
+
+ pos=buff;
+ if (keyseg->flag & HA_REVERSE_SORT)
+ *pos++ = '-';
+ pos=strmov(pos,type_names[keyseg->type]);
+ *pos++ = ' ';
+ *pos=0;
+ if (keyinfo->flag & HA_PACK_KEY)
+ pos=strmov(pos,prefix_packed_txt);
+ if (keyinfo->flag & HA_BINARY_PACK_KEY)
+ pos=strmov(pos,bin_packed_txt);
+ if (keyseg->flag & HA_SPACE_PACK)
+ pos=strmov(pos,diff_txt);
+ if (keyseg->flag & HA_BLOB_PART)
+ pos=strmov(pos,blob_txt);
+ if (keyseg->flag & HA_NULL_PART)
+ pos=strmov(pos,null_txt);
+ *pos=0;
+
+ printf("%-4d%-6ld%-3d %-8s%-21s",
+ key+1,(long) keyseg->start+1,keyseg->length,text,buff);
+ if (share->state.key_root[key] != HA_OFFSET_ERROR)
+ llstr(share->state.key_root[key],buff);
+ else
+ buff[0]=0;
+ if (param->testflag & T_VERBOSE)
+ printf("%11lu %12s %10d",
+ share->state.rec_per_key_part[keyseg_nr++],
+ buff,keyinfo->block_length);
+ VOID(putchar('\n'));
+ while ((++keyseg)->type != HA_KEYTYPE_END)
+ {
+ pos=buff;
+ if (keyseg->flag & HA_REVERSE_SORT)
+ *pos++ = '-';
+ pos=strmov(pos,type_names[keyseg->type]);
+ *pos++= ' ';
+ if (keyseg->flag & HA_SPACE_PACK)
+ pos=strmov(pos,diff_txt);
+ if (keyseg->flag & HA_BLOB_PART)
+ pos=strmov(pos,blob_txt);
+ if (keyseg->flag & HA_NULL_PART)
+ pos=strmov(pos,null_txt);
+ *pos=0;
+ printf(" %-6ld%-3d %-21s",
+ (long) keyseg->start+1,keyseg->length,buff);
+ if (param->testflag & T_VERBOSE)
+ printf("%11lu", share->state.rec_per_key_part[keyseg_nr++]);
+ VOID(putchar('\n'));
+ }
+ keyseg++;
+ }
+ if (share->state.header.uniques)
+ {
+ MARIA_UNIQUEDEF *uniqueinfo;
+ puts("\nUnique Key Start Len Nullpos Nullbit Type");
+ for (key=0,uniqueinfo= &share->uniqueinfo[0] ;
+ key < share->state.header.uniques; key++, uniqueinfo++)
+ {
+ my_bool new_row=0;
+ char null_bit[8],null_pos[8];
+ printf("%-8d%-5d",key+1,uniqueinfo->key+1);
+ for (keyseg=uniqueinfo->seg ; keyseg->type != HA_KEYTYPE_END ; keyseg++)
+ {
+ if (new_row)
+ fputs(" ",stdout);
+ null_bit[0]=null_pos[0]=0;
+ if (keyseg->null_bit)
+ {
+ sprintf(null_bit,"%d",keyseg->null_bit);
+ sprintf(null_pos,"%ld",(long) keyseg->null_pos+1);
+ }
+ printf("%-7ld%-5d%-9s%-10s%-30s\n",
+ (long) keyseg->start+1,keyseg->length,
+ null_pos,null_bit,
+ type_names[keyseg->type]);
+ new_row=1;
+ }
+ }
+ }
+ if (param->verbose > 1)
+ {
+ char null_bit[8],null_pos[8];
+ printf("\nField Start Length Nullpos Nullbit Type");
+ if (share->options & HA_OPTION_COMPRESS_RECORD)
+ printf(" Huff tree Bits");
+ VOID(putchar('\n'));
+
+ for (field=0 ; field < share->base.fields ; field++)
+ {
+ if (share->options & HA_OPTION_COMPRESS_RECORD)
+ type=share->columndef[field].base_type;
+ else
+ type=(enum en_fieldtype) share->columndef[field].type;
+ end=strmov(buff,field_pack[type]);
+ if (share->options & HA_OPTION_COMPRESS_RECORD)
+ {
+ if (share->columndef[field].pack_type & PACK_TYPE_SELECTED)
+ end=strmov(end,", not_always");
+ if (share->columndef[field].pack_type & PACK_TYPE_SPACE_FIELDS)
+ end=strmov(end,", no empty");
+ if (share->columndef[field].pack_type & PACK_TYPE_ZERO_FILL)
+ {
+ sprintf(end,", zerofill(%d)",share->columndef[field].space_length_bits);
+ end=strend(end);
+ }
+ }
+ if (buff[0] == ',')
+ strmov(buff,buff+2);
+ int10_to_str((long) share->columndef[field].length,length,10);
+ null_bit[0]=null_pos[0]=0;
+ if (share->columndef[field].null_bit)
+ {
+ sprintf(null_bit,"%d",share->columndef[field].null_bit);
+ sprintf(null_pos,"%d",share->columndef[field].null_pos+1);
+ }
+ printf("%-6d%-6u%-7s%-8s%-8s%-35s",field+1,
+ (uint) share->columndef[field].offset+1,
+ length, null_pos, null_bit, buff);
+ if (share->options & HA_OPTION_COMPRESS_RECORD)
+ {
+ if (share->columndef[field].huff_tree)
+ printf("%3d %2d",
+ (uint) (share->columndef[field].huff_tree-share->decode_trees)+1,
+ share->columndef[field].huff_tree->quick_table_bits);
+ }
+ VOID(putchar('\n'));
+ }
+ }
+ DBUG_VOID_RETURN;
+} /* describe */
+
+
+ /* Sort records according to one key */
+
+static int maria_sort_records(HA_CHECK *param,
+ register MARIA_HA *info, char *name,
+ uint sort_key,
+ my_bool write_info,
+ my_bool update_index)
+{
+ int got_error;
+ uint key;
+ MARIA_KEYDEF *keyinfo;
+ File new_file;
+ uchar *temp_buff;
+ ha_rows old_record_count;
+ MARIA_SHARE *share=info->s;
+ char llbuff[22],llbuff2[22];
+ MARIA_SORT_INFO sort_info;
+ MARIA_SORT_PARAM sort_param;
+ DBUG_ENTER("sort_records");
+
+ bzero((char*)&sort_info,sizeof(sort_info));
+ bzero((char*)&sort_param,sizeof(sort_param));
+ sort_param.sort_info=&sort_info;
+ sort_info.param=param;
+ keyinfo= &share->keyinfo[sort_key];
+ got_error=1;
+ temp_buff=0;
+ new_file= -1;
+
+ if (! maria_is_key_active(share->state.key_map, sort_key))
+ {
+ _ma_check_print_warning(param,
+ "Can't sort table '%s' on key %d; No such key",
+ name,sort_key+1);
+ param->error_printed=0;
+ DBUG_RETURN(0); /* Nothing to do */
+ }
+ if (keyinfo->flag & HA_FULLTEXT)
+ {
+ _ma_check_print_warning(param,"Can't sort table '%s' on FULLTEXT key %d",
+ name,sort_key+1);
+ param->error_printed=0;
+ DBUG_RETURN(0); /* Nothing to do */
+ }
+ if (share->data_file_type == COMPRESSED_RECORD)
+ {
+ _ma_check_print_warning(param,"Can't sort read-only table '%s'", name);
+ param->error_printed=0;
+ DBUG_RETURN(0); /* Nothing to do */
+ }
+ if (!(param->testflag & T_SILENT))
+ {
+ printf("- Sorting records for MARIA-table '%s'\n",name);
+ if (write_info)
+ printf("Data records: %9s Deleted: %9s\n",
+ llstr(info->state->records,llbuff),
+ llstr(info->state->del,llbuff2));
+ }
+ if (share->state.key_root[sort_key] == HA_OFFSET_ERROR)
+ DBUG_RETURN(0); /* Nothing to do */
+
+ if (init_io_cache(&info->rec_cache,-1,(uint) param->write_buffer_length,
+ WRITE_CACHE,share->pack.header_length,1,
+ MYF(MY_WME | MY_WAIT_IF_FULL)))
+ goto err;
+ info->opt_flag|=WRITE_CACHE_USED;
+
+ if (!(temp_buff=(uchar*) my_alloca((uint) keyinfo->block_length)))
+ {
+ _ma_check_print_error(param,"Not enough memory for key block");
+ goto err;
+ }
+ if (!(sort_param.record=(uchar*) my_malloc((uint) share->base.pack_reclength,
+ MYF(0))))
+ {
+ _ma_check_print_error(param,"Not enough memory for record");
+ goto err;
+ }
+ fn_format(param->temp_filename,name,"", MARIA_NAME_DEXT,2+4+32);
+ new_file= my_create(fn_format(param->temp_filename,
+ param->temp_filename,"",
+ DATA_TMP_EXT,
+ MY_REPLACE_EXT | MY_UNPACK_FILENAME),
+ 0, param->tmpfile_createflag,
+ MYF(0));
+ if (new_file < 0)
+ {
+ _ma_check_print_error(param,"Can't create new tempfile: '%s'",
+ param->temp_filename);
+ goto err;
+ }
+ if (share->pack.header_length)
+ if (maria_filecopy(param, new_file, info->dfile.file, 0L,
+ share->pack.header_length,
+ "datafile-header"))
+ goto err;
+ info->rec_cache.file=new_file; /* Use this file for cacheing*/
+
+ maria_lock_memory(param);
+ for (key=0 ; key < share->base.keys ; key++)
+ share->keyinfo[key].flag|= HA_SORT_ALLOWS_SAME;
+
+ if (my_pread(share->kfile.file, temp_buff,
+ (uint) keyinfo->block_length,
+ share->state.key_root[sort_key],
+ MYF(MY_NABP+MY_WME)))
+ {
+ _ma_check_print_error(param,"Can't read indexpage from filepos: %s",
+ (ulong) share->state.key_root[sort_key]);
+ goto err;
+ }
+
+ /* Setup param for _ma_sort_write_record */
+ sort_info.info=info;
+ sort_info.new_data_file_type=share->data_file_type;
+ sort_param.fix_datafile=1;
+ sort_param.master=1;
+ sort_param.filepos=share->pack.header_length;
+ old_record_count=info->state->records;
+ info->state->records=0;
+ if (sort_info.new_data_file_type != COMPRESSED_RECORD)
+ info->state->checksum=0;
+
+ if (sort_record_index(&sort_param,info,keyinfo,
+ share->state.key_root[sort_key],
+ temp_buff, sort_key,new_file,update_index) ||
+ maria_write_data_suffix(&sort_info,1) ||
+ flush_io_cache(&info->rec_cache))
+ goto err;
+
+ if (info->state->records != old_record_count)
+ {
+ _ma_check_print_error(param,"found %s of %s records",
+ llstr(info->state->records,llbuff),
+ llstr(old_record_count,llbuff2));
+ goto err;
+ }
+
+ VOID(my_close(info->dfile.file, MYF(MY_WME)));
+ param->out_flag|=O_NEW_DATA; /* Data in new file */
+ info->dfile.file= new_file; /* Use new datafile */
+ info->state->del=0;
+ info->state->empty=0;
+ share->state.dellink= HA_OFFSET_ERROR;
+ info->state->data_file_length=sort_param.filepos;
+ share->state.split=info->state->records; /* Only hole records */
+ share->state.version=(ulong) time((time_t*) 0);
+
+ info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+
+ if (param->testflag & T_WRITE_LOOP)
+ {
+ VOID(fputs(" \r",stdout)); VOID(fflush(stdout));
+ }
+ got_error=0;
+
+err:
+ if (got_error && new_file >= 0)
+ {
+ VOID(end_io_cache(&info->rec_cache));
+ (void) my_close(new_file,MYF(MY_WME));
+ (void) my_delete(param->temp_filename, MYF(MY_WME));
+ }
+ if (temp_buff)
+ {
+ my_afree((uchar*) temp_buff);
+ }
+ my_free(sort_param.record,MYF(MY_ALLOW_ZERO_PTR));
+ info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+ VOID(end_io_cache(&info->rec_cache));
+ my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR));
+ sort_info.buff=0;
+ share->state.sortkey=sort_key;
+ DBUG_RETURN(_ma_flush_table_files_after_repair(param, info) | got_error);
+} /* sort_records */
+
+
+/* Sort records recursive using one index */
+
+static int sort_record_index(MARIA_SORT_PARAM *sort_param,MARIA_HA *info,
+ MARIA_KEYDEF *keyinfo,
+ my_off_t page, uchar *buff, uint sort_key,
+ File new_file,my_bool update_index)
+{
+ uint nod_flag,used_length,key_length;
+ uchar *temp_buff,*keypos,*endpos;
+ my_off_t next_page,rec_pos;
+ uchar lastkey[HA_MAX_KEY_BUFF];
+ char llbuff[22];
+ MARIA_SORT_INFO *sort_info= sort_param->sort_info;
+ HA_CHECK *param=sort_info->param;
+ DBUG_ENTER("sort_record_index");
+
+ nod_flag=_ma_test_if_nod(buff);
+ temp_buff=0;
+
+ if (nod_flag)
+ {
+ if (!(temp_buff= (uchar*) my_alloca((uint) keyinfo->block_length)))
+ {
+ _ma_check_print_error(param,"Not Enough memory");
+ DBUG_RETURN(-1);
+ }
+ }
+ used_length= maria_data_on_page(buff);
+ keypos=buff+2+nod_flag;
+ endpos=buff+used_length;
+ for ( ;; )
+ {
+ _sanity(__FILE__,__LINE__);
+ if (nod_flag)
+ {
+ next_page= _ma_kpos(nod_flag, keypos);
+ if (my_pread(info->s->kfile.file, (uchar*)temp_buff,
+ (uint) keyinfo->block_length, next_page,
+ MYF(MY_NABP+MY_WME)))
+ {
+ _ma_check_print_error(param,"Can't read keys from filepos: %s",
+ llstr(next_page,llbuff));
+ goto err;
+ }
+ if (sort_record_index(sort_param, info,keyinfo,next_page,temp_buff,
+ sort_key,
+ new_file, update_index))
+ goto err;
+ }
+ _sanity(__FILE__,__LINE__);
+ if (keypos >= endpos ||
+ (key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&keypos,lastkey))
+ == 0)
+ break;
+ rec_pos= _ma_dpos(info,0,lastkey+key_length);
+
+ if ((*info->s->read_record)(info,sort_param->record,rec_pos))
+ {
+ _ma_check_print_error(param,"%d when reading datafile",my_errno);
+ goto err;
+ }
+ if (rec_pos != sort_param->filepos && update_index)
+ {
+ _ma_dpointer(info,keypos-nod_flag-info->s->rec_reflength,
+ sort_param->filepos);
+ if (maria_movepoint(info,sort_param->record,rec_pos,sort_param->filepos,
+ sort_key))
+ {
+ _ma_check_print_error(param,"%d when updating key-pointers",my_errno);
+ goto err;
+ }
+ }
+ if (_ma_sort_write_record(sort_param))
+ goto err;
+ }
+ /* Clear end of block to get better compression if the table is backuped */
+ bzero((uchar*) buff+used_length,keyinfo->block_length-used_length);
+ if (my_pwrite(info->s->kfile.file, (uchar*)buff, (uint)keyinfo->block_length,
+ page,param->myf_rw))
+ {
+ _ma_check_print_error(param,"%d when updating keyblock",my_errno);
+ goto err;
+ }
+ if (temp_buff)
+ my_afree((uchar*) temp_buff);
+ DBUG_RETURN(0);
+err:
+ if (temp_buff)
+ my_afree((uchar*) temp_buff);
+ DBUG_RETURN(1);
+} /* sort_record_index */
+
+
+
+/*
+ Check if maria_chk was killed by a signal
+ This is overloaded by other programs that want to be able to abort
+ sorting
+*/
+
+static int not_killed= 0;
+
+volatile int *_ma_killed_ptr(HA_CHECK *param __attribute__((unused)))
+{
+ return &not_killed; /* always NULL */
+}
+
+ /* print warnings and errors */
+ /* VARARGS */
+
+void _ma_check_print_info(HA_CHECK *param __attribute__((unused)),
+ const char *fmt,...)
+{
+ va_list args;
+ DBUG_ENTER("_ma_check_print_info");
+ DBUG_PRINT("enter", ("format: %s", fmt));
+
+ va_start(args,fmt);
+ VOID(vfprintf(stdout, fmt, args));
+ VOID(fputc('\n',stdout));
+ va_end(args);
+ DBUG_VOID_RETURN;
+}
+
+/* VARARGS */
+
+void _ma_check_print_warning(HA_CHECK *param, const char *fmt,...)
+{
+ va_list args;
+ DBUG_ENTER("_ma_check_print_warning");
+ DBUG_PRINT("enter", ("format: %s", fmt));
+
+ fflush(stdout);
+ if (!param->warning_printed && !param->error_printed)
+ {
+ if (param->testflag & T_SILENT)
+ fprintf(stderr,"%s: MARIA file %s\n",my_progname_short,
+ param->isam_file_name);
+ param->out_flag|= O_DATA_LOST;
+ }
+ param->warning_printed=1;
+ va_start(args,fmt);
+ fprintf(stderr,"%s: warning: ",my_progname_short);
+ VOID(vfprintf(stderr, fmt, args));
+ VOID(fputc('\n',stderr));
+ fflush(stderr);
+ va_end(args);
+ DBUG_VOID_RETURN;
+}
+
+/* VARARGS */
+
+void _ma_check_print_error(HA_CHECK *param, const char *fmt,...)
+{
+ va_list args;
+ DBUG_ENTER("_ma_check_print_error");
+ DBUG_PRINT("enter", ("format: %s", fmt));
+
+ fflush(stdout);
+ if (!param->warning_printed && !param->error_printed)
+ {
+ if (param->testflag & T_SILENT)
+ fprintf(stderr,"%s: MARIA file %s\n",my_progname_short,param->isam_file_name);
+ param->out_flag|= O_DATA_LOST;
+ }
+ param->error_printed|=1;
+ va_start(args,fmt);
+ fprintf(stderr,"%s: error: ",my_progname_short);
+ VOID(vfprintf(stderr, fmt, args));
+ VOID(fputc('\n',stderr));
+ fflush(stderr);
+ va_end(args);
+ DBUG_VOID_RETURN;
+}
diff --git a/storage/maria/maria_def.h b/storage/maria/maria_def.h
new file mode 100644
index 00000000000..09852f4dc86
--- /dev/null
+++ b/storage/maria/maria_def.h
@@ -0,0 +1,958 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* This file is included by all internal maria files */
+
+#include "maria.h" /* Structs & some defines */
+#include <myisampack.h> /* packing of keys */
+#include <my_tree.h>
+#include <my_bitmap.h>
+#ifdef THREAD
+#include <my_pthread.h>
+#include <thr_lock.h>
+#else
+#include <my_no_pthread.h>
+#endif
+
+#include "ma_loghandler.h"
+#include "ma_control_file.h"
+
+#define MAX_NONMAPPED_INSERTS 1000
+#define MARIA_MAX_TREE_LEVELS 32
+#define SANITY_CHECKS
+
+struct st_transaction;
+
+/* undef map from my_nosys; We need test-if-disk full */
+#undef my_write
+
+typedef struct st_maria_status_info
+{
+ ha_rows records; /* Rows in table */
+ ha_rows del; /* Removed rows */
+ my_off_t empty; /* lost space in datafile */
+ my_off_t key_empty; /* lost space in indexfile */
+ my_off_t key_file_length;
+ my_off_t data_file_length;
+ ha_checksum checksum;
+} MARIA_STATUS_INFO;
+
+typedef struct st_maria_state_info
+{
+ struct
+ { /* Fileheader */
+ uchar file_version[4];
+ uchar options[2];
+ uchar header_length[2];
+ uchar state_info_length[2];
+ uchar base_info_length[2];
+ uchar base_pos[2];
+ uchar key_parts[2]; /* Key parts */
+ uchar unique_key_parts[2]; /* Key parts + unique parts */
+ uchar keys; /* number of keys in file */
+ uchar uniques; /* number of UNIQUE definitions */
+ uchar language; /* Language for indexes */
+ uchar fulltext_keys;
+ uchar data_file_type;
+ /* Used by mariapack to store the original data_file_type */
+ uchar org_data_file_type;
+ } header;
+
+ MARIA_STATUS_INFO state;
+ ha_rows split; /* number of split blocks */
+ my_off_t dellink; /* Link to next removed block */
+ ulonglong first_bitmap_with_space;
+ ulonglong auto_increment;
+ ulong process; /* process that updated table last */
+ ulong unique; /* Unique number for this process */
+ ulong update_count; /* Updated for each write lock */
+ ulong status;
+ ulong *rec_per_key_part;
+ ha_checksum checksum; /* Table checksum */
+ my_off_t *key_root; /* Start of key trees */
+ my_off_t key_del; /* delete links for index pages */
+ my_off_t rec_per_key_rows; /* Rows when calculating rec_per_key */
+
+ ulong sec_index_changed; /* Updated when new sec_index */
+ ulong sec_index_used; /* which extra index are in use */
+ ulonglong key_map; /* Which keys are in use */
+ ulong version; /* timestamp of create */
+ time_t create_time; /* Time when created database */
+ time_t recover_time; /* Time for last recover */
+ time_t check_time; /* Time for last check */
+ uint sortkey; /* sorted by this key (not used) */
+ uint open_count;
+ uint8 changed; /* Changed since mariachk */
+ LSN create_rename_lsn; /**< LSN when table was last created/renamed */
+ /** @brief Log horizon when state was last updated on disk */
+ TRANSLOG_ADDRESS is_of_horizon;
+
+ /* the following isn't saved on disk */
+ uint state_diff_length; /* Should be 0 */
+ uint state_length; /* Length of state header in file */
+ ulong *key_info;
+} MARIA_STATE_INFO;
+
+
+#define MARIA_STATE_INFO_SIZE \
+ (24 + LSN_STORE_SIZE*2 + 4 + 11*8 + 4*4 + 8 + 3*4 + 5*8)
+#define MARIA_STATE_KEY_SIZE 8
+#define MARIA_STATE_KEYBLOCK_SIZE 8
+#define MARIA_STATE_KEYSEG_SIZE 4
+#define MARIA_STATE_EXTRA_SIZE (MARIA_MAX_KEY*MARIA_STATE_KEY_SIZE + MARIA_MAX_KEY*HA_MAX_KEY_SEG*MARIA_STATE_KEYSEG_SIZE)
+#define MARIA_KEYDEF_SIZE (2+ 5*2)
+#define MARIA_UNIQUEDEF_SIZE (2+1+1)
+#define HA_KEYSEG_SIZE (6+ 2*2 + 4*2)
+#define MARIA_COLUMNDEF_SIZE (6+2+2+2+2+2+1+1)
+#define MARIA_BASE_INFO_SIZE (5*8 + 6*4 + 11*2 + 6 + 5*2 + 1 + 16)
+#define MARIA_INDEX_BLOCK_MARGIN 16 /* Safety margin for .MYI tables */
+/* Internal management bytes needed to store 2 keys on an index page */
+#define MARIA_INDEX_MIN_OVERHEAD_SIZE (4 + (TRANSID_SIZE+1) * 2)
+
+/*
+ Basic information of the Maria table. This is stored on disk
+ and not changed (unless we do DLL changes).
+*/
+
+typedef struct st_ma_base_info
+{
+ my_off_t keystart; /* Start of keys */
+ my_off_t max_data_file_length;
+ my_off_t max_key_file_length;
+ my_off_t margin_key_file_length;
+ ha_rows records, reloc; /* Create information */
+ ulong mean_row_length; /* Create information */
+ ulong reclength; /* length of unpacked record */
+ ulong pack_reclength; /* Length of full packed rec */
+ ulong min_pack_length;
+ ulong max_pack_length; /* Max possibly length of packed rec */
+ ulong min_block_length;
+ uint fields; /* fields in table */
+ uint fixed_not_null_fields;
+ uint fixed_not_null_fields_length;
+ uint max_field_lengths;
+ uint pack_fields; /* packed fields in table */
+ uint varlength_fields; /* char/varchar/blobs */
+ /* Number of bytes in the index used to refer to a row (2-8) */
+ uint rec_reflength;
+ /* Number of bytes in the index used to refer to another index page (2-8) */
+ uint key_reflength; /* = 2-8 */
+ uint keys; /* same as in state.header */
+ uint auto_key; /* Which key-1 is a auto key */
+ uint blobs; /* Number of blobs */
+ /* Length of packed bits (when table was created first time) */
+ uint pack_bytes;
+ /* Length of null bits (when table was created first time) */
+ uint original_null_bytes;
+ uint null_bytes; /* Null bytes in record */
+ uint field_offsets; /* Number of field offsets */
+ uint max_key_block_length; /* Max block length */
+ uint max_key_length; /* Max key length */
+ /* Extra allocation when using dynamic record format */
+ uint extra_alloc_bytes;
+ uint extra_alloc_procent;
+ uint is_nulls_extended; /* 1 if new null bytes */
+ uint min_row_length; /* Min possible length of a row */
+ uint default_row_flag; /* 0 or ROW_FLAG_NULLS_EXTENDED */
+ uint block_size;
+ /* Size of initial record buffer */
+ uint default_rec_buff_size;
+ /* Extra number of bytes the row format require in the record buffer */
+ uint extra_rec_buff_size;
+
+ /* The following are from the header */
+ uint key_parts, all_key_parts;
+ /**
+ @brief If false, we disable logging, versioning, transaction etc. Observe
+ difference with MARIA_SHARE::now_transactional
+ */
+ my_bool born_transactional;
+} MARIA_BASE_INFO;
+
+
+/* Structs used intern in database */
+
+typedef struct st_maria_blob /* Info of record */
+{
+ ulong offset; /* Offset to blob in record */
+ uint pack_length; /* Type of packed length */
+ ulong length; /* Calc:ed for each record */
+} MARIA_BLOB;
+
+
+typedef struct st_maria_pack
+{
+ ulong header_length;
+ uint ref_length;
+ uchar version;
+} MARIA_PACK;
+
+typedef struct st_maria_file_bitmap
+{
+ uchar *map;
+ ulonglong page; /* Page number for current bitmap */
+ uint used_size; /* Size of bitmap head that is not 0 */
+ my_bool changed; /* 1 if page needs to be flushed */
+ PAGECACHE_FILE file; /* datafile where bitmap is stored */
+
+#ifdef THREAD
+ pthread_mutex_t bitmap_lock;
+#endif
+ /* Constants, allocated when initiating bitmaps */
+ uint sizes[8]; /* Size per bit combination */
+ uint total_size; /* Total usable size of bitmap page */
+ uint block_size; /* Block size of file */
+ ulong pages_covered; /* Pages covered by bitmap + 1 */
+} MARIA_FILE_BITMAP;
+
+#define MARIA_CHECKPOINT_LOOKS_AT_ME 1
+#define MARIA_CHECKPOINT_SHOULD_FREE_ME 2
+#define MARIA_CHECKPOINT_SEEN_IN_LOOP 4
+
+typedef struct st_maria_share
+{ /* Shared between opens */
+ MARIA_STATE_INFO state;
+ MARIA_BASE_INFO base;
+ MARIA_KEYDEF ft2_keyinfo; /* Second-level ft-key
+ definition */
+ MARIA_KEYDEF *keyinfo; /* Key definitions */
+ MARIA_UNIQUEDEF *uniqueinfo; /* unique definitions */
+ HA_KEYSEG *keyparts; /* key part info */
+ MARIA_COLUMNDEF *columndef; /* Pointer to column information */
+ MARIA_PACK pack; /* Data about packed records */
+ MARIA_BLOB *blobs; /* Pointer to blobs */
+ char *unique_file_name; /* realpath() of index file */
+ char *data_file_name; /* Resolved path names from symlinks */
+ char *index_file_name;
+ char *open_file_name; /* parameter to open filename */
+ uchar *file_map; /* mem-map of file if possible */
+ PAGECACHE *pagecache; /* ref to the current key cache */
+ MARIA_DECODE_TREE *decode_trees;
+ uint16 *decode_tables;
+ uint16 id; /**< 2-byte id by which log records refer to the table */
+ /* Called the first time the table instance is opened */
+ my_bool (*once_init)(struct st_maria_share *, File);
+ /* Called when the last instance of the table is closed */
+ my_bool (*once_end)(struct st_maria_share *);
+ /* Is called for every open of the table */
+ my_bool (*init)(struct st_maria_info *);
+ /* Is called for every close of the table */
+ void (*end)(struct st_maria_info *);
+ /* Called when we want to read a record from a specific position */
+ int (*read_record)(struct st_maria_info *, uchar *, MARIA_RECORD_POS);
+ /* Initialize a scan */
+ my_bool (*scan_init)(struct st_maria_info *);
+ /* Read next record while scanning */
+ int (*scan)(struct st_maria_info *, uchar *, MARIA_RECORD_POS, my_bool);
+ /* End scan */
+ void (*scan_end)(struct st_maria_info *);
+ /* Pre-write of row (some handlers may do the actual write here) */
+ MARIA_RECORD_POS (*write_record_init)(struct st_maria_info *, const uchar *);
+ /* Write record (or accept write_record_init) */
+ my_bool (*write_record)(struct st_maria_info *, const uchar *);
+ /* Called when write failed */
+ my_bool (*write_record_abort)(struct st_maria_info *);
+ my_bool (*update_record)(struct st_maria_info *, MARIA_RECORD_POS,
+ const uchar *, const uchar *);
+ my_bool (*delete_record)(struct st_maria_info *, const uchar *record);
+ my_bool (*compare_record)(struct st_maria_info *, const uchar *);
+ /* calculate checksum for a row */
+ ha_checksum(*calc_checksum)(struct st_maria_info *, const uchar *);
+ /*
+ Calculate checksum for a row during write. May be 0 if we calculate
+ the checksum in write_record_init()
+ */
+ ha_checksum(*calc_write_checksum)(struct st_maria_info *, const uchar *);
+ /* calculate checksum for a row during check table */
+ ha_checksum(*calc_check_checksum)(struct st_maria_info *, const uchar *);
+ /* Compare a row in memory with a row on disk */
+ my_bool (*compare_unique)(struct st_maria_info *, MARIA_UNIQUEDEF *,
+ const uchar *record, MARIA_RECORD_POS pos);
+ /* Mapings to read/write the data file */
+ uint (*file_read)(MARIA_HA *, uchar *, uint, my_off_t, myf);
+ uint (*file_write)(MARIA_HA *, uchar *, uint, my_off_t, myf);
+ invalidator_by_filename invalidator; /* query cache invalidator */
+ ulong this_process; /* processid */
+ ulong last_process; /* For table-change-check */
+ ulong last_version; /* Version on start */
+ ulong options; /* Options used */
+ ulong min_pack_length; /* These are used by packed data */
+ ulong max_pack_length;
+ ulong state_diff_length;
+ uint rec_reflength; /* rec_reflength in use now */
+ uint unique_name_length;
+ uint32 ftparsers; /* Number of distinct ftparsers
+ + 1 */
+ PAGECACHE_FILE kfile; /* Shared keyfile */
+ File data_file; /* Shared data file */
+ int mode; /* mode of file on open */
+ uint reopen; /* How many times reopened */
+ uint w_locks, r_locks, tot_locks; /* Number of read/write locks */
+ uint block_size; /* block_size of keyfile & data file*/
+ /* Fixed length part of a packed row in BLOCK_RECORD format */
+ uint base_length;
+ myf write_flag;
+ enum data_file_type data_file_type;
+ enum pagecache_page_type page_type; /* value depending transactional */
+ uint8 in_checkpoint; /**< if Checkpoint looking at table */
+ my_bool temporary;
+ /* Below flag is needed to make log tables work with concurrent insert */
+ my_bool is_log_table;
+
+ my_bool changed, /* If changed since lock */
+ global_changed, /* If changed since open */
+ not_flushed, concurrent_insert;
+ my_bool delay_key_write;
+ my_bool have_rtree;
+ /**
+ @brief if the table is transactional right now. It may have been created
+ transactional (base.born_transactional==TRUE) but with transactionality
+ (logging) temporarily disabled (now_transactional==FALSE). The opposite
+ (FALSE, TRUE) is impossible.
+ */
+ my_bool now_transactional;
+#ifdef THREAD
+ THR_LOCK lock;
+ pthread_mutex_t intern_lock; /* Locking for use with _locking */
+ rw_lock_t *key_root_lock;
+#endif
+ my_off_t mmaped_length;
+ uint nonmmaped_inserts; /* counter of writing in
+ non-mmaped area */
+ MARIA_FILE_BITMAP bitmap;
+ rw_lock_t mmap_lock;
+ LSN lsn_of_file_id; /**< LSN of its last LOGREC_FILE_ID */
+} MARIA_SHARE;
+
+
+typedef uchar MARIA_BITMAP_BUFFER;
+
+typedef struct st_maria_bitmap_block
+{
+ ulonglong page; /* Page number */
+ /* Number of continuous pages. TAIL_BIT is set if this is a tail page */
+ uint page_count;
+ uint empty_space; /* Set for head and tail pages */
+ /*
+ Number of BLOCKS for block-region (holds all non-blob-fields or one blob)
+ */
+ uint sub_blocks;
+ /* set to <> 0 in write_record() if this block was actually used */
+ uint8 used;
+ uint8 org_bitmap_value;
+} MARIA_BITMAP_BLOCK;
+
+
+typedef struct st_maria_bitmap_blocks
+{
+ MARIA_BITMAP_BLOCK *block;
+ uint count;
+ my_bool tail_page_skipped; /* If some tail pages was not used */
+ my_bool page_skipped; /* If some full pages was not used */
+} MARIA_BITMAP_BLOCKS;
+
+
+/* Data about the currently read row */
+typedef struct st_maria_row
+{
+ MARIA_BITMAP_BLOCKS insert_blocks;
+ MARIA_BITMAP_BUFFER *extents;
+ MARIA_RECORD_POS lastpos, nextpos;
+ MARIA_RECORD_POS *tail_positions;
+ ha_checksum checksum;
+ uchar *empty_bits, *field_lengths;
+ uint *null_field_lengths; /* All null field lengths */
+ ulong *blob_lengths; /* Length for each blob */
+ ulong base_length, normal_length, char_length, varchar_length, blob_length;
+ ulong head_length, total_length;
+ size_t extents_buffer_length; /* Size of 'extents' buffer */
+ uint field_lengths_length; /* Length of data in field_lengths */
+ uint extents_count; /* number of extents in 'extents' */
+ uint full_page_count, tail_count; /* For maria_chk */
+ uint space_on_head_page;
+} MARIA_ROW;
+
+/* Data to scan row in blocked format */
+typedef struct st_maria_block_scan
+{
+ uchar *bitmap_buff, *bitmap_pos, *bitmap_end, *page_buff;
+ uchar *dir, *dir_end;
+ ulong bitmap_page;
+ ulonglong bits;
+ uint number_of_rows, bit_pos;
+ MARIA_RECORD_POS row_base_page;
+} MARIA_BLOCK_SCAN;
+
+
+struct st_maria_info
+{
+ MARIA_SHARE *s; /* Shared between open:s */
+ struct st_transaction *trn; /* Pointer to active transaction */
+ MARIA_STATUS_INFO *state, save_state;
+ MARIA_ROW cur_row; /* The active row that we just read */
+ MARIA_ROW new_row; /* Storage for a row during update */
+ MARIA_BLOCK_SCAN scan;
+ MARIA_BLOB *blobs; /* Pointer to blobs */
+ MARIA_BIT_BUFF bit_buff;
+ DYNAMIC_ARRAY bitmap_blocks;
+ DYNAMIC_ARRAY pinned_pages;
+ /* accumulate indexfile changes between write's */
+ TREE *bulk_insert;
+ LEX_STRING *log_row_parts; /* For logging */
+ DYNAMIC_ARRAY *ft1_to_ft2; /* used only in ft1->ft2 conversion */
+ MEM_ROOT ft_memroot; /* used by the parser */
+ MYSQL_FTPARSER_PARAM *ftparser_param; /* share info between init/deinit */
+ uchar *buff; /* page buffer */
+ uchar *keyread_buff; /* Buffer for last key read */
+ uchar *lastkey, *lastkey2; /* Last used search key */
+ uchar *first_mbr_key; /* Searhed spatial key */
+ uchar *rec_buff; /* Temp buffer for recordpack */
+ uchar *int_keypos, /* Save position for next/previous */
+ *int_maxpos; /* -""- */
+ uchar *update_field_data; /* Used by update in rows-in-block */
+ uint int_nod_flag; /* -""- */
+ uint32 int_keytree_version; /* -""- */
+ int (*read_record) (struct st_maria_info *, uchar*, MARIA_RECORD_POS);
+ invalidator_by_filename invalidator; /* query cache invalidator */
+ ulong this_unique; /* uniq filenumber or thread */
+ ulong last_unique; /* last unique number */
+ ulong this_loop; /* counter for this open */
+ ulong last_loop; /* last used counter */
+ MARIA_RECORD_POS save_lastpos;
+ MARIA_RECORD_POS dup_key_pos;
+ my_off_t pos; /* Intern variable */
+ my_off_t last_keypage; /* Last key page read */
+ my_off_t last_search_keypage; /* Last keypage when searching */
+
+ /*
+ QQ: the folloing two xxx_length fields should be removed,
+ as they are not compatible with parallel repair
+ */
+ ulong packed_length, blob_length; /* Length of found, packed record */
+ size_t rec_buff_size;
+ PAGECACHE_FILE dfile; /* The datafile */
+ IO_CACHE rec_cache; /* When cacheing records */
+ LIST open_list;
+ MY_BITMAP changed_fields;
+ uint opt_flag; /* Optim. for space/speed */
+ uint update; /* If file changed since open */
+ int lastinx; /* Last used index */
+ uint lastkey_length; /* Length of key in lastkey */
+ uint last_rkey_length; /* Last length in maria_rkey() */
+ enum ha_rkey_function last_key_func; /* CONTAIN, OVERLAP, etc */
+ uint save_lastkey_length;
+ uint pack_key_length; /* For MARIAMRG */
+ uint16 last_used_keyseg; /* For MARIAMRG */
+ int errkey; /* Got last error on this key */
+ int lock_type; /* How database was locked */
+ int tmp_lock_type; /* When locked by readinfo */
+ uint data_changed; /* Somebody has changed data */
+ uint save_update; /* When using KEY_READ */
+ int save_lastinx;
+ uint preload_buff_size; /* When preloading indexes */
+ myf lock_wait; /* is 0 or MY_DONT_WAIT */
+ my_bool was_locked; /* Was locked in panic */
+ my_bool append_insert_at_end; /* Set if concurrent insert */
+ my_bool quick_mode;
+ /* If info->keyread_buff can't be used for rnext */
+ my_bool page_changed;
+ /* If info->keyread_buff has to be re-read for rnext */
+ my_bool keyread_buff_used;
+ my_bool once_flags; /* For MARIA_MRG */
+#ifdef __WIN__
+ my_bool owned_by_merge; /* This Maria table is part of a merge union */
+#endif
+#ifdef THREAD
+ THR_LOCK_DATA lock;
+#endif
+ uchar *maria_rtree_recursion_state; /* For RTREE */
+ uchar length_buff[5]; /* temp buff to store blob lengths */
+ int maria_rtree_recursion_depth;
+};
+
+/* Some defines used by maria-functions */
+
+#define USE_WHOLE_KEY 65535 /* Use whole key in _search() */
+#define F_EXTRA_LCK -1
+
+/* bits in opt_flag */
+#define MEMMAP_USED 32
+#define REMEMBER_OLD_POS 64
+
+#define WRITEINFO_UPDATE_KEYFILE 1
+#define WRITEINFO_NO_UNLOCK 2
+
+/* once_flags */
+#define USE_PACKED_KEYS 1
+#define RRND_PRESERVE_LASTINX 2
+
+/* bits in state.changed */
+
+#define STATE_CHANGED 1
+#define STATE_CRASHED 2
+#define STATE_CRASHED_ON_REPAIR 4
+#define STATE_NOT_ANALYZED 8
+#define STATE_NOT_OPTIMIZED_KEYS 16
+#define STATE_NOT_SORTED_PAGES 32
+#define STATE_NOT_OPTIMIZED_ROWS 64
+
+/* options to maria_read_cache */
+
+#define READING_NEXT 1
+#define READING_HEADER 2
+
+#define maria_data_on_page(x) ((uint) mi_uint2korr(x) & 32767)
+#define maria_putint(x,y,nod) { uint16 boh=(nod ? (uint16) 32768 : 0) + (uint16) (y);\
+ mi_int2store(x,boh); }
+#define _ma_test_if_nod(x) (x[0] & 128 ? info->s->base.key_reflength : 0)
+#define maria_mark_crashed(x) do{(x)->s->state.changed|= STATE_CRASHED; \
+ DBUG_PRINT("error", ("Marked table crashed")); \
+ }while(0)
+#define maria_mark_crashed_on_repair(x) do{(x)->s->state.changed|= \
+ STATE_CRASHED|STATE_CRASHED_ON_REPAIR; \
+ (x)->update|= HA_STATE_CHANGED; \
+ DBUG_PRINT("error", \
+ ("Marked table crashed")); \
+ }while(0)
+#define maria_is_crashed(x) ((x)->s->state.changed & STATE_CRASHED)
+#define maria_is_crashed_on_repair(x) ((x)->s->state.changed & STATE_CRASHED_ON_REPAIR)
+#ifdef EXTRA_DEBUG
+#define maria_print_error(SHARE, ERRNO) \
+ _ma_report_error((ERRNO), (SHARE)->index_file_name)
+#else
+#define maria_print_error(SHARE, ERRNO) while (0)
+#endif
+
+
+/* Functions to store length of space packed keys, VARCHAR or BLOB keys */
+
+#define store_key_length(key,length) \
+{ if ((length) < 255) \
+ { *(key)=(length); } \
+ else \
+ { *(key)=255; mi_int2store((key)+1,(length)); } \
+}
+
+#define get_key_full_length(length,key) \
+ { if (*(uchar*) (key) != 255) \
+ length= ((uint) *(uchar*) ((key)++))+1; \
+ else \
+ { length=mi_uint2korr((key)+1)+3; (key)+=3; } \
+}
+
+#define get_key_full_length_rdonly(length,key) \
+{ if (*(uchar*) (key) != 255) \
+ length= ((uint) *(uchar*) ((key)))+1; \
+ else \
+ { length=mi_uint2korr((key)+1)+3; } \
+}
+
+#define maria_max_key_length() ((maria_block_size - MARIA_INDEX_MIN_OVERHEAD_SIZE)/2)
+#define get_pack_length(length) ((length) >= 255 ? 3 : 1)
+
+#define MARIA_MIN_BLOCK_LENGTH 20 /* Because of delete-link */
+/* Don't use to small record-blocks */
+#define MARIA_EXTEND_BLOCK_LENGTH 20
+#define MARIA_SPLIT_LENGTH ((MARIA_EXTEND_BLOCK_LENGTH+4)*2)
+ /* Max prefix of record-block */
+#define MARIA_MAX_DYN_BLOCK_HEADER 20
+#define MARIA_BLOCK_INFO_HEADER_LENGTH 20
+#define MARIA_DYN_DELETE_BLOCK_HEADER 20 /* length of delete-block-header */
+#define MARIA_DYN_MAX_BLOCK_LENGTH ((1L << 24)-4L)
+#define MARIA_DYN_MAX_ROW_LENGTH (MARIA_DYN_MAX_BLOCK_LENGTH - MARIA_SPLIT_LENGTH)
+#define MARIA_DYN_ALIGN_SIZE 4 /* Align blocks on this */
+#define MARIA_MAX_DYN_HEADER_BYTE 13 /* max header uchar for dynamic rows */
+#define MARIA_MAX_BLOCK_LENGTH ((((ulong) 1 << 24)-1) & (~ (ulong) (MARIA_DYN_ALIGN_SIZE-1)))
+#define MARIA_REC_BUFF_OFFSET ALIGN_SIZE(MARIA_DYN_DELETE_BLOCK_HEADER+sizeof(uint32))
+
+#define MEMMAP_EXTRA_MARGIN 7 /* Write this as a suffix for file */
+
+#define PACK_TYPE_SELECTED 1 /* Bits in field->pack_type */
+#define PACK_TYPE_SPACE_FIELDS 2
+#define PACK_TYPE_ZERO_FILL 4
+#define MARIA_FOUND_WRONG_KEY 32738 /* Impossible value from ha_key_cmp */
+
+#define MARIA_BLOCK_SIZE(key_length,data_pointer,key_pointer,block_size) (((((key_length)+(data_pointer)+(key_pointer))*4+(key_pointer)+2)/(block_size)+1)*(block_size))
+#define MARIA_MAX_KEYPTR_SIZE 5 /* For calculating block lengths */
+#define MARIA_MIN_KEYBLOCK_LENGTH 50 /* When to split delete blocks */
+
+#define MARIA_MIN_SIZE_BULK_INSERT_TREE 16384 /* this is per key */
+#define MARIA_MIN_ROWS_TO_USE_BULK_INSERT 100
+#define MARIA_MIN_ROWS_TO_DISABLE_INDEXES 100
+#define MARIA_MIN_ROWS_TO_USE_WRITE_CACHE 10
+
+/* The UNIQUE check is done with a hashed long key */
+
+#define MARIA_UNIQUE_HASH_TYPE HA_KEYTYPE_ULONG_INT
+#define maria_unique_store(A,B) mi_int4store((A),(B))
+
+#ifdef THREAD
+extern pthread_mutex_t THR_LOCK_maria;
+#endif
+#if !defined(THREAD) || defined(DONT_USE_RW_LOCKS)
+#define rw_wrlock(A) {}
+#define rw_rdlock(A) {}
+#define rw_unlock(A) {}
+#endif
+
+
+/* Some extern variables */
+extern LIST *maria_open_list;
+extern uchar NEAR maria_file_magic[], NEAR maria_pack_file_magic[];
+extern uint NEAR maria_read_vec[], NEAR maria_readnext_vec[];
+extern uint maria_quick_table_bits;
+extern const char *maria_data_root;
+extern uchar maria_zero_string[];
+extern my_bool maria_inited;
+
+
+/* This is used by _ma_calc_xxx_key_length och _ma_store_key */
+typedef struct st_maria_s_param
+{
+ uint ref_length, key_length, n_ref_length;
+ uint n_length, totlength, part_of_prev_key, prev_length, pack_marker;
+ const uchar *key;
+ uchar *prev_key, *next_key_pos;
+ bool store_not_null;
+} MARIA_KEY_PARAM;
+
+
+/* Used to store reference to pinned page */
+typedef struct st_pinned_page
+{
+ PAGECACHE_PAGE_LINK link;
+ enum pagecache_page_lock unlock;
+} MARIA_PINNED_PAGE;
+
+
+/* Prototypes for intern functions */
+extern int _ma_read_dynamic_record(MARIA_HA *, uchar *, MARIA_RECORD_POS);
+extern int _ma_read_rnd_dynamic_record(MARIA_HA *, uchar *, MARIA_RECORD_POS,
+ my_bool);
+extern my_bool _ma_write_dynamic_record(MARIA_HA *, const uchar *);
+extern my_bool _ma_update_dynamic_record(MARIA_HA *, MARIA_RECORD_POS,
+ const uchar *, const uchar *);
+extern my_bool _ma_delete_dynamic_record(MARIA_HA *info, const uchar *record);
+extern my_bool _ma_cmp_dynamic_record(MARIA_HA *info, const uchar *record);
+extern my_bool _ma_write_blob_record(MARIA_HA *, const uchar *);
+extern my_bool _ma_update_blob_record(MARIA_HA *, MARIA_RECORD_POS,
+ const uchar *, const uchar *);
+extern int _ma_read_static_record(MARIA_HA *info, uchar *, MARIA_RECORD_POS);
+extern int _ma_read_rnd_static_record(MARIA_HA *, uchar *, MARIA_RECORD_POS,
+ my_bool);
+extern my_bool _ma_write_static_record(MARIA_HA *, const uchar *);
+extern my_bool _ma_update_static_record(MARIA_HA *, MARIA_RECORD_POS,
+ const uchar *, const uchar *);
+extern my_bool _ma_delete_static_record(MARIA_HA *info, const uchar *record);
+extern my_bool _ma_cmp_static_record(MARIA_HA *info, const uchar *record);
+extern int _ma_ck_write(MARIA_HA *info, uint keynr, uchar *key,
+ uint length);
+extern int _ma_ck_real_write_btree(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *key, uint key_length,
+ MARIA_RECORD_POS *root, uint comp_flag);
+extern int _ma_enlarge_root(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *key, MARIA_RECORD_POS *root);
+extern int _ma_insert(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key,
+ uchar *anc_buff, uchar *key_pos, uchar *key_buff,
+ uchar *father_buff, uchar *father_keypos,
+ my_off_t father_page, my_bool insert_last);
+extern int _ma_split_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *key, uchar *buff, uchar *key_buff,
+ my_bool insert_last);
+extern uchar *_ma_find_half_pos(uint nod_flag, MARIA_KEYDEF *keyinfo,
+ uchar *page, uchar *key,
+ uint *return_key_length,
+ uchar ** after_key);
+extern int _ma_calc_static_key_length(MARIA_KEYDEF *keyinfo, uint nod_flag,
+ uchar *key_pos, uchar *org_key,
+ uchar *key_buff, const uchar *key,
+ MARIA_KEY_PARAM *s_temp);
+extern int _ma_calc_var_key_length(MARIA_KEYDEF *keyinfo, uint nod_flag,
+ uchar *key_pos, uchar *org_key,
+ uchar *key_buff, const uchar *key,
+ MARIA_KEY_PARAM *s_temp);
+extern int _ma_calc_var_pack_key_length(MARIA_KEYDEF *keyinfo,
+ uint nod_flag, uchar *key_pos,
+ uchar *org_key, uchar *prev_key,
+ const uchar *key,
+ MARIA_KEY_PARAM *s_temp);
+extern int _ma_calc_bin_pack_key_length(MARIA_KEYDEF *keyinfo,
+ uint nod_flag, uchar *key_pos,
+ uchar *org_key, uchar *prev_key,
+ const uchar *key,
+ MARIA_KEY_PARAM *s_temp);
+void _ma_store_static_key(MARIA_KEYDEF *keyinfo, uchar *key_pos,
+ MARIA_KEY_PARAM *s_temp);
+void _ma_store_var_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos,
+ MARIA_KEY_PARAM *s_temp);
+#ifdef NOT_USED
+void _ma_store_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos,
+ MARIA_KEY_PARAM *s_temp);
+#endif
+void _ma_store_bin_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos,
+ MARIA_KEY_PARAM *s_temp);
+
+extern int _ma_ck_delete(MARIA_HA *info, uint keynr, uchar *key,
+ uint key_length);
+extern int _ma_readinfo(MARIA_HA *info, int lock_flag, int check_keybuffer);
+extern int _ma_writeinfo(MARIA_HA *info, uint options);
+extern int _ma_test_if_changed(MARIA_HA *info);
+extern int _ma_mark_file_changed(MARIA_HA *info);
+extern int _ma_decrement_open_count(MARIA_HA *info);
+extern int _ma_check_index(MARIA_HA *info, int inx);
+extern int _ma_search(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key,
+ uint key_len, uint nextflag, my_off_t pos);
+extern int _ma_bin_search(struct st_maria_info *info, MARIA_KEYDEF *keyinfo,
+ uchar *page, uchar *key, uint key_len,
+ uint comp_flag, uchar **ret_pos, uchar *buff,
+ my_bool *was_last_key);
+extern int _ma_seq_search(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *page, uchar *key, uint key_len,
+ uint comp_flag, uchar ** ret_pos, uchar *buff,
+ my_bool *was_last_key);
+extern int _ma_prefix_search(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *page, uchar *key, uint key_len,
+ uint comp_flag, uchar ** ret_pos, uchar *buff,
+ my_bool *was_last_key);
+extern my_off_t _ma_kpos(uint nod_flag, uchar *after_key);
+extern void _ma_kpointer(MARIA_HA *info, uchar *buff, my_off_t pos);
+extern MARIA_RECORD_POS _ma_dpos(MARIA_HA *info, uint nod_flag,
+ const uchar *after_key);
+extern MARIA_RECORD_POS _ma_rec_pos(MARIA_SHARE *info, uchar *ptr);
+extern void _ma_dpointer(MARIA_HA *info, uchar *buff, MARIA_RECORD_POS pos);
+extern uint _ma_get_static_key(MARIA_KEYDEF *keyinfo, uint nod_flag,
+ uchar **page, uchar *key);
+extern uint _ma_get_pack_key(MARIA_KEYDEF *keyinfo, uint nod_flag,
+ uchar **page, uchar *key);
+extern uint _ma_get_binary_pack_key(MARIA_KEYDEF *keyinfo, uint nod_flag,
+ uchar ** page_pos, uchar *key);
+extern uchar *_ma_get_last_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *keypos, uchar *lastkey,
+ uchar *endpos, uint *return_key_length);
+extern uchar *_ma_get_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *page, uchar *key, uchar *keypos,
+ uint *return_key_length);
+extern uint _ma_keylength(MARIA_KEYDEF *keyinfo, const uchar *key);
+extern uint _ma_keylength_part(MARIA_KEYDEF *keyinfo, register const uchar *key,
+ HA_KEYSEG *end);
+extern uchar *_ma_move_key(MARIA_KEYDEF *keyinfo, uchar *to, const uchar *from);
+extern int _ma_search_next(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uchar *key, uint key_length, uint nextflag,
+ my_off_t pos);
+extern int _ma_search_first(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ my_off_t pos);
+extern int _ma_search_last(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ my_off_t pos);
+extern uchar *_ma_fetch_keypage(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ my_off_t page, int level, uchar *buff,
+ int return_buffer);
+extern int _ma_write_keypage(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ my_off_t page, int level, uchar *buff);
+extern int _ma_dispose(MARIA_HA *info, MARIA_KEYDEF *keyinfo, my_off_t pos,
+ int level);
+extern my_off_t _ma_new(MARIA_HA *info, MARIA_KEYDEF *keyinfo, int level);
+extern uint _ma_make_key(MARIA_HA *info, uint keynr, uchar *key,
+ const uchar *record, MARIA_RECORD_POS filepos);
+extern uint _ma_pack_key(MARIA_HA *info, uint keynr, uchar *key,
+ const uchar *old, key_part_map keypart_map,
+ HA_KEYSEG ** last_used_keyseg);
+extern int _ma_read_key_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS);
+extern int _ma_read_cache(IO_CACHE *info, uchar *buff, MARIA_RECORD_POS pos,
+ uint length, int re_read_if_possibly);
+extern ulonglong ma_retrieve_auto_increment(MARIA_HA *info, const uchar *record);
+
+extern my_bool _ma_alloc_buffer(uchar **old_addr, size_t *old_size,
+ size_t new_size);
+extern ulong _ma_rec_unpack(MARIA_HA *info, uchar *to, uchar *from,
+ ulong reclength);
+extern my_bool _ma_rec_check(MARIA_HA *info, const uchar *record,
+ uchar *packpos, ulong packed_length,
+ my_bool with_checkum, ha_checksum checksum);
+extern int _ma_write_part_record(MARIA_HA *info, my_off_t filepos,
+ ulong length, my_off_t next_filepos,
+ uchar ** record, ulong *reclength,
+ int *flag);
+extern void _ma_print_key(FILE *stream, HA_KEYSEG *keyseg,
+ const uchar *key, uint length);
+extern my_bool _ma_once_init_pack_row(MARIA_SHARE *share, File dfile);
+extern my_bool _ma_once_end_pack_row(MARIA_SHARE *share);
+extern int _ma_read_pack_record(MARIA_HA *info, uchar *buf,
+ MARIA_RECORD_POS filepos);
+extern int _ma_read_rnd_pack_record(MARIA_HA *, uchar *, MARIA_RECORD_POS,
+ my_bool);
+extern int _ma_pack_rec_unpack(MARIA_HA *info, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *from, ulong reclength);
+extern ulonglong _ma_safe_mul(ulonglong a, ulonglong b);
+extern int _ma_ft_update(MARIA_HA *info, uint keynr, uchar *keybuf,
+ const uchar *oldrec, const uchar *newrec,
+ my_off_t pos);
+
+/*
+ Parameter to _ma_get_block_info
+ The dynamic row header is read into this struct. For an explanation of
+ the fields, look at the function _ma_get_block_info().
+*/
+
+typedef struct st_maria_block_info
+{
+ uchar header[MARIA_BLOCK_INFO_HEADER_LENGTH];
+ ulong rec_len;
+ ulong data_len;
+ ulong block_len;
+ ulong blob_len;
+ MARIA_RECORD_POS filepos;
+ MARIA_RECORD_POS next_filepos;
+ MARIA_RECORD_POS prev_filepos;
+ uint second_read;
+ uint offset;
+} MARIA_BLOCK_INFO;
+
+
+/* bits in return from _ma_get_block_info */
+
+#define BLOCK_FIRST 1
+#define BLOCK_LAST 2
+#define BLOCK_DELETED 4
+#define BLOCK_ERROR 8 /* Wrong data */
+#define BLOCK_SYNC_ERROR 16 /* Right data at wrong place */
+#define BLOCK_FATAL_ERROR 32 /* hardware-error */
+
+#define NEED_MEM ((uint) 10*4*(IO_SIZE+32)+32) /* Nead for recursion */
+#define MAXERR 20
+#define BUFFERS_WHEN_SORTING 16 /* Alloc for sort-key-tree */
+#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE
+#define INDEX_TMP_EXT ".TMM"
+#define DATA_TMP_EXT ".TMD"
+
+#define UPDATE_TIME 1
+#define UPDATE_STAT 2
+#define UPDATE_SORT 4
+#define UPDATE_AUTO_INC 8
+#define UPDATE_OPEN_COUNT 16
+
+#define USE_BUFFER_INIT (((1024L*512L-MALLOC_OVERHEAD)/IO_SIZE)*IO_SIZE)
+#define READ_BUFFER_INIT (1024L*256L-MALLOC_OVERHEAD)
+#define SORT_BUFFER_INIT (2048L*1024L-MALLOC_OVERHEAD)
+#define MIN_SORT_BUFFER (4096-MALLOC_OVERHEAD)
+
+#define fast_ma_writeinfo(INFO) if (!(INFO)->s->tot_locks) (void) _ma_writeinfo((INFO),0)
+#define fast_ma_readinfo(INFO) ((INFO)->lock_type == F_UNLCK) && _ma_readinfo((INFO),F_RDLCK,1)
+
+extern uint _ma_get_block_info(MARIA_BLOCK_INFO *, File, my_off_t);
+extern uint _ma_rec_pack(MARIA_HA *info, uchar *to, const uchar *from);
+extern uint _ma_pack_get_block_info(MARIA_HA *maria, MARIA_BIT_BUFF *bit_buff,
+ MARIA_BLOCK_INFO *info, uchar **rec_buff_p,
+ size_t *rec_buff_size,
+ File file, my_off_t filepos);
+extern void _ma_store_blob_length(uchar *pos, uint pack_length, uint length);
+extern void _ma_report_error(int errcode, const char *file_name);
+extern my_bool _ma_memmap_file(MARIA_HA *info);
+extern void _ma_unmap_file(MARIA_HA *info);
+extern uint _ma_save_pack_length(uint version, uchar * block_buff,
+ ulong length);
+extern uint _ma_calc_pack_length(uint version, ulong length);
+extern ulong _ma_calc_blob_length(uint length, const uchar *pos);
+extern uint _ma_mmap_pread(MARIA_HA *info, uchar *Buffer,
+ uint Count, my_off_t offset, myf MyFlags);
+extern uint _ma_mmap_pwrite(MARIA_HA *info, uchar *Buffer,
+ uint Count, my_off_t offset, myf MyFlags);
+extern uint _ma_nommap_pread(MARIA_HA *info, uchar *Buffer,
+ uint Count, my_off_t offset, myf MyFlags);
+extern uint _ma_nommap_pwrite(MARIA_HA *info, uchar *Buffer,
+ uint Count, my_off_t offset, myf MyFlags);
+
+uint _ma_state_info_write(MARIA_SHARE *share, uint pWrite);
+uint _ma_state_info_write_sub(File file, MARIA_STATE_INFO *state, uint pWrite);
+uint _ma_state_info_read_dsk(File file, MARIA_STATE_INFO *state);
+uint _ma_base_info_write(File file, MARIA_BASE_INFO *base);
+int _ma_keyseg_write(File file, const HA_KEYSEG *keyseg);
+char *_ma_keyseg_read(char *ptr, HA_KEYSEG *keyseg);
+uint _ma_keydef_write(File file, MARIA_KEYDEF *keydef);
+char *_ma_keydef_read(char *ptr, MARIA_KEYDEF *keydef);
+uint _ma_uniquedef_write(File file, MARIA_UNIQUEDEF *keydef);
+char *_ma_uniquedef_read(char *ptr, MARIA_UNIQUEDEF *keydef);
+uint _ma_columndef_write(File file, MARIA_COLUMNDEF *columndef);
+char *_ma_columndef_read(char *ptr, MARIA_COLUMNDEF *columndef);
+ulong _ma_calc_total_blob_length(MARIA_HA *info, const uchar *record);
+ha_checksum _ma_checksum(MARIA_HA *info, const uchar *buf);
+ha_checksum _ma_static_checksum(MARIA_HA *info, const uchar *buf);
+my_bool _ma_check_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+ uchar *record, ha_checksum unique_hash,
+ MARIA_RECORD_POS pos);
+ha_checksum _ma_unique_hash(MARIA_UNIQUEDEF *def, const uchar *buf);
+my_bool _ma_cmp_static_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+ const uchar *record, MARIA_RECORD_POS pos);
+my_bool _ma_cmp_dynamic_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+ const uchar *record, MARIA_RECORD_POS pos);
+my_bool _ma_unique_comp(MARIA_UNIQUEDEF *def, const uchar *a, const uchar *b,
+ my_bool null_are_equal);
+void _ma_get_status(void *param, int concurrent_insert);
+void _ma_update_status(void *param);
+void _ma_restore_status(void *param);
+void _ma_copy_status(void *to, void *from);
+my_bool _ma_check_status(void *param);
+void _ma_reset_status(MARIA_HA *maria);
+#include "ma_commit.h"
+
+extern MARIA_HA *_ma_test_if_reopen(char *filename);
+my_bool _ma_check_table_is_closed(const char *name, const char *where);
+int _ma_open_datafile(MARIA_HA *info, MARIA_SHARE *share, File file_to_dup);
+int _ma_open_keyfile(MARIA_SHARE *share);
+void _ma_setup_functions(register MARIA_SHARE *share);
+my_bool _ma_dynmap_file(MARIA_HA *info, my_off_t size);
+void _ma_remap_file(MARIA_HA *info, my_off_t size);
+
+MARIA_RECORD_POS _ma_write_init_default(MARIA_HA *info, const uchar *record);
+my_bool _ma_write_abort_default(MARIA_HA *info);
+
+C_MODE_START
+#define MARIA_FLUSH_DATA 1
+#define MARIA_FLUSH_INDEX 2
+int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index,
+ enum flush_type flush_type_for_data,
+ enum flush_type flush_type_for_index);
+/* Functions needed by _ma_check (are overrided in MySQL) */
+volatile int *_ma_killed_ptr(HA_CHECK *param);
+void _ma_check_print_error _VARARGS((HA_CHECK *param, const char *fmt, ...));
+void _ma_check_print_warning _VARARGS((HA_CHECK *param, const char *fmt, ...));
+void _ma_check_print_info _VARARGS((HA_CHECK *param, const char *fmt, ...));
+C_MODE_END
+
+int _ma_flush_pending_blocks(MARIA_SORT_PARAM *param);
+int _ma_sort_ft_buf_flush(MARIA_SORT_PARAM *sort_param);
+int _ma_thr_write_keys(MARIA_SORT_PARAM *sort_param);
+#ifdef THREAD
+pthread_handler_t _ma_thr_find_all_keys(void *arg);
+#endif
+int _ma_flush_table_files_after_repair(HA_CHECK *param, MARIA_HA *info);
+
+int _ma_sort_write_record(MARIA_SORT_PARAM *sort_param);
+int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages,
+ ulong);
+int _ma_sync_table_files(const MARIA_HA *info);
+int _ma_initialize_data_file(MARIA_SHARE *share, File dfile);
+int _ma_update_create_rename_lsn(MARIA_SHARE *share,
+ LSN lsn, my_bool do_sync);
+int _ma_update_create_rename_lsn_sub(MARIA_SHARE *share,
+ LSN lsn, my_bool do_sync);
+
+void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn);
+#define _ma_tmp_disable_logging_for_table(S) \
+ { (S)->now_transactional= FALSE; (S)->page_type= PAGECACHE_PLAIN_PAGE; }
+#define _ma_reenable_logging_for_table(S) \
+ { if (((S)->now_transactional= (S)->base.born_transactional)) \
+ (S)->page_type= PAGECACHE_LSN_PAGE; }
+
+extern PAGECACHE *maria_log_pagecache;
diff --git a/storage/maria/maria_ftdump.c b/storage/maria/maria_ftdump.c
new file mode 100644
index 00000000000..9df86b50474
--- /dev/null
+++ b/storage/maria/maria_ftdump.c
@@ -0,0 +1,279 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code
+ added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */
+
+#include "ma_ftdefs.h"
+#include <my_getopt.h>
+
+static void usage();
+static void complain(int val);
+static my_bool get_one_option(int, const struct my_option *, char *);
+
+static int count=0, stats=0, dump=0, lstats=0;
+static my_bool verbose;
+static char *query=NULL;
+static uint lengths[256];
+
+#define MAX_LEN (HA_FT_MAXBYTELEN+10)
+#define HOW_OFTEN_TO_WRITE 10000
+
+static struct my_option my_long_options[] =
+{
+ {"help", 'h', "Display help and exit.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"help", '?', "Synonym for -h.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"count", 'c', "Calculate per-word stats (counts and global weights).",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"dump", 'd', "Dump index (incl. data offsets and word weights).",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"length", 'l', "Report length distribution.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"stats", 's', "Report global stats.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"verbose", 'v', "Be verbose.",
+ (uchar**) &verbose, (uchar**) &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+ { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+
+int main(int argc,char *argv[])
+{
+ int error=0, subkeys;
+ uint keylen, keylen2=0, inx, doc_cnt=0;
+ float weight= 1.0;
+ double gws, min_gws=0, avg_gws=0;
+ MARIA_HA *info;
+ char buf[MAX_LEN], buf2[MAX_LEN], buf_maxlen[MAX_LEN], buf_min_gws[MAX_LEN];
+ ulong total=0, maxlen=0, uniq=0, max_doc_cnt=0;
+ struct { MARIA_HA *info; } aio0, *aio=&aio0; /* for GWS_IN_USE */
+
+ MY_INIT(argv[0]);
+ if ((error= handle_options(&argc, &argv, my_long_options, get_one_option)))
+ exit(error);
+ maria_init();
+ if (count || dump)
+ verbose=0;
+ if (!count && !dump && !lstats && !query)
+ stats=1;
+
+ if (verbose)
+ setbuf(stdout,NULL);
+
+ if (argc < 2)
+ usage();
+
+ {
+ char *end;
+ inx= (uint) strtoll(argv[1], &end, 10);
+ if (*end)
+ usage();
+ }
+
+ init_pagecache(maria_pagecache, USE_BUFFER_INIT, 0, 0,
+ MARIA_KEY_BLOCK_LENGTH);
+
+ if (!(info=maria_open(argv[0], O_RDONLY,
+ HA_OPEN_ABORT_IF_LOCKED|HA_OPEN_FROM_SQL_LAYER)))
+ {
+ error=my_errno;
+ goto err;
+ }
+
+ *buf2=0;
+ aio->info=info;
+
+ if ((inx >= info->s->base.keys) ||
+ !(info->s->keyinfo[inx].flag & HA_FULLTEXT))
+ {
+ printf("Key %d in table %s is not a FULLTEXT key\n", inx, info->s->open_file_name);
+ goto err;
+ }
+
+ maria_lock_database(info, F_EXTRA_LCK);
+
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ info->update|= HA_STATE_PREV_FOUND;
+
+ while (!(error=maria_rnext(info,NULL,inx)))
+ {
+ keylen=*(info->lastkey);
+
+ subkeys=ft_sintXkorr(info->lastkey+keylen+1);
+ if (subkeys >= 0)
+ weight=*(float*)&subkeys;
+
+#ifdef HAVE_SNPRINTF
+ snprintf(buf,MAX_LEN,"%.*s",(int) keylen,info->lastkey+1);
+#else
+ sprintf(buf,"%.*s",(int) keylen,info->lastkey+1);
+#endif
+ my_casedn_str(default_charset_info,buf);
+ total++;
+ lengths[keylen]++;
+
+ if (count || stats)
+ {
+ if (strcmp(buf, buf2))
+ {
+ if (*buf2)
+ {
+ uniq++;
+ avg_gws+=gws=GWS_IN_USE;
+ if (count)
+ printf("%9u %20.7f %s\n",doc_cnt,gws,buf2);
+ if (maxlen<keylen2)
+ {
+ maxlen=keylen2;
+ strmov(buf_maxlen, buf2);
+ }
+ if (max_doc_cnt < doc_cnt)
+ {
+ max_doc_cnt=doc_cnt;
+ strmov(buf_min_gws, buf2);
+ min_gws=gws;
+ }
+ }
+ strmov(buf2, buf);
+ keylen2=keylen;
+ doc_cnt=0;
+ }
+ doc_cnt+= (subkeys >= 0 ? 1 : -subkeys);
+ }
+ if (dump)
+ {
+ if (subkeys>=0)
+ printf("%9lx %20.7f %s\n", (long) info->cur_row.lastpos,weight,buf);
+ else
+ printf("%9lx => %17d %s\n",(long) info->cur_row.lastpos,-subkeys,buf);
+ }
+ if (verbose && (total%HOW_OFTEN_TO_WRITE)==0)
+ printf("%10ld\r",total);
+ }
+ maria_lock_database(info, F_UNLCK);
+
+ if (count || stats)
+ {
+ if (*buf2)
+ {
+ uniq++;
+ avg_gws+=gws=GWS_IN_USE;
+ if (count)
+ printf("%9u %20.7f %s\n",doc_cnt,gws,buf2);
+ if (maxlen<keylen2)
+ {
+ maxlen=keylen2;
+ strmov(buf_maxlen, buf2);
+ }
+ if (max_doc_cnt < doc_cnt)
+ {
+ max_doc_cnt=doc_cnt;
+ strmov(buf_min_gws, buf2);
+ min_gws=gws;
+ }
+ }
+ }
+
+ if (stats)
+ {
+ count=0;
+ for (inx=0;inx<256;inx++)
+ {
+ count+=lengths[inx];
+ if ((ulong) count >= total/2)
+ break;
+ }
+ printf("Total rows: %lu\nTotal words: %lu\n"
+ "Unique words: %lu\nLongest word: %lu chars (%s)\n"
+ "Median length: %u\n"
+ "Average global weight: %f\n"
+ "Most common word: %lu times, weight: %f (%s)\n",
+ (long) info->state->records, total, uniq, maxlen, buf_maxlen,
+ inx, avg_gws/uniq, max_doc_cnt, min_gws, buf_min_gws);
+ }
+ if (lstats)
+ {
+ count=0;
+ for (inx=0; inx<256; inx++)
+ {
+ count+=lengths[inx];
+ if (count && lengths[inx])
+ printf("%3u: %10lu %5.2f%% %20lu %4.1f%%\n", inx,
+ (ulong) lengths[inx],100.0*lengths[inx]/total,(ulong) count,
+ 100.0*count/total);
+ }
+ }
+
+err:
+ if (error && error != HA_ERR_END_OF_FILE)
+ printf("got error %d\n",my_errno);
+ if (info)
+ maria_close(info);
+ maria_end();
+ return 0;
+}
+
+
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+ char *argument __attribute__((unused)))
+{
+ switch(optid) {
+ case 'd':
+ dump=1;
+ complain(count || query);
+ break;
+ case 's':
+ stats=1;
+ complain(query!=0);
+ break;
+ case 'c':
+ count= 1;
+ complain(dump || query);
+ break;
+ case 'l':
+ lstats=1;
+ complain(query!=0);
+ break;
+ case '?':
+ case 'h':
+ usage();
+ }
+ return 0;
+}
+
+#include <help_start.h>
+
+static void usage()
+{
+ printf("Use: maria_ft_dump <table_name> <index_num>\n");
+ my_print_help(my_long_options);
+ my_print_variables(my_long_options);
+ NETWARE_SET_SCREEN_MODE(1);
+ exit(1);
+}
+
+#include <help_end.h>
+
+static void complain(int val) /* Kinda assert :-) */
+{
+ if (val)
+ {
+ printf("You cannot use these options together!\n");
+ exit(1);
+ }
+}
diff --git a/storage/maria/maria_pack.c b/storage/maria/maria_pack.c
new file mode 100644
index 00000000000..83f88fcb0dc
--- /dev/null
+++ b/storage/maria/maria_pack.c
@@ -0,0 +1,3227 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Pack MARIA file */
+
+#ifndef USE_MY_FUNC
+#define USE_MY_FUNC /* We need at least my_malloc */
+#endif
+
+#include "maria_def.h"
+#include <queues.h>
+#include <my_tree.h>
+#include "mysys_err.h"
+#ifdef MSDOS
+#include <io.h>
+#endif
+#ifndef __GNU_LIBRARY__
+#define __GNU_LIBRARY__ /* Skip warnings in getopt.h */
+#endif
+#include <my_getopt.h>
+#include <assert.h>
+
+#if SIZEOF_LONG_LONG > 4
+#define BITS_SAVED 64
+#else
+#define BITS_SAVED 32
+#endif
+
+#define IS_OFFSET ((uint) 32768) /* Bit if offset or char in tree */
+#define HEAD_LENGTH 32
+#define ALLOWED_JOIN_DIFF 256 /* Diff allowed to join trees */
+
+#define DATA_TMP_EXT ".TMD"
+#define OLD_EXT ".OLD"
+#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE
+
+struct st_file_buffer {
+ File file;
+ uchar *buffer,*pos,*end;
+ my_off_t pos_in_file;
+ int bits;
+ ulonglong bitbucket;
+};
+
+struct st_huff_tree;
+struct st_huff_element;
+
+typedef struct st_huff_counts {
+ uint field_length,max_zero_fill;
+ uint pack_type;
+ uint max_end_space,max_pre_space,length_bits,min_space;
+ ulong max_length;
+ enum en_fieldtype field_type;
+ struct st_huff_tree *tree; /* Tree for field */
+ my_off_t counts[256];
+ my_off_t end_space[8];
+ my_off_t pre_space[8];
+ my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
+ TREE int_tree; /* Tree for detecting distinct column values. */
+ uchar *tree_buff; /* Column values, 'field_length' each. */
+ uchar *tree_pos; /* Points to end of column values in 'tree_buff'. */
+} HUFF_COUNTS;
+
+typedef struct st_huff_element HUFF_ELEMENT;
+
+/*
+ WARNING: It is crucial for the optimizations in calc_packed_length()
+ that 'count' is the first element of 'HUFF_ELEMENT'.
+*/
+struct st_huff_element {
+ my_off_t count;
+ union un_element {
+ struct st_nod {
+ HUFF_ELEMENT *left,*right;
+ } nod;
+ struct st_leaf {
+ HUFF_ELEMENT *null;
+ uint element_nr; /* Number of element */
+ } leaf;
+ } a;
+};
+
+
+typedef struct st_huff_tree {
+ HUFF_ELEMENT *root,*element_buffer;
+ HUFF_COUNTS *counts;
+ uint tree_number;
+ uint elements;
+ my_off_t bytes_packed;
+ uint tree_pack_length;
+ uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
+ ulonglong *code;
+ uchar *code_len;
+} HUFF_TREE;
+
+
+typedef struct st_isam_mrg {
+ MARIA_HA **file,**current,**end;
+ uint free_file;
+ uint count;
+ uint min_pack_length; /* Theese is used by packed data */
+ uint max_pack_length;
+ uint ref_length;
+ uint max_blob_length;
+ my_off_t records;
+ /* true if at least one source file has at least one disabled index */
+ my_bool src_file_has_indexes_disabled;
+} PACK_MRG_INFO;
+
+
+extern int main(int argc,char * *argv);
+static void get_options(int *argc,char ***argv);
+static MARIA_HA *open_isam_file(char *name,int mode);
+static bool open_isam_files(PACK_MRG_INFO *mrg,char **names,uint count);
+static int compress(PACK_MRG_INFO *file,char *join_name);
+static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records);
+static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
+ uint trees,
+ HUFF_COUNTS *huff_counts,
+ uint fields);
+static int compare_tree(void* cmp_arg __attribute__((unused)),
+ const uchar *s,const uchar *t);
+static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
+static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
+ my_off_t records);
+static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
+ uint max_space_length,my_off_t *space_counts,
+ my_off_t tot_space_count,
+ enum en_fieldtype field_type);
+static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
+static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
+static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
+static int save_counts_in_queue(uchar *key,element_count count,
+ HUFF_TREE *tree);
+static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
+static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
+static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
+static void make_traverse_code_tree(HUFF_TREE *huff_tree,
+ HUFF_ELEMENT *element,uint size,
+ ulonglong code);
+static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
+ my_off_t tot_elements,my_off_t filelength);
+static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
+static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
+static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
+ HUFF_ELEMENT *element,
+ uint *offset);
+static uint max_bit(uint value);
+static int compress_isam_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
+static char *make_new_name(char *new_name,char *old_name);
+static char *make_old_name(char *new_name,char *old_name);
+static void init_file_buffer(File file,pbool read_buffer);
+static int flush_buffer(ulong neaded_length);
+static void end_file_buffer(void);
+static void write_bits(ulonglong value, uint bits);
+static void flush_bits(void);
+static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
+ ha_checksum crc);
+static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,my_off_t new_length,
+ ha_checksum crc);
+static int mrg_close(PACK_MRG_INFO *mrg);
+static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
+static void mrg_reset(PACK_MRG_INFO *mrg);
+#if !defined(DBUG_OFF)
+static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
+static int fakecmp(my_off_t **count1, my_off_t **count2);
+#endif
+
+
+static int error_on_write=0,test_only=0,verbose=0,silent=0,
+ write_loop=0,force_pack=0, isamchk_neaded=0;
+static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
+static my_bool backup, opt_wait;
+/*
+ tree_buff_length is somewhat arbitrary. The bigger it is the better
+ the chance to win in terms of compression factor. On the other hand,
+ this table becomes part of the compressed file header. And its length
+ is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
+*/
+static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
+static char tmp_dir[FN_REFLEN]={0},*join_table;
+static my_off_t intervall_length;
+static ha_checksum glob_crc;
+static struct st_file_buffer file_buffer;
+static QUEUE queue;
+static HUFF_COUNTS *global_count;
+static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+static const char *load_default_groups[]= { "mariapack",0 };
+
+ /* The main program */
+
+int main(int argc, char **argv)
+{
+ int error,ok;
+ PACK_MRG_INFO merge;
+ char **default_argv;
+ MY_INIT(argv[0]);
+
+ load_defaults("my",load_default_groups,&argc,&argv);
+ default_argv= argv;
+ get_options(&argc,&argv);
+ maria_init();
+
+ error=ok=isamchk_neaded=0;
+ if (join_table)
+ { /* Join files into one */
+ if (open_isam_files(&merge,argv,(uint) argc) ||
+ compress(&merge,join_table))
+ error=1;
+ }
+ else while (argc--)
+ {
+ MARIA_HA *isam_file;
+ if (!(isam_file=open_isam_file(*argv++,O_RDWR)))
+ error=1;
+ else
+ {
+ merge.file= &isam_file;
+ merge.current=0;
+ merge.free_file=0;
+ merge.count=1;
+ if (compress(&merge,0))
+ error=1;
+ else
+ ok=1;
+ }
+ }
+ if (ok && isamchk_neaded && !silent)
+ puts("Remember to run maria_chk -rq on compressed tables");
+ VOID(fflush(stdout));
+ VOID(fflush(stderr));
+ free_defaults(default_argv);
+ maria_end();
+ my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
+ exit(error ? 2 : 0);
+#ifndef _lint
+ return 0; /* No compiler warning */
+#endif
+}
+
+enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};
+
+static struct my_option my_long_options[] =
+{
+#ifdef __NETWARE__
+ {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+ {"backup", 'b', "Make a backup of the table as table_name.OLD.",
+ (uchar**) &backup, (uchar**) &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"character-sets-dir", OPT_CHARSETS_DIR_MP,
+ "Directory where character sets are.", (uchar**) &charsets_dir,
+ (uchar**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
+ 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+ {"force", 'f',
+ "Force packing of table even if it gets bigger or if tempfile exists.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"join", 'j',
+ "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
+ (uchar**) &join_table, (uchar**) &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
+ 0, 0, 0},
+ {"help", '?', "Display this help and exit.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"silent", 's', "Be more silent.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"tmpdir", 'T', "Use temporary directory to store temporary table.",
+ 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"test", 't', "Don't pack table, only test packing it.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"version", 'V', "Output version information and exit.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"wait", 'w', "Wait and retry if table is in use.", (uchar**) &opt_wait,
+ (uchar**) &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+ { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+#include <help_start.h>
+
+static void print_version(void)
+{
+ VOID(printf("%s Ver 1.0 for %s on %s\n",
+ my_progname, SYSTEM_TYPE, MACHINE_TYPE));
+ NETWARE_SET_SCREEN_MODE(1);
+}
+
+
+static void usage(void)
+{
+ print_version();
+ puts("Copyright (C) 2002 MySQL AB");
+ puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
+ puts("and you are welcome to modify and redistribute it under the GPL license\n");
+
+ puts("Pack a MARIA-table to take much less space.");
+ puts("Keys are not updated, you must run maria_chk -rq on the index (.MAI) file");
+ puts("afterwards to update the keys.");
+ puts("You should give the .MAI file as the filename argument.");
+ puts("To unpack a packed table, run maria_chk -u on the table");
+
+ VOID(printf("\nUsage: %s [OPTIONS] filename...\n", my_progname));
+ my_print_help(my_long_options);
+ print_defaults("my", load_default_groups);
+ my_print_variables(my_long_options);
+}
+
+#include <help_end.h>
+
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+ char *argument)
+{
+ uint length;
+
+ switch(optid) {
+#ifdef __NETWARE__
+ case OPT_AUTO_CLOSE:
+ setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
+ break;
+#endif
+ case 'f':
+ force_pack= 1;
+ tmpfile_createflag= O_RDWR | O_TRUNC;
+ break;
+ case 's':
+ write_loop= verbose= 0;
+ silent= 1;
+ break;
+ case 't':
+ test_only= 1;
+ /* Avoid to reset 'verbose' if it was already set > 1. */
+ if (! verbose)
+ verbose= 1;
+ break;
+ case 'T':
+ length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
+ if (length != dirname_length(tmp_dir))
+ {
+ tmp_dir[length]=FN_LIBCHAR;
+ tmp_dir[length+1]=0;
+ }
+ break;
+ case 'v':
+ verbose++; /* Allow for selecting the level of verbosity. */
+ silent= 0;
+ break;
+ case '#':
+ DBUG_PUSH(argument ? argument : "d:t:o,/tmp/maria_pack.trace");
+ break;
+ case 'V':
+ print_version();
+ exit(0);
+ case 'I':
+ case '?':
+ usage();
+ exit(0);
+ }
+ return 0;
+}
+
+ /* reads options */
+ /* Initiates DEBUG - but no debugging here ! */
+
+static void get_options(int *argc,char ***argv)
+{
+ int ho_error;
+
+ my_progname= argv[0][0];
+ if (isatty(fileno(stdout)))
+ write_loop=1;
+
+ if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
+ exit(ho_error);
+
+ if (!*argc)
+ {
+ usage();
+ exit(1);
+ }
+ if (join_table)
+ {
+ backup=0; /* Not needed */
+ tmp_dir[0]=0;
+ }
+ return;
+}
+
+
+static MARIA_HA *open_isam_file(char *name,int mode)
+{
+ MARIA_HA *isam_file;
+ MARIA_SHARE *share;
+ DBUG_ENTER("open_isam_file");
+
+ if (!(isam_file=maria_open(name,mode,
+ (opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
+ HA_OPEN_ABORT_IF_LOCKED))))
+ {
+ VOID(fprintf(stderr, "%s gave error %d on open\n", name, my_errno));
+ DBUG_RETURN(0);
+ }
+ share=isam_file->s;
+ if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
+ {
+ if (!force_pack)
+ {
+ VOID(fprintf(stderr, "%s is already compressed\n", name));
+ VOID(maria_close(isam_file));
+ DBUG_RETURN(0);
+ }
+ if (verbose)
+ puts("Recompressing already compressed table");
+ share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
+ }
+ if (! force_pack && share->state.state.records != 0 &&
+ (share->state.state.records <= 1 ||
+ share->state.state.data_file_length < 1024))
+ {
+ VOID(fprintf(stderr, "%s is too small to compress\n", name));
+ VOID(maria_close(isam_file));
+ DBUG_RETURN(0);
+ }
+ VOID(maria_lock_database(isam_file,F_WRLCK));
+ DBUG_RETURN(isam_file);
+}
+
+
+static bool open_isam_files(PACK_MRG_INFO *mrg,char **names,uint count)
+{
+ uint i,j;
+ mrg->count=0;
+ mrg->current=0;
+ mrg->file=(MARIA_HA**) my_malloc(sizeof(MARIA_HA*)*count,MYF(MY_FAE));
+ mrg->free_file=1;
+ mrg->src_file_has_indexes_disabled= 0;
+ for (i=0; i < count ; i++)
+ {
+ if (!(mrg->file[i]=open_isam_file(names[i],O_RDONLY)))
+ goto error;
+
+ mrg->src_file_has_indexes_disabled|=
+ ! maria_is_all_keys_active(mrg->file[i]->s->state.key_map,
+ mrg->file[i]->s->base.keys);
+ }
+ /* Check that files are identical */
+ for (j=0 ; j < count-1 ; j++)
+ {
+ MARIA_COLUMNDEF *m1,*m2,*end;
+ if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
+ mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
+ goto diff_file;
+ m1=mrg->file[j]->s->columndef;
+ end=m1+mrg->file[j]->s->base.fields;
+ m2=mrg->file[j+1]->s->columndef;
+ for ( ; m1 != end ; m1++,m2++)
+ {
+ if (m1->type != m2->type || m1->length != m2->length)
+ goto diff_file;
+ }
+ }
+ mrg->count=count;
+ return 0;
+
+ diff_file:
+ VOID(fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
+ my_progname, names[j], names[j+1]));
+ error:
+ while (i--)
+ maria_close(mrg->file[i]);
+ my_free((uchar*) mrg->file,MYF(0));
+ return 1;
+}
+
+
+static int compress(PACK_MRG_INFO *mrg,char *result_table)
+{
+ int error;
+ File new_file,join_isam_file;
+ MARIA_HA *isam_file;
+ MARIA_SHARE *share;
+ char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
+ uint i,header_length,fields,trees,used_trees;
+ my_off_t old_length,new_length,tot_elements;
+ HUFF_COUNTS *huff_counts;
+ HUFF_TREE *huff_trees;
+ DBUG_ENTER("compress");
+
+ isam_file=mrg->file[0]; /* Take this as an example */
+ share=isam_file->s;
+ new_file=join_isam_file= -1;
+ trees=fields=0;
+ huff_trees=0;
+ huff_counts=0;
+ maria_block_size= isam_file->s->block_size;
+
+ /* Create temporary or join file */
+ if (backup)
+ VOID(fn_format(org_name,isam_file->s->open_file_name,"",MARIA_NAME_DEXT,
+ 2));
+ else
+ VOID(fn_format(org_name,isam_file->s->open_file_name,"",MARIA_NAME_DEXT,
+ 2+4+16));
+
+ if (init_pagecache(maria_pagecache, MARIA_MIN_PAGE_CACHE_SIZE, 0, 0,
+ maria_block_size) == 0)
+ {
+ fprintf(stderr, "Can't initialize page cache\n");
+ goto err;
+ }
+
+ if (!test_only && result_table)
+ {
+ /* Make a new indexfile based on first file in list */
+ uint length;
+ char *buff;
+ strmov(org_name,result_table); /* Fix error messages */
+ VOID(fn_format(new_name,result_table,"",MARIA_NAME_IEXT,2));
+ if ((join_isam_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
+ < 0)
+ goto err;
+ length=(uint) share->base.keystart;
+ if (!(buff=my_malloc(length,MYF(MY_WME))))
+ goto err;
+ if (my_pread(share->kfile.file, buff, length, 0L, MYF(MY_WME | MY_NABP)) ||
+ my_write(join_isam_file,buff,length,
+ MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
+ {
+ my_free(buff,MYF(0));
+ goto err;
+ }
+ my_free(buff,MYF(0));
+ VOID(fn_format(new_name,result_table,"",MARIA_NAME_DEXT,2));
+ }
+ else if (!tmp_dir[0])
+ VOID(make_new_name(new_name,org_name));
+ else
+ VOID(fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4));
+ if (!test_only &&
+ (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
+ goto err;
+
+ /* Start calculating statistics */
+
+ mrg->records=0;
+ for (i=0 ; i < mrg->count ; i++)
+ mrg->records+=mrg->file[i]->s->state.state.records;
+
+ DBUG_PRINT("info", ("Compressing %s: (%lu records)",
+ result_table ? new_name : org_name,
+ (ulong) mrg->records));
+ if (write_loop || verbose)
+ {
+ VOID(printf("Compressing %s: (%lu records)\n",
+ result_table ? new_name : org_name, (ulong) mrg->records));
+ }
+ trees=fields=share->base.fields;
+ huff_counts=init_huff_count(isam_file,mrg->records);
+ QUICK_SAFEMALLOC;
+
+ /*
+ Read the whole data file(s) for statistics.
+ */
+ DBUG_PRINT("info", ("- Calculating statistics"));
+ if (write_loop || verbose)
+ VOID(printf("- Calculating statistics\n"));
+ if (get_statistic(mrg,huff_counts))
+ goto err;
+ NORMAL_SAFEMALLOC;
+ old_length=0;
+ for (i=0; i < mrg->count ; i++)
+ old_length+= (mrg->file[i]->s->state.state.data_file_length -
+ mrg->file[i]->s->state.state.empty);
+
+ /*
+ Create a global priority queue in preparation for making
+ temporary Huffman trees.
+ */
+ if (init_queue(&queue,256,0,0,compare_huff_elements,0))
+ goto err;
+
+ /*
+ Check each column if we should use pre-space-compress, end-space-
+ compress, empty-field-compress or zero-field-compress.
+ */
+ check_counts(huff_counts,fields,mrg->records);
+
+ /*
+ Build a Huffman tree for each column.
+ */
+ huff_trees=make_huff_trees(huff_counts,trees);
+
+ /*
+ If the packed lengths of combined columns is less then the sum of
+ the non-combined columns, then create common Huffman trees for them.
+ We do this only for uchar compressed columns, not for distinct values
+ compressed columns.
+ */
+ if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
+ goto err;
+
+ /*
+ Assign codes to all uchar or column values.
+ */
+ if (make_huff_decode_table(huff_trees,fields))
+ goto err;
+
+ /* Prepare a file buffer. */
+ init_file_buffer(new_file,0);
+
+ /*
+ Reserve space in the target file for the fixed compressed file header.
+ */
+ file_buffer.pos_in_file=HEAD_LENGTH;
+ if (! test_only)
+ VOID(my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0)));
+
+ /*
+ Write field infos: field type, pack type, length bits, tree number.
+ */
+ write_field_info(huff_counts,fields,used_trees);
+
+ /*
+ Write decode trees.
+ */
+ if (!(tot_elements=write_huff_tree(huff_trees,trees)))
+ goto err;
+
+ /*
+ Calculate the total length of the compression info header.
+ This includes the fixed compressed file header, the column compression
+ type descriptions, and the decode trees.
+ */
+ header_length=(uint) file_buffer.pos_in_file+
+ (uint) (file_buffer.pos-file_buffer.buffer);
+
+ /*
+ Compress the source file into the target file.
+ */
+ DBUG_PRINT("info", ("- Compressing file"));
+ if (write_loop || verbose)
+ VOID(printf("- Compressing file\n"));
+ error=compress_isam_file(mrg,huff_counts);
+ new_length=file_buffer.pos_in_file;
+ if (!error && !test_only)
+ {
+ char buff[MEMMAP_EXTRA_MARGIN]; /* End marginal for memmap */
+ bzero(buff,sizeof(buff));
+ error=my_write(file_buffer.file,buff,sizeof(buff),
+ MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
+ }
+
+ /*
+ Write the fixed compressed file header.
+ */
+ if (!error)
+ error=write_header(mrg,header_length,used_trees,tot_elements,
+ new_length);
+
+ /* Flush the file buffer. */
+ end_file_buffer();
+
+ /* Display statistics. */
+ DBUG_PRINT("info", ("Min record length: %6d Max length: %6d "
+ "Mean total length: %6ld",
+ mrg->min_pack_length, mrg->max_pack_length,
+ (ulong) (mrg->records ? (new_length/mrg->records) : 0)));
+ if (verbose && mrg->records)
+ VOID(printf("Min record length: %6d Max length: %6d "
+ "Mean total length: %6ld\n", mrg->min_pack_length,
+ mrg->max_pack_length, (ulong) (new_length/mrg->records)));
+
+ /* Close source and target file. */
+ if (!test_only)
+ {
+ error|=my_close(new_file,MYF(MY_WME));
+ if (!result_table)
+ {
+ error|=my_close(isam_file->dfile.file, MYF(MY_WME));
+ isam_file->dfile.file= -1; /* Tell maria_close file is closed */
+ isam_file->s->bitmap.file.file= -1;
+ }
+ }
+
+ /* Cleanup. */
+ free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
+ if (! test_only && ! error)
+ {
+ if (result_table)
+ {
+ error=save_state_mrg(join_isam_file,mrg,new_length,glob_crc);
+ }
+ else
+ {
+ if (backup)
+ {
+ if (my_rename(org_name,make_old_name(temp_name,
+ isam_file->s->open_file_name),
+ MYF(MY_WME)))
+ error=1;
+ else
+ {
+ if (tmp_dir[0])
+ error=my_copy(new_name,org_name,MYF(MY_WME));
+ else
+ error=my_rename(new_name,org_name,MYF(MY_WME));
+ if (!error)
+ {
+ VOID(my_copystat(temp_name,org_name,MYF(MY_COPYTIME)));
+ if (tmp_dir[0])
+ VOID(my_delete(new_name,MYF(MY_WME)));
+ }
+ }
+ }
+ else
+ {
+ if (tmp_dir[0])
+ {
+ error=my_copy(new_name,org_name,
+ MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
+ if (!error)
+ VOID(my_delete(new_name,MYF(MY_WME)));
+ }
+ else
+ error=my_redel(org_name,new_name,MYF(MY_WME | MY_COPYTIME));
+ }
+ if (! error)
+ error=save_state(isam_file,mrg,new_length,glob_crc);
+ }
+ }
+ error|=mrg_close(mrg);
+ if (join_isam_file >= 0)
+ error|=my_close(join_isam_file,MYF(MY_WME));
+ if (error)
+ {
+ VOID(fprintf(stderr, "Aborting: %s is not compressed\n", org_name));
+ VOID(my_delete(new_name,MYF(MY_WME)));
+ DBUG_RETURN(-1);
+ }
+ if (write_loop || verbose)
+ {
+ if (old_length)
+ VOID(printf("%.4g%% \n",
+ (((longlong) (old_length - new_length)) * 100.0 /
+ (longlong) old_length)));
+ else
+ puts("Empty file saved in compressed format");
+ }
+ DBUG_RETURN(0);
+
+ err:
+ end_pagecache(maria_pagecache, 1);
+ free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
+ if (new_file >= 0)
+ VOID(my_close(new_file,MYF(0)));
+ if (join_isam_file >= 0)
+ VOID(my_close(join_isam_file,MYF(0)));
+ mrg_close(mrg);
+ VOID(fprintf(stderr, "Aborted: %s is not compressed\n", org_name));
+ DBUG_RETURN(-1);
+}
+
+ /* Init a huff_count-struct for each field and init it */
+
+static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records)
+{
+ reg2 uint i;
+ reg1 HUFF_COUNTS *count;
+ if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
+ sizeof(HUFF_COUNTS),
+ MYF(MY_ZEROFILL | MY_WME))))
+ {
+ for (i=0 ; i < info->s->base.fields ; i++)
+ {
+ enum en_fieldtype type;
+ count[i].field_length=info->s->columndef[i].length;
+ type= count[i].field_type= (enum en_fieldtype) info->s->columndef[i].type;
+ if (type == FIELD_INTERVALL ||
+ type == FIELD_CONSTANT ||
+ type == FIELD_ZERO)
+ type = FIELD_NORMAL;
+ if (count[i].field_length <= 8 &&
+ (type == FIELD_NORMAL ||
+ type == FIELD_SKIP_ZERO))
+ count[i].max_zero_fill= count[i].field_length;
+ /*
+ For every column initialize a tree, which is used to detect distinct
+ column values. 'int_tree' works together with 'tree_buff' and
+ 'tree_pos'. It's keys are implemented by pointers into 'tree_buff'.
+ This is accomplished by '-1' as the element size.
+ */
+ init_tree(&count[i].int_tree,0,0,-1,(qsort_cmp2) compare_tree,0, NULL,
+ NULL);
+ if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
+ count[i].tree_pos=count[i].tree_buff =
+ my_malloc(count[i].field_length > 1 ? tree_buff_length : 2,
+ MYF(MY_WME));
+ }
+ }
+ return count;
+}
+
+
+ /* Free memory used by counts and trees */
+
+static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
+ HUFF_COUNTS *huff_counts,
+ uint fields)
+{
+ register uint i;
+
+ if (huff_trees)
+ {
+ for (i=0 ; i < trees ; i++)
+ {
+ if (huff_trees[i].element_buffer)
+ my_free((uchar*) huff_trees[i].element_buffer,MYF(0));
+ if (huff_trees[i].code)
+ my_free((uchar*) huff_trees[i].code,MYF(0));
+ }
+ my_free((uchar*) huff_trees,MYF(0));
+ }
+ if (huff_counts)
+ {
+ for (i=0 ; i < fields ; i++)
+ {
+ if (huff_counts[i].tree_buff)
+ {
+ my_free((uchar*) huff_counts[i].tree_buff,MYF(0));
+ delete_tree(&huff_counts[i].int_tree);
+ }
+ }
+ my_free((uchar*) huff_counts,MYF(0));
+ }
+ delete_queue(&queue); /* This is safe to free */
+ return;
+}
+
+ /* Read through old file and gather some statistics */
+
+static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
+{
+ int error;
+ uint length, null_bytes;
+ ulong reclength,max_blob_length;
+ uchar *record,*pos,*next_pos,*end_pos,*start_pos;
+ ha_rows record_count;
+ HUFF_COUNTS *count,*end_count;
+ TREE_ELEMENT *element;
+ ha_checksum(*calc_checksum) (struct st_maria_info *, const uchar *);
+ DBUG_ENTER("get_statistic");
+
+ reclength= mrg->file[0]->s->base.reclength;
+ null_bytes= mrg->file[0]->s->base.null_bytes;
+ record=(uchar*) my_alloca(reclength);
+ end_count=huff_counts+mrg->file[0]->s->base.fields;
+ record_count=0; glob_crc=0;
+ max_blob_length=0;
+
+ /* Check how to calculate checksum */
+ if (mrg->file[0]->s->data_file_type == STATIC_RECORD)
+ calc_checksum= _ma_static_checksum;
+ else
+ calc_checksum= _ma_checksum;
+
+ mrg_reset(mrg);
+ while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
+ {
+ ulong tot_blob_length=0;
+ if (! error)
+ {
+ /* glob_crc is a checksum over all bytes of all records. */
+ glob_crc+= (*calc_checksum)(mrg->file[0],record);
+
+ /* Count the incidence of values separately for every column. */
+ for (pos=record + null_bytes, count=huff_counts ;
+ count < end_count ;
+ count++,
+ pos=next_pos)
+ {
+ next_pos=end_pos=(start_pos=pos)+count->field_length;
+
+ /*
+ Put the whole column value in a tree if there is room for it.
+ 'int_tree' is used to quickly check for duplicate values.
+ 'tree_buff' collects as many distinct column values as
+ possible. If the field length is > 1, it is tree_buff_length,
+ else 2 bytes. Each value is 'field_length' bytes big. If there
+ are more distinct column values than fit into the buffer, we
+ give up with this tree. BLOBs and VARCHARs do not have a
+ tree_buff as it can only be used with fixed length columns.
+ For the special case of field length == 1, we handle only the
+ case that there is only one distinct value in the table(s).
+ Otherwise, we can have a maximum of 256 distinct values. This
+ is then handled by the normal Huffman tree build.
+
+ Another limit for collecting distinct column values is the
+ number of values itself. Since we would need to build a
+ Huffman tree for the values, we are limited by the 'IS_OFFSET'
+ constant. This constant expresses a bit which is used to
+ determine if a tree element holds a final value or an offset
+ to a child element. Hence, all values and offsets need to be
+ smaller than 'IS_OFFSET'. A tree element is implemented with
+ two integer values, one for the left branch and one for the
+ right branch. For the extreme case that the first element
+ points to the last element, the number of integers in the tree
+ must be less or equal to IS_OFFSET. So the number of elements
+ must be less or equal to IS_OFFSET / 2.
+
+ WARNING: At first, we insert a pointer into the record buffer
+ as the key for the tree. If we got a new distinct value, which
+ is really inserted into the tree, instead of being counted
+ only, we will copy the column value from the record buffer to
+ 'tree_buff' and adjust the key pointer of the tree accordingly.
+ */
+ if (count->tree_buff)
+ {
+ global_count=count;
+ if (!(element=tree_insert(&count->int_tree,pos, 0,
+ count->int_tree.custom_arg)) ||
+ (element->count == 1 &&
+ (count->tree_buff + tree_buff_length <
+ count->tree_pos + count->field_length)) ||
+ (count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
+ (count->field_length == 1 &&
+ count->int_tree.elements_in_tree > 1))
+ {
+ delete_tree(&count->int_tree);
+ my_free(count->tree_buff,MYF(0));
+ count->tree_buff=0;
+ }
+ else
+ {
+ /*
+ If tree_insert() succeeds, it either creates a new element
+ or increments the counter of an existing element.
+ */
+ if (element->count == 1)
+ {
+ /* Copy the new column value into 'tree_buff'. */
+ memcpy(count->tree_pos,pos,(size_t) count->field_length);
+ /* Adjust the key pointer in the tree. */
+ tree_set_pointer(element,count->tree_pos);
+ /* Point behind the last column value so far. */
+ count->tree_pos+=count->field_length;
+ }
+ }
+ }
+
+ /* Save character counters and space-counts and zero-field-counts */
+ if (count->field_type == FIELD_NORMAL ||
+ count->field_type == FIELD_SKIP_ENDSPACE)
+ {
+ /* Ignore trailing space. */
+ for ( ; end_pos > pos ; end_pos--)
+ if (end_pos[-1] != ' ')
+ break;
+ /* Empty fields are just counted. Go to the next record. */
+ if (end_pos == pos)
+ {
+ count->empty_fields++;
+ count->max_zero_fill=0;
+ continue;
+ }
+ /*
+ Count the total of all trailing spaces and the number of
+ short trailing spaces. Remember the longest trailing space.
+ */
+ length= (uint) (next_pos-end_pos);
+ count->tot_end_space+=length;
+ if (length < 8)
+ count->end_space[length]++;
+ if (count->max_end_space < length)
+ count->max_end_space = length;
+ }
+
+ if (count->field_type == FIELD_NORMAL ||
+ count->field_type == FIELD_SKIP_PRESPACE)
+ {
+ /* Ignore leading space. */
+ for (pos=start_pos; pos < end_pos ; pos++)
+ if (pos[0] != ' ')
+ break;
+ /* Empty fields are just counted. Go to the next record. */
+ if (end_pos == pos)
+ {
+ count->empty_fields++;
+ count->max_zero_fill=0;
+ continue;
+ }
+ /*
+ Count the total of all leading spaces and the number of
+ short leading spaces. Remember the longest leading space.
+ */
+ length= (uint) (pos-start_pos);
+ count->tot_pre_space+=length;
+ if (length < 8)
+ count->pre_space[length]++;
+ if (count->max_pre_space < length)
+ count->max_pre_space = length;
+ }
+
+ /* Calculate pos, end_pos, and max_length for variable length fields. */
+ if (count->field_type == FIELD_BLOB)
+ {
+ uint field_length=count->field_length -portable_sizeof_char_ptr;
+ ulong blob_length= _ma_calc_blob_length(field_length, start_pos);
+ memcpy_fixed((char*) &pos, start_pos+field_length,sizeof(char*));
+ end_pos=pos+blob_length;
+ tot_blob_length+=blob_length;
+ set_if_bigger(count->max_length,blob_length);
+ }
+ else if (count->field_type == FIELD_VARCHAR)
+ {
+ uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
+ length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
+ uint2korr(start_pos));
+ pos= start_pos+pack_length;
+ end_pos= pos+length;
+ set_if_bigger(count->max_length,length);
+ }
+
+ /* Evaluate 'max_zero_fill' for short fields. */
+ if (count->field_length <= 8 &&
+ (count->field_type == FIELD_NORMAL ||
+ count->field_type == FIELD_SKIP_ZERO))
+ {
+ uint i;
+ /* Zero fields are just counted. Go to the next record. */
+ if (!memcmp((uchar*) start_pos,zero_string,count->field_length))
+ {
+ count->zero_fields++;
+ continue;
+ }
+ /*
+ max_zero_fill starts with field_length. It is decreased every
+ time a shorter "zero trailer" is found. It is set to zero when
+ an empty field is found (see above). This suggests that the
+ variable should be called 'min_zero_fill'.
+ */
+ for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
+ i++) ;
+ if (i < count->max_zero_fill)
+ count->max_zero_fill=i;
+ }
+
+ /* Ignore zero fields and check fields. */
+ if (count->field_type == FIELD_ZERO ||
+ count->field_type == FIELD_CHECK)
+ continue;
+
+ /*
+ Count the incidence of every uchar value in the
+ significant field value.
+ */
+ for ( ; pos < end_pos ; pos++)
+ count->counts[(uchar) *pos]++;
+
+ /* Step to next field. */
+ }
+
+ if (tot_blob_length > max_blob_length)
+ max_blob_length=tot_blob_length;
+ record_count++;
+ if (write_loop && record_count % WRITE_COUNT == 0)
+ {
+ VOID(printf("%lu\r", (ulong) record_count));
+ VOID(fflush(stdout));
+ }
+ }
+ else if (error != HA_ERR_RECORD_DELETED)
+ {
+ VOID(fprintf(stderr, "Got error %d while reading rows", error));
+ break;
+ }
+
+ /* Step to next record. */
+ }
+ if (write_loop)
+ {
+ VOID(printf(" \r"));
+ VOID(fflush(stdout));
+ }
+
+ /*
+ If --debug=d,fakebigcodes is set, fake the counts to get big Huffman
+ codes.
+ */
+ DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count););
+
+ DBUG_PRINT("info", ("Found the following number of incidents "
+ "of the uchar codes:"));
+ if (verbose >= 2)
+ VOID(printf("Found the following number of incidents "
+ "of the uchar codes:\n"));
+ for (count= huff_counts ; count < end_count; count++)
+ {
+ uint idx;
+ my_off_t total_count;
+ char llbuf[32];
+
+ DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1)));
+ if (verbose >= 2)
+ VOID(printf("column: %3u\n", (uint) (count - huff_counts + 1)));
+ if (count->tree_buff)
+ {
+ DBUG_PRINT("info", ("number of distinct values: %u",
+ (uint) ((count->tree_pos - count->tree_buff) /
+ count->field_length)));
+ if (verbose >= 2)
+ VOID(printf("number of distinct values: %u\n",
+ (uint) ((count->tree_pos - count->tree_buff) /
+ count->field_length)));
+ }
+ total_count= 0;
+ for (idx= 0; idx < 256; idx++)
+ {
+ if (count->counts[idx])
+ {
+ total_count+= count->counts[idx];
+ DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx,
+ llstr((longlong) count->counts[idx], llbuf)));
+ if (verbose >= 2)
+ VOID(printf("counts[0x%02x]: %12s\n", idx,
+ llstr((longlong) count->counts[idx], llbuf)));
+ }
+ }
+ DBUG_PRINT("info", ("total: %12s", llstr((longlong) total_count,
+ llbuf)));
+ if ((verbose >= 2) && total_count)
+ {
+ VOID(printf("total: %12s\n",
+ llstr((longlong) total_count, llbuf)));
+ }
+ }
+
+ mrg->records=record_count;
+ mrg->max_blob_length=max_blob_length;
+ my_afree((uchar*) record);
+ DBUG_RETURN(error != HA_ERR_END_OF_FILE);
+}
+
+static int compare_huff_elements(void *not_used __attribute__((unused)),
+ uchar *a, uchar *b)
+{
+ return *((my_off_t*) a) < *((my_off_t*) b) ? -1 :
+ (*((my_off_t*) a) == *((my_off_t*) b) ? 0 : 1);
+}
+
+ /* Check each tree if we should use pre-space-compress, end-space-
+ compress, empty-field-compress or zero-field-compress */
+
+static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
+ my_off_t records)
+{
+ uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
+ my_off_t old_length,new_length,length;
+ DBUG_ENTER("check_counts");
+
+ bzero((uchar*) field_count,sizeof(field_count));
+ space_fields=fill_zero_fields=0;
+
+ for (; trees-- ; huff_counts++)
+ {
+ if (huff_counts->field_type == FIELD_BLOB)
+ {
+ huff_counts->length_bits=max_bit(huff_counts->max_length);
+ goto found_pack;
+ }
+ else if (huff_counts->field_type == FIELD_VARCHAR)
+ {
+ huff_counts->length_bits=max_bit(huff_counts->max_length);
+ goto found_pack;
+ }
+ else if (huff_counts->field_type == FIELD_CHECK)
+ {
+ huff_counts->bytes_packed=0;
+ huff_counts->counts[0]=0;
+ goto found_pack;
+ }
+
+ huff_counts->field_type=FIELD_NORMAL;
+ huff_counts->pack_type=0;
+
+ /* Check for zero-filled records (in this column), or zero records. */
+ if (huff_counts->zero_fields || ! records)
+ {
+ my_off_t old_space_count;
+ /*
+ If there are only zero filled records (in this column),
+ or no records at all, we are done.
+ */
+ if (huff_counts->zero_fields == records)
+ {
+ huff_counts->field_type= FIELD_ZERO;
+ huff_counts->bytes_packed=0;
+ huff_counts->counts[0]=0;
+ goto found_pack;
+ }
+ /* Remeber the number of significant spaces. */
+ old_space_count=huff_counts->counts[' '];
+ /* Add all leading and trailing spaces. */
+ huff_counts->counts[' ']+= (huff_counts->tot_end_space +
+ huff_counts->tot_pre_space +
+ huff_counts->empty_fields *
+ huff_counts->field_length);
+ /* Check, what the compressed length of this would be. */
+ old_length=calc_packed_length(huff_counts,0)+records/8;
+ /* Get the number of zero bytes. */
+ length=huff_counts->zero_fields*huff_counts->field_length;
+ /* Add it to the counts. */
+ huff_counts->counts[0]+=length;
+ /* Check, what the compressed length of this would be. */
+ new_length=calc_packed_length(huff_counts,0);
+ /* If the compression without the zeroes would be shorter, we are done. */
+ if (old_length < new_length && huff_counts->field_length > 1)
+ {
+ huff_counts->field_type=FIELD_SKIP_ZERO;
+ huff_counts->counts[0]-=length;
+ huff_counts->bytes_packed=old_length- records/8;
+ goto found_pack;
+ }
+ /* Remove the insignificant spaces, but keep the zeroes. */
+ huff_counts->counts[' ']=old_space_count;
+ }
+ /* Check, what the compressed length of this column would be. */
+ huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
+
+ /*
+ If there are enough empty records (in this column),
+ treating them specially may pay off.
+ */
+ if (huff_counts->empty_fields)
+ {
+ if (huff_counts->field_length > 2 &&
+ huff_counts->empty_fields + (records - huff_counts->empty_fields)*
+ (1+max_bit(max(huff_counts->max_pre_space,
+ huff_counts->max_end_space))) <
+ records * max_bit(huff_counts->field_length))
+ {
+ huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
+ }
+ else
+ {
+ length=huff_counts->empty_fields*huff_counts->field_length;
+ if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
+ {
+ huff_counts->tot_end_space+=length;
+ huff_counts->max_end_space=huff_counts->field_length;
+ if (huff_counts->field_length < 8)
+ huff_counts->end_space[huff_counts->field_length]+=
+ huff_counts->empty_fields;
+ }
+ if (huff_counts->tot_pre_space)
+ {
+ huff_counts->tot_pre_space+=length;
+ huff_counts->max_pre_space=huff_counts->field_length;
+ if (huff_counts->field_length < 8)
+ huff_counts->pre_space[huff_counts->field_length]+=
+ huff_counts->empty_fields;
+ }
+ }
+ }
+
+ /*
+ If there are enough trailing spaces (in this column),
+ treating them specially may pay off.
+ */
+ if (huff_counts->tot_end_space)
+ {
+ huff_counts->counts[' ']+=huff_counts->tot_pre_space;
+ if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
+ huff_counts->end_space,
+ huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
+ goto found_pack;
+ huff_counts->counts[' ']-=huff_counts->tot_pre_space;
+ }
+
+ /*
+ If there are enough leading spaces (in this column),
+ treating them specially may pay off.
+ */
+ if (huff_counts->tot_pre_space)
+ {
+ if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
+ huff_counts->pre_space,
+ huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
+ goto found_pack;
+ }
+
+ found_pack: /* Found field-packing */
+
+ /* Test if we can use zero-fill */
+
+ if (huff_counts->max_zero_fill &&
+ (huff_counts->field_type == FIELD_NORMAL ||
+ huff_counts->field_type == FIELD_SKIP_ZERO))
+ {
+ huff_counts->counts[0]-=huff_counts->max_zero_fill*
+ (huff_counts->field_type == FIELD_SKIP_ZERO ?
+ records - huff_counts->zero_fields : records);
+ huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
+ huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
+ }
+
+ /* Test if intervall-field is better */
+
+ if (huff_counts->tree_buff)
+ {
+ HUFF_TREE tree;
+
+ DBUG_EXECUTE_IF("forceintervall",
+ huff_counts->bytes_packed= ~ (my_off_t) 0;);
+ tree.element_buffer=0;
+ if (!make_huff_tree(&tree,huff_counts) &&
+ tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
+ {
+ if (tree.elements == 1)
+ huff_counts->field_type=FIELD_CONSTANT;
+ else
+ huff_counts->field_type=FIELD_INTERVALL;
+ huff_counts->pack_type=0;
+ }
+ else
+ {
+ my_free((uchar*) huff_counts->tree_buff,MYF(0));
+ delete_tree(&huff_counts->int_tree);
+ huff_counts->tree_buff=0;
+ }
+ if (tree.element_buffer)
+ my_free((uchar*) tree.element_buffer,MYF(0));
+ }
+ if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
+ space_fields++;
+ if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
+ fill_zero_fields++;
+ field_count[huff_counts->field_type]++;
+ }
+ DBUG_PRINT("info", ("normal: %3d empty-space: %3d "
+ "empty-zero: %3d empty-fill: %3d",
+ field_count[FIELD_NORMAL],space_fields,
+ field_count[FIELD_SKIP_ZERO],fill_zero_fields));
+ DBUG_PRINT("info", ("pre-space: %3d end-space: %3d "
+ "intervall-fields: %3d zero: %3d",
+ field_count[FIELD_SKIP_PRESPACE],
+ field_count[FIELD_SKIP_ENDSPACE],
+ field_count[FIELD_INTERVALL],
+ field_count[FIELD_ZERO]));
+ if (verbose)
+ VOID(printf("\nnormal: %3d empty-space: %3d "
+ "empty-zero: %3d empty-fill: %3d\n"
+ "pre-space: %3d end-space: %3d "
+ "intervall-fields: %3d zero: %3d\n",
+ field_count[FIELD_NORMAL],space_fields,
+ field_count[FIELD_SKIP_ZERO],fill_zero_fields,
+ field_count[FIELD_SKIP_PRESPACE],
+ field_count[FIELD_SKIP_ENDSPACE],
+ field_count[FIELD_INTERVALL],
+ field_count[FIELD_ZERO]));
+ DBUG_VOID_RETURN;
+}
+
+
+/* Test if we can use space-compression and empty-field-compression */
+
+static int
+test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
+ uint max_space_length, my_off_t *space_counts,
+ my_off_t tot_space_count, enum en_fieldtype field_type)
+{
+ int min_pos;
+ uint length_bits,i;
+ my_off_t space_count,min_space_count,min_pack,new_length,skip;
+
+ length_bits=max_bit(max_space_length);
+
+ /* Default no end_space-packing */
+ space_count=huff_counts->counts[(uint) ' '];
+ min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
+ min_pack=calc_packed_length(huff_counts,0);
+ min_pos= -2;
+ huff_counts->counts[(uint) ' ']=space_count;
+
+ /* Test with allways space-count */
+ new_length=huff_counts->bytes_packed+length_bits*records/8;
+ if (new_length+1 < min_pack)
+ {
+ min_pos= -1;
+ min_pack=new_length;
+ min_space_count=space_count;
+ }
+ /* Test with length-flag */
+ for (skip=0L, i=0 ; i < 8 ; i++)
+ {
+ if (space_counts[i])
+ {
+ if (i)
+ huff_counts->counts[(uint) ' ']+=space_counts[i];
+ skip+=huff_counts->pre_space[i];
+ new_length=calc_packed_length(huff_counts,0)+
+ (records+(records-skip)*(1+length_bits))/8;
+ if (new_length < min_pack)
+ {
+ min_pos=(int) i;
+ min_pack=new_length;
+ min_space_count=huff_counts->counts[(uint) ' '];
+ }
+ }
+ }
+
+ huff_counts->counts[(uint) ' ']=min_space_count;
+ huff_counts->bytes_packed=min_pack;
+ switch (min_pos) {
+ case -2:
+ return(0); /* No space-compress */
+ case -1: /* Always space-count */
+ huff_counts->field_type=field_type;
+ huff_counts->min_space=0;
+ huff_counts->length_bits=max_bit(max_space_length);
+ break;
+ default:
+ huff_counts->field_type=field_type;
+ huff_counts->min_space=(uint) min_pos;
+ huff_counts->pack_type|=PACK_TYPE_SELECTED;
+ huff_counts->length_bits=max_bit(max_space_length);
+ break;
+ }
+ return(1); /* Using space-compress */
+}
+
+
+ /* Make a huff_tree of each huff_count */
+
+static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
+{
+ uint tree;
+ HUFF_TREE *huff_tree;
+ DBUG_ENTER("make_huff_trees");
+
+ if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
+ MYF(MY_WME | MY_ZEROFILL))))
+ DBUG_RETURN(0);
+
+ for (tree=0 ; tree < trees ; tree++)
+ {
+ if (make_huff_tree(huff_tree+tree,huff_counts+tree))
+ {
+ while (tree--)
+ my_free((uchar*) huff_tree[tree].element_buffer,MYF(0));
+ my_free((uchar*) huff_tree,MYF(0));
+ DBUG_RETURN(0);
+ }
+ }
+ DBUG_RETURN(huff_tree);
+}
+
+/*
+ Build a Huffman tree.
+
+ SYNOPSIS
+ make_huff_tree()
+ huff_tree The Huffman tree.
+ huff_counts The counts.
+
+ DESCRIPTION
+ Build a Huffman tree according to huff_counts->counts or
+ huff_counts->tree_buff. tree_buff, if non-NULL contains up to
+ tree_buff_length of distinct column values. In that case, whole
+ values can be Huffman encoded instead of single bytes.
+
+ RETURN
+ 0 OK
+ != 0 Error
+*/
+
+static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
+{
+ uint i,found,bits_packed,first,last;
+ my_off_t bytes_packed;
+ HUFF_ELEMENT *a,*b,*new_huff_el;
+
+ first=last=0;
+ if (huff_counts->tree_buff)
+ {
+ /* Calculate the number of distinct values in tree_buff. */
+ found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
+ huff_counts->field_length;
+ first=0; last=found-1;
+ }
+ else
+ {
+ /* Count the number of uchar codes found in the column. */
+ for (i=found=0 ; i < 256 ; i++)
+ {
+ if (huff_counts->counts[i])
+ {
+ if (! found++)
+ first=i;
+ last=i;
+ }
+ }
+ if (found < 2)
+ found=2;
+ }
+
+ /* When using 'tree_buff' we can have more that 256 values. */
+ if (queue.max_elements < found)
+ {
+ delete_queue(&queue);
+ if (init_queue(&queue,found,0,0,compare_huff_elements,0))
+ return -1;
+ }
+
+ /* Allocate or reallocate an element buffer for the Huffman tree. */
+ if (!huff_tree->element_buffer)
+ {
+ if (!(huff_tree->element_buffer=
+ (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
+ return 1;
+ }
+ else
+ {
+ HUFF_ELEMENT *temp;
+ if (!(temp=
+ (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
+ found*2*sizeof(HUFF_ELEMENT),
+ MYF(MY_WME))))
+ return 1;
+ huff_tree->element_buffer=temp;
+ }
+
+ huff_counts->tree=huff_tree;
+ huff_tree->counts=huff_counts;
+ huff_tree->min_chr=first;
+ huff_tree->max_chr=last;
+ huff_tree->char_bits=max_bit(last-first);
+ huff_tree->offset_bits=max_bit(found-1)+1;
+
+ if (huff_counts->tree_buff)
+ {
+ huff_tree->elements=0;
+ huff_tree->tree_pack_length=(1+15+16+5+5+
+ (huff_tree->char_bits+1)*found+
+ (huff_tree->offset_bits+1)*
+ (found-2)+7)/8 +
+ (uint) (huff_tree->counts->tree_pos-
+ huff_tree->counts->tree_buff);
+ /*
+ Put a HUFF_ELEMENT into the queue for every distinct column value.
+
+ tree_walk() calls save_counts_in_queue() for every element in
+ 'int_tree'. This takes elements from the target trees element
+ buffer and places references to them into the buffer of the
+ priority queue. We insert in column value order, but the order is
+ in fact irrelevant here. We will establish the correct order
+ later.
+ */
+ tree_walk(&huff_counts->int_tree,
+ (int (*)(void*, element_count,void*)) save_counts_in_queue,
+ (uchar*) huff_tree, left_root_right);
+ }
+ else
+ {
+ huff_tree->elements=found;
+ huff_tree->tree_pack_length=(9+9+5+5+
+ (huff_tree->char_bits+1)*found+
+ (huff_tree->offset_bits+1)*
+ (found-2)+7)/8;
+ /*
+ Put a HUFF_ELEMENT into the queue for every uchar code found in the column.
+
+ The elements are taken from the target trees element buffer.
+ Instead of using queue_insert(), we just place references to the
+ elements into the buffer of the priority queue. We insert in byte
+ value order, but the order is in fact irrelevant here. We will
+ establish the correct order later.
+ */
+ for (i=first, found=0 ; i <= last ; i++)
+ {
+ if (huff_counts->counts[i])
+ {
+ new_huff_el=huff_tree->element_buffer+(found++);
+ new_huff_el->count=huff_counts->counts[i];
+ new_huff_el->a.leaf.null=0;
+ new_huff_el->a.leaf.element_nr=i;
+ queue.root[found]=(uchar*) new_huff_el;
+ }
+ }
+ /*
+ If there is only a single uchar value in this field in all records,
+ add a second element with zero incidence. This is required to enter
+ the loop, which builds the Huffman tree.
+ */
+ while (found < 2)
+ {
+ new_huff_el=huff_tree->element_buffer+(found++);
+ new_huff_el->count=0;
+ new_huff_el->a.leaf.null=0;
+ if (last)
+ new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
+ else
+ new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
+ queue.root[found]=(uchar*) new_huff_el;
+ }
+ }
+
+ /* Make a queue from the queue buffer. */
+ queue.elements=found;
+
+ /*
+ Make a priority queue from the queue. Construct its index so that we
+ have a partially ordered tree.
+ */
+ for (i=found/2 ; i > 0 ; i--)
+ _downheap(&queue,i);
+
+ /* The Huffman algorithm. */
+ bytes_packed=0; bits_packed=0;
+ for (i=1 ; i < found ; i++)
+ {
+ /*
+ Pop the top element from the queue (the one with the least incidence).
+ Popping from a priority queue includes a re-ordering of the queue,
+ to get the next least incidence element to the top.
+ */
+ a=(HUFF_ELEMENT*) queue_remove(&queue,0);
+ /*
+ Copy the next least incidence element. The queue implementation
+ reserves root[0] for temporary purposes. root[1] is the top.
+ */
+ b=(HUFF_ELEMENT*) queue.root[1];
+ /* Get a new element from the element buffer. */
+ new_huff_el=huff_tree->element_buffer+found+i;
+ /* The new element gets the sum of the two least incidence elements. */
+ new_huff_el->count=a->count+b->count;
+ /*
+ The Huffman algorithm assigns another bit to the code for a byte
+ every time that bytes incidence is combined (directly or indirectly)
+ to a new element as one of the two least incidence elements.
+ This means that one more bit per incidence of that uchar is required
+ in the resulting file. So we add the new combined incidence as the
+ number of bits by which the result grows.
+ */
+ bits_packed+=(uint) (new_huff_el->count & 7);
+ bytes_packed+=new_huff_el->count/8;
+ /* The new element points to its children, lesser in left. */
+ new_huff_el->a.nod.left=a;
+ new_huff_el->a.nod.right=b;
+ /*
+ Replace the copied top element by the new element and re-order the
+ queue.
+ */
+ queue.root[1]=(uchar*) new_huff_el;
+ queue_replaced(&queue);
+ }
+ huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
+ huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
+ return 0;
+}
+
+static int compare_tree(void* cmp_arg __attribute__((unused)),
+ register const uchar *s, register const uchar *t)
+{
+ uint length;
+ for (length=global_count->field_length; length-- ;)
+ if (*s++ != *t++)
+ return (int) s[-1] - (int) t[-1];
+ return 0;
+}
+
+/*
+ Organize distinct column values and their incidences into a priority queue.
+
+ SYNOPSIS
+ save_counts_in_queue()
+ key The column value.
+ count The incidence of this value.
+ tree The Huffman tree to be built later.
+
+ DESCRIPTION
+ We use the element buffer of the targeted tree. The distinct column
+ values are organized in a priority queue first. The Huffman
+ algorithm will later organize the elements into a Huffman tree. For
+ the time being, we just place references to the elements into the
+ queue buffer. The buffer will later be organized into a priority
+ queue.
+
+ RETURN
+ 0
+ */
+
+static int save_counts_in_queue(uchar *key, element_count count,
+ HUFF_TREE *tree)
+{
+ HUFF_ELEMENT *new_huff_el;
+
+ new_huff_el=tree->element_buffer+(tree->elements++);
+ new_huff_el->count=count;
+ new_huff_el->a.leaf.null=0;
+ new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) /
+ tree->counts->field_length;
+ queue.root[tree->elements]=(uchar*) new_huff_el;
+ return 0;
+}
+
+
+/*
+ Calculate length of file if given counts should be used.
+
+ SYNOPSIS
+ calc_packed_length()
+ huff_counts The counts for a column of the table(s).
+ add_tree_lenght If the decode tree length should be added.
+
+ DESCRIPTION
+ We need to follow the Huffman algorithm until we know, how many bits
+ are required for each uchar code. But we do not need the resulting
+ Huffman tree. Hence, we can leave out some steps which are essential
+ in make_huff_tree().
+
+ RETURN
+ Number of bytes required to compress this table column.
+*/
+
+static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
+ uint add_tree_lenght)
+{
+ uint i,found,bits_packed,first,last;
+ my_off_t bytes_packed;
+ HUFF_ELEMENT element_buffer[256];
+ DBUG_ENTER("calc_packed_length");
+
+ /*
+ WARNING: We use a small hack for efficiency: Instead of placing
+ references to HUFF_ELEMENTs into the queue, we just insert
+ references to the counts of the uchar codes which appeared in this
+ table column. During the Huffman algorithm they are successively
+ replaced by references to HUFF_ELEMENTs. This works, because
+ HUFF_ELEMENTs have the incidence count at their beginning.
+ Regardless, wether the queue array contains references to counts of
+ type my_off_t or references to HUFF_ELEMENTs which have the count of
+ type my_off_t at their beginning, it always points to a count of the
+ same type.
+
+ Instead of using queue_insert(), we just copy the references into
+ the buffer of the priority queue. We insert in uchar value order, but
+ the order is in fact irrelevant here. We will establish the correct
+ order later.
+ */
+ first=last=0;
+ for (i=found=0 ; i < 256 ; i++)
+ {
+ if (huff_counts->counts[i])
+ {
+ if (! found++)
+ first=i;
+ last=i;
+ /* We start with root[1], which is the queues top element. */
+ queue.root[found]=(uchar*) &huff_counts->counts[i];
+ }
+ }
+ if (!found)
+ DBUG_RETURN(0); /* Empty tree */
+ /*
+ If there is only a single uchar value in this field in all records,
+ add a second element with zero incidence. This is required to enter
+ the loop, which follows the Huffman algorithm.
+ */
+ if (found < 2)
+ queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
+
+ /* Make a queue from the queue buffer. */
+ queue.elements=found;
+
+ bytes_packed=0; bits_packed=0;
+ /* Add the length of the coding table, which would become part of the file. */
+ if (add_tree_lenght)
+ bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
+ (max_bit(found-1)+1+1)*(found-2) +7)/8;
+
+ /*
+ Make a priority queue from the queue. Construct its index so that we
+ have a partially ordered tree.
+ */
+ for (i=(found+1)/2 ; i > 0 ; i--)
+ _downheap(&queue,i);
+
+ /* The Huffman algorithm. */
+ for (i=0 ; i < found-1 ; i++)
+ {
+ my_off_t *a;
+ my_off_t *b;
+ HUFF_ELEMENT *new_huff_el;
+
+ /*
+ Pop the top element from the queue (the one with the least
+ incidence). Popping from a priority queue includes a re-ordering
+ of the queue, to get the next least incidence element to the top.
+ */
+ a= (my_off_t*) queue_remove(&queue, 0);
+ /*
+ Copy the next least incidence element. The queue implementation
+ reserves root[0] for temporary purposes. root[1] is the top.
+ */
+ b= (my_off_t*) queue.root[1];
+ /* Create a new element in a local (automatic) buffer. */
+ new_huff_el= element_buffer + i;
+ /* The new element gets the sum of the two least incidence elements. */
+ new_huff_el->count= *a + *b;
+ /*
+ The Huffman algorithm assigns another bit to the code for a byte
+ every time that bytes incidence is combined (directly or indirectly)
+ to a new element as one of the two least incidence elements.
+ This means that one more bit per incidence of that uchar is required
+ in the resulting file. So we add the new combined incidence as the
+ number of bits by which the result grows.
+ */
+ bits_packed+=(uint) (new_huff_el->count & 7);
+ bytes_packed+=new_huff_el->count/8;
+ /*
+ Replace the copied top element by the new element and re-order the
+ queue. This successively replaces the references to counts by
+ references to HUFF_ELEMENTs.
+ */
+ queue.root[1]=(uchar*) new_huff_el;
+ queue_replaced(&queue);
+ }
+ DBUG_RETURN(bytes_packed+(bits_packed+7)/8);
+}
+
+
+ /* Remove trees that don't give any compression */
+
+static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
+{
+ uint k,tree_number;
+ HUFF_COUNTS count,*i,*j,*last_count;
+
+ last_count=huff_counts+trees;
+ for (tree_number=0, i=huff_counts ; i < last_count ; i++)
+ {
+ if (!i->tree->tree_number)
+ {
+ i->tree->tree_number= ++tree_number;
+ if (i->tree_buff)
+ continue; /* Don't join intervall */
+ for (j=i+1 ; j < last_count ; j++)
+ {
+ if (! j->tree->tree_number && ! j->tree_buff)
+ {
+ for (k=0 ; k < 256 ; k++)
+ count.counts[k]=i->counts[k]+j->counts[k];
+ if (calc_packed_length(&count,1) <=
+ i->tree->bytes_packed + j->tree->bytes_packed+
+ i->tree->tree_pack_length+j->tree->tree_pack_length+
+ ALLOWED_JOIN_DIFF)
+ {
+ memcpy_fixed((uchar*) i->counts,(uchar*) count.counts,
+ sizeof(count.counts[0])*256);
+ my_free((uchar*) j->tree->element_buffer,MYF(0));
+ j->tree->element_buffer=0;
+ j->tree=i->tree;
+ bmove((uchar*) i->counts,(uchar*) count.counts,
+ sizeof(count.counts[0])*256);
+ if (make_huff_tree(i->tree,i))
+ return (uint) -1;
+ }
+ }
+ }
+ }
+ }
+ DBUG_PRINT("info", ("Original trees: %d After join: %d",
+ trees, tree_number));
+ if (verbose)
+ VOID(printf("Original trees: %d After join: %d\n", trees, tree_number));
+ return tree_number; /* Return trees left */
+}
+
+
+/*
+ Fill in huff_tree encode tables.
+
+ SYNOPSIS
+ make_huff_decode_table()
+ huff_tree An array of HUFF_TREE which are to be encoded.
+ trees The number of HUFF_TREE in the array.
+
+ RETURN
+ 0 success
+ != 0 error
+*/
+
+static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
+{
+ uint elements;
+ for ( ; trees-- ; huff_tree++)
+ {
+ if (huff_tree->tree_number > 0)
+ {
+ elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
+ if (!(huff_tree->code =
+ (ulonglong*) my_malloc(elements*
+ (sizeof(ulonglong) + sizeof(uchar)),
+ MYF(MY_WME | MY_ZEROFILL))))
+ return 1;
+ huff_tree->code_len=(uchar*) (huff_tree->code+elements);
+ make_traverse_code_tree(huff_tree, huff_tree->root,
+ 8 * sizeof(ulonglong), LL(0));
+ }
+ }
+ return 0;
+}
+
+
+static void make_traverse_code_tree(HUFF_TREE *huff_tree,
+ HUFF_ELEMENT *element,
+ uint size, ulonglong code)
+{
+ uint chr;
+ if (!element->a.leaf.null)
+ {
+ chr=element->a.leaf.element_nr;
+ huff_tree->code_len[chr]= (uchar) (8 * sizeof(ulonglong) - size);
+ huff_tree->code[chr]= (code >> size);
+ if (huff_tree->height < 8 * sizeof(ulonglong) - size)
+ huff_tree->height= 8 * sizeof(ulonglong) - size;
+ }
+ else
+ {
+ size--;
+ make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
+ make_traverse_code_tree(huff_tree, element->a.nod.right, size,
+ code + (((ulonglong) 1) << size));
+ }
+ return;
+}
+
+
+/*
+ Convert a value into binary digits.
+
+ SYNOPSIS
+ bindigits()
+ value The value.
+ length The number of low order bits to convert.
+
+ NOTE
+ The result string is in static storage. It is reused on every call.
+ So you cannot use it twice in one expression.
+
+ RETURN
+ A pointer to a static NUL-terminated string.
+ */
+
+static char *bindigits(ulonglong value, uint bits)
+{
+ static char digits[72];
+ char *ptr= digits;
+ uint idx= bits;
+
+ DBUG_ASSERT(idx < sizeof(digits));
+ while (idx)
+ *(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1);
+ *ptr= '\0';
+ return digits;
+}
+
+
+/*
+ Convert a value into hexadecimal digits.
+
+ SYNOPSIS
+ hexdigits()
+ value The value.
+
+ NOTE
+ The result string is in static storage. It is reused on every call.
+ So you cannot use it twice in one expression.
+
+ RETURN
+ A pointer to a static NUL-terminated string.
+ */
+
+static char *hexdigits(ulonglong value)
+{
+ static char digits[20];
+ char *ptr= digits;
+ uint idx= 2 * sizeof(value); /* Two hex digits per byte. */
+
+ DBUG_ASSERT(idx < sizeof(digits));
+ while (idx)
+ {
+ if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9')
+ *(ptr - 1)+= 'a' - '9' - 1;
+ }
+ *ptr= '\0';
+ return digits;
+}
+
+
+ /* Write header to new packed data file */
+
+static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
+ my_off_t tot_elements,my_off_t filelength)
+{
+ uchar *buff= (uchar*) file_buffer.pos;
+
+ bzero(buff,HEAD_LENGTH);
+ memcpy_fixed(buff,maria_pack_file_magic,4);
+ int4store(buff+4,head_length);
+ int4store(buff+8, mrg->min_pack_length);
+ int4store(buff+12,mrg->max_pack_length);
+ int4store(buff+16,tot_elements);
+ int4store(buff+20,intervall_length);
+ int2store(buff+24,trees);
+ buff[26]=(char) mrg->ref_length;
+ /* Save record pointer length */
+ buff[27]= (uchar) maria_get_pointer_length((ulonglong) filelength,2);
+ if (test_only)
+ return 0;
+ VOID(my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0)));
+ return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
+ MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
+}
+
+ /* Write fieldinfo to new packed file */
+
+static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
+{
+ reg1 uint i;
+ uint huff_tree_bits;
+ huff_tree_bits=max_bit(trees ? trees-1 : 0);
+
+ DBUG_PRINT("info", (" "));
+ DBUG_PRINT("info", ("column types:"));
+ DBUG_PRINT("info", ("FIELD_NORMAL 0"));
+ DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE 1"));
+ DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE 2"));
+ DBUG_PRINT("info", ("FIELD_SKIP_ZERO 3"));
+ DBUG_PRINT("info", ("FIELD_BLOB 4"));
+ DBUG_PRINT("info", ("FIELD_CONSTANT 5"));
+ DBUG_PRINT("info", ("FIELD_INTERVALL 6"));
+ DBUG_PRINT("info", ("FIELD_ZERO 7"));
+ DBUG_PRINT("info", ("FIELD_VARCHAR 8"));
+ DBUG_PRINT("info", ("FIELD_CHECK 9"));
+ DBUG_PRINT("info", (" "));
+ DBUG_PRINT("info", ("pack type as a set of flags:"));
+ DBUG_PRINT("info", ("PACK_TYPE_SELECTED 1"));
+ DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS 2"));
+ DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL 4"));
+ DBUG_PRINT("info", (" "));
+ if (verbose >= 2)
+ {
+ VOID(printf("\n"));
+ VOID(printf("column types:\n"));
+ VOID(printf("FIELD_NORMAL 0\n"));
+ VOID(printf("FIELD_SKIP_ENDSPACE 1\n"));
+ VOID(printf("FIELD_SKIP_PRESPACE 2\n"));
+ VOID(printf("FIELD_SKIP_ZERO 3\n"));
+ VOID(printf("FIELD_BLOB 4\n"));
+ VOID(printf("FIELD_CONSTANT 5\n"));
+ VOID(printf("FIELD_INTERVALL 6\n"));
+ VOID(printf("FIELD_ZERO 7\n"));
+ VOID(printf("FIELD_VARCHAR 8\n"));
+ VOID(printf("FIELD_CHECK 9\n"));
+ VOID(printf("\n"));
+ VOID(printf("pack type as a set of flags:\n"));
+ VOID(printf("PACK_TYPE_SELECTED 1\n"));
+ VOID(printf("PACK_TYPE_SPACE_FIELDS 2\n"));
+ VOID(printf("PACK_TYPE_ZERO_FILL 4\n"));
+ VOID(printf("\n"));
+ }
+ for (i=0 ; i++ < fields ; counts++)
+ {
+ write_bits((ulonglong) (int) counts->field_type, 5);
+ write_bits(counts->pack_type,6);
+ if (counts->pack_type & PACK_TYPE_ZERO_FILL)
+ write_bits(counts->max_zero_fill,5);
+ else
+ write_bits(counts->length_bits,5);
+ write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits);
+ DBUG_PRINT("info", ("column: %3u type: %2u pack: %2u zero: %4u "
+ "lbits: %2u tree: %2u length: %4u",
+ i , counts->field_type, counts->pack_type,
+ counts->max_zero_fill, counts->length_bits,
+ counts->tree->tree_number, counts->field_length));
+ if (verbose >= 2)
+ VOID(printf("column: %3u type: %2u pack: %2u zero: %4u lbits: %2u "
+ "tree: %2u length: %4u\n", i , counts->field_type,
+ counts->pack_type, counts->max_zero_fill, counts->length_bits,
+ counts->tree->tree_number, counts->field_length));
+ }
+ flush_bits();
+ return;
+}
+
+ /* Write all huff_trees to new datafile. Return tot count of
+ elements in all trees
+ Returns 0 on error */
+
+static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
+{
+ uint i,int_length;
+ uint tree_no;
+ uint codes;
+ uint errors= 0;
+ uint *packed_tree,*offset,length;
+ my_off_t elements;
+
+ /* Find the highest number of elements in the trees. */
+ for (i=length=0 ; i < trees ; i++)
+ if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
+ length=huff_tree[i].elements;
+ /*
+ Allocate a buffer for packing a decode tree. Two numbers per element
+ (left child and right child).
+ */
+ if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
+ {
+ my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2);
+ return 0;
+ }
+
+ DBUG_PRINT("info", (" "));
+ if (verbose >= 2)
+ VOID(printf("\n"));
+ tree_no= 0;
+ intervall_length=0;
+ for (elements=0; trees-- ; huff_tree++)
+ {
+ /* Skip columns that have been joined with other columns. */
+ if (huff_tree->tree_number == 0)
+ continue; /* Deleted tree */
+ tree_no++;
+ DBUG_PRINT("info", (" "));
+ if (verbose >= 3)
+ VOID(printf("\n"));
+ /* Count the total number of elements (byte codes or column values). */
+ elements+=huff_tree->elements;
+ huff_tree->max_offset=2;
+ /* Build a tree of offsets and codes for decoding in 'packed_tree'. */
+ if (huff_tree->elements <= 1)
+ offset=packed_tree;
+ else
+ offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
+
+ /* This should be the same as 'length' above. */
+ huff_tree->offset_bits=max_bit(huff_tree->max_offset);
+
+ /*
+ Since we check this during collecting the distinct column values,
+ this should never happen.
+ */
+ if (huff_tree->max_offset >= IS_OFFSET)
+ { /* This should be impossible */
+ VOID(fprintf(stderr, "Tree offset got too big: %d, aborted\n",
+ huff_tree->max_offset));
+ my_afree((uchar*) packed_tree);
+ return 0;
+ }
+
+ DBUG_PRINT("info", ("pos: %lu elements: %u tree-elements: %lu "
+ "char_bits: %u\n",
+ (ulong) (file_buffer.pos - file_buffer.buffer),
+ huff_tree->elements, (ulong) (offset - packed_tree),
+ huff_tree->char_bits));
+ if (!huff_tree->counts->tree_buff)
+ {
+ /* We do a uchar compression on this column. Mark with bit 0. */
+ write_bits(0,1);
+ write_bits(huff_tree->min_chr,8);
+ write_bits(huff_tree->elements,9);
+ write_bits(huff_tree->char_bits,5);
+ write_bits(huff_tree->offset_bits,5);
+ int_length=0;
+ }
+ else
+ {
+ int_length=(uint) (huff_tree->counts->tree_pos -
+ huff_tree->counts->tree_buff);
+ /* We have distinct column values for this column. Mark with bit 1. */
+ write_bits(1,1);
+ write_bits(huff_tree->elements,15);
+ write_bits(int_length,16);
+ write_bits(huff_tree->char_bits,5);
+ write_bits(huff_tree->offset_bits,5);
+ intervall_length+=int_length;
+ }
+ DBUG_PRINT("info", ("tree: %2u elements: %4u char_bits: %2u "
+ "offset_bits: %2u %s: %5u codelen: %2u",
+ tree_no, huff_tree->elements, huff_tree->char_bits,
+ huff_tree->offset_bits, huff_tree->counts->tree_buff ?
+ "bufflen" : "min_chr", huff_tree->counts->tree_buff ?
+ int_length : huff_tree->min_chr, huff_tree->height));
+ if (verbose >= 2)
+ VOID(printf("tree: %2u elements: %4u char_bits: %2u offset_bits: %2u "
+ "%s: %5u codelen: %2u\n", tree_no, huff_tree->elements,
+ huff_tree->char_bits, huff_tree->offset_bits,
+ huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
+ huff_tree->counts->tree_buff ? int_length :
+ huff_tree->min_chr, huff_tree->height));
+
+ /* Check that the code tree length matches the element count. */
+ length=(uint) (offset-packed_tree);
+ if (length != huff_tree->elements*2-2)
+ {
+ VOID(fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
+ length, huff_tree->elements * 2 - 2));
+ errors++;
+ break;
+ }
+
+ for (i=0 ; i < length ; i++)
+ {
+ if (packed_tree[i] & IS_OFFSET)
+ write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
+ huff_tree->offset_bits+1);
+ else
+ write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
+ DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x",
+ i, (packed_tree[i] & IS_OFFSET) ?
+ " -> " : "", (packed_tree[i] & IS_OFFSET) ?
+ packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
+ if (verbose >= 3)
+ VOID(printf("tree[0x%04x]: %s0x%04x\n",
+ i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
+ (packed_tree[i] & IS_OFFSET) ?
+ packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
+ }
+ flush_bits();
+
+ /*
+ Display coding tables and check their correctness.
+ */
+ codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
+ for (i= 0; i < codes; i++)
+ {
+ ulonglong code;
+ uint bits;
+ uint len;
+ uint idx;
+
+ if (! (len= huff_tree->code_len[i]))
+ continue;
+ DBUG_PRINT("info", ("code[0x%04x]: 0x%s bits: %2u bin: %s", i,
+ hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
+ bindigits(huff_tree->code[i],
+ huff_tree->code_len[i])));
+ if (verbose >= 3)
+ VOID(printf("code[0x%04x]: 0x%s bits: %2u bin: %s\n", i,
+ hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
+ bindigits(huff_tree->code[i], huff_tree->code_len[i])));
+
+ /* Check that the encode table decodes correctly. */
+ code= 0;
+ bits= 0;
+ idx= 0;
+ DBUG_EXECUTE_IF("forcechkerr1", len--;);
+ DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code););
+ DBUG_EXECUTE_IF("forcechkerr3", idx= length;);
+ for (;;)
+ {
+ if (! len)
+ {
+ VOID(fflush(stdout));
+ VOID(fprintf(stderr, "error: code 0x%s with %u bits not found\n",
+ hexdigits(huff_tree->code[i]), huff_tree->code_len[i]));
+ errors++;
+ break;
+ }
+ code<<= 1;
+ code|= (huff_tree->code[i] >> (--len)) & 1;
+ bits++;
+ if (bits > 8 * sizeof(code))
+ {
+ VOID(fflush(stdout));
+ VOID(fprintf(stderr, "error: Huffman code too long: %u/%u\n",
+ bits, (uint) (8 * sizeof(code))));
+ errors++;
+ break;
+ }
+ idx+= (uint) code & 1;
+ if (idx >= length)
+ {
+ VOID(fflush(stdout));
+ VOID(fprintf(stderr, "error: illegal tree offset: %u/%u\n",
+ idx, length));
+ errors++;
+ break;
+ }
+ if (packed_tree[idx] & IS_OFFSET)
+ idx+= packed_tree[idx] & ~IS_OFFSET;
+ else
+ break; /* Hit a leaf. This contains the result value. */
+ }
+ if (errors)
+ break;
+
+ DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;);
+ if (packed_tree[idx] != i)
+ {
+ VOID(fflush(stdout));
+ VOID(fprintf(stderr, "error: decoded value 0x%04x should be: 0x%04x\n",
+ packed_tree[idx], i));
+ errors++;
+ break;
+ }
+ } /*end for (codes)*/
+ if (errors)
+ break;
+
+ /* Write column values in case of distinct column value compression. */
+ if (huff_tree->counts->tree_buff)
+ {
+ for (i=0 ; i < int_length ; i++)
+ {
+ write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8);
+ DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x",
+ i, (uchar) huff_tree->counts->tree_buff[i]));
+ if (verbose >= 3)
+ VOID(printf("column_values[0x%04x]: 0x%02x\n",
+ i, (uchar) huff_tree->counts->tree_buff[i]));
+ }
+ }
+ flush_bits();
+ }
+ DBUG_PRINT("info", (" "));
+ if (verbose >= 2)
+ VOID(printf("\n"));
+ my_afree((uchar*) packed_tree);
+ if (errors)
+ {
+ VOID(fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n"));
+ return 0;
+ }
+ return elements;
+}
+
+
+static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
+ uint *offset)
+{
+ uint *prev_offset;
+
+ prev_offset= offset;
+ /*
+ 'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
+ then there is no left child and, hence no right child either. This
+ is a property of a binary tree. An element is either a node with two
+ childs, or a leaf without childs.
+
+ The current element is always a node with two childs. Go left first.
+ */
+ if (!element->a.nod.left->a.leaf.null)
+ {
+ /* Store the uchar code or the index of the column value. */
+ prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
+ offset+=2;
+ }
+ else
+ {
+ /*
+ Recursively traverse the tree to the left. Mark it as an offset to
+ another tree node (in contrast to a uchar code or column value index).
+ */
+ prev_offset[0]= IS_OFFSET+2;
+ offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
+ }
+
+ /* Now, check the right child. */
+ if (!element->a.nod.right->a.leaf.null)
+ {
+ /* Store the uchar code or the index of the column value. */
+ prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
+ return offset;
+ }
+ else
+ {
+ /*
+ Recursively traverse the tree to the right. Mark it as an offset to
+ another tree node (in contrast to a uchar code or column value index).
+ */
+ uint temp=(uint) (offset-prev_offset-1);
+ prev_offset[1]= IS_OFFSET+ temp;
+ if (huff_tree->max_offset < temp)
+ huff_tree->max_offset = temp;
+ return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
+ }
+}
+
+ /* Get number of bits neaded to represent value */
+
+static uint max_bit(register uint value)
+{
+ reg2 uint power=1;
+
+ while ((value>>=1))
+ power++;
+ return (power);
+}
+
+
+static int compress_isam_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
+{
+ int error;
+ uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length;
+ uint intervall,field_length,max_pack_length,pack_blob_length, null_bytes;
+ my_off_t record_count;
+ char llbuf[32];
+ ulong length,pack_length;
+ uchar *record,*pos,*end_pos,*record_pos,*start_pos;
+ HUFF_COUNTS *count,*end_count;
+ HUFF_TREE *tree;
+ MARIA_HA *isam_file=mrg->file[0];
+ uint pack_version= (uint) isam_file->s->pack.version;
+ DBUG_ENTER("compress_isam_file");
+
+ /* Allocate a buffer for the records (excluding blobs). */
+ if (!(record=(uchar*) my_alloca(isam_file->s->base.reclength)))
+ return -1;
+
+ end_count=huff_counts+isam_file->s->base.fields;
+ min_record_length= (uint) ~0;
+ max_record_length=0;
+ null_bytes= isam_file->s->base.null_bytes;
+
+ /*
+ Calculate the maximum number of bits required to pack the records.
+ Remember to understand 'max_zero_fill' as 'min_zero_fill'.
+ The tree height determines the maximum number of bits per value.
+ Some fields skip leading or trailing spaces or zeroes. The skipped
+ number of bytes is encoded by 'length_bits' bits.
+ Empty blobs and varchar are encoded with a single 1 bit. Other blobs
+ and varchar get a leading 0 bit.
+ */
+ max_calc_length= null_bytes;
+ for (i= 0 ; i < isam_file->s->base.fields ; i++)
+ {
+ if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
+ huff_counts[i].max_zero_fill=0;
+ if (huff_counts[i].field_type == FIELD_CONSTANT ||
+ huff_counts[i].field_type == FIELD_ZERO ||
+ huff_counts[i].field_type == FIELD_CHECK)
+ continue;
+ if (huff_counts[i].field_type == FIELD_INTERVALL)
+ max_calc_length+=huff_counts[i].tree->height;
+ else if (huff_counts[i].field_type == FIELD_BLOB ||
+ huff_counts[i].field_type == FIELD_VARCHAR)
+ max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
+ else
+ max_calc_length+=
+ (huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
+ huff_counts[i].tree->height+huff_counts[i].length_bits;
+ }
+ max_calc_length= (max_calc_length + 7) / 8;
+ pack_ref_length= _ma_calc_pack_length(pack_version, max_calc_length);
+ record_count=0;
+ /* 'max_blob_length' is the max length of all blobs of a record. */
+ pack_blob_length= isam_file->s->base.blobs ?
+ _ma_calc_pack_length(pack_version, mrg->max_blob_length) : 0;
+ max_pack_length=pack_ref_length+pack_blob_length;
+
+ DBUG_PRINT("fields", ("==="));
+ mrg_reset(mrg);
+ while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
+ {
+ ulong tot_blob_length=0;
+ if (! error)
+ {
+ if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length))
+ break;
+ record_pos= (uchar*) file_buffer.pos;
+ file_buffer.pos+= max_pack_length;
+ if (null_bytes)
+ {
+ /* Copy null bits 'as is' */
+ memcpy(file_buffer.pos, record, null_bytes);
+ file_buffer.pos+= null_bytes;
+ }
+ for (start_pos=record+null_bytes, count= huff_counts;
+ count < end_count ;
+ count++)
+ {
+ end_pos=start_pos+(field_length=count->field_length);
+ tree=count->tree;
+
+ DBUG_PRINT("fields", ("column: %3lu type: %2u pack: %2u zero: %4u "
+ "lbits: %2u tree: %2u length: %4u",
+ (ulong) (count - huff_counts + 1),
+ count->field_type,
+ count->pack_type, count->max_zero_fill,
+ count->length_bits, count->tree->tree_number,
+ count->field_length));
+
+ /* Check if the column contains spaces only. */
+ if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
+ {
+ for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ;
+ if (pos == end_pos)
+ {
+ DBUG_PRINT("fields",
+ ("PACK_TYPE_SPACE_FIELDS spaces only, bits: 1"));
+ DBUG_PRINT("fields", ("---"));
+ write_bits(1,1);
+ start_pos=end_pos;
+ continue;
+ }
+ DBUG_PRINT("fields",
+ ("PACK_TYPE_SPACE_FIELDS not only spaces, bits: 1"));
+ write_bits(0,1);
+ }
+ end_pos-=count->max_zero_fill;
+ field_length-=count->max_zero_fill;
+
+ switch (count->field_type) {
+ case FIELD_SKIP_ZERO:
+ if (!memcmp((uchar*) start_pos,zero_string,field_length))
+ {
+ DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits: 1"));
+ write_bits(1,1);
+ start_pos=end_pos;
+ break;
+ }
+ DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits: 1"));
+ write_bits(0,1);
+ /* Fall through */
+ case FIELD_NORMAL:
+ DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes",
+ (ulong) (end_pos - start_pos)));
+ for ( ; start_pos < end_pos ; start_pos++)
+ {
+ DBUG_PRINT("fields",
+ ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
+ (uchar) *start_pos,
+ hexdigits(tree->code[(uchar) *start_pos]),
+ (uint) tree->code_len[(uchar) *start_pos],
+ bindigits(tree->code[(uchar) *start_pos],
+ (uint) tree->code_len[(uchar) *start_pos])));
+ write_bits(tree->code[(uchar) *start_pos],
+ (uint) tree->code_len[(uchar) *start_pos]);
+ }
+ break;
+ case FIELD_SKIP_ENDSPACE:
+ for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
+ length= (ulong) (end_pos - pos);
+ if (count->pack_type & PACK_TYPE_SELECTED)
+ {
+ if (length > count->min_space)
+ {
+ DBUG_PRINT("fields",
+ ("FIELD_SKIP_ENDSPACE more than min_space, bits: 1"));
+ DBUG_PRINT("fields",
+ ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
+ length, field_length, count->length_bits));
+ write_bits(1,1);
+ write_bits(length,count->length_bits);
+ }
+ else
+ {
+ DBUG_PRINT("fields",
+ ("FIELD_SKIP_ENDSPACE not more than min_space, "
+ "bits: 1"));
+ write_bits(0,1);
+ pos=end_pos;
+ }
+ }
+ else
+ {
+ DBUG_PRINT("fields",
+ ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
+ length, field_length, count->length_bits));
+ write_bits(length,count->length_bits);
+ }
+ /* Encode all significant bytes. */
+ DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes",
+ (ulong) (pos - start_pos)));
+ for ( ; start_pos < pos ; start_pos++)
+ {
+ DBUG_PRINT("fields",
+ ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
+ (uchar) *start_pos,
+ hexdigits(tree->code[(uchar) *start_pos]),
+ (uint) tree->code_len[(uchar) *start_pos],
+ bindigits(tree->code[(uchar) *start_pos],
+ (uint) tree->code_len[(uchar) *start_pos])));
+ write_bits(tree->code[(uchar) *start_pos],
+ (uint) tree->code_len[(uchar) *start_pos]);
+ }
+ start_pos=end_pos;
+ break;
+ case FIELD_SKIP_PRESPACE:
+ for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
+ length= (ulong) (pos - start_pos);
+ if (count->pack_type & PACK_TYPE_SELECTED)
+ {
+ if (length > count->min_space)
+ {
+ DBUG_PRINT("fields",
+ ("FIELD_SKIP_PRESPACE more than min_space, bits: 1"));
+ DBUG_PRINT("fields",
+ ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
+ length, field_length, count->length_bits));
+ write_bits(1,1);
+ write_bits(length,count->length_bits);
+ }
+ else
+ {
+ DBUG_PRINT("fields",
+ ("FIELD_SKIP_PRESPACE not more than min_space, "
+ "bits: 1"));
+ pos=start_pos;
+ write_bits(0,1);
+ }
+ }
+ else
+ {
+ DBUG_PRINT("fields",
+ ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
+ length, field_length, count->length_bits));
+ write_bits(length,count->length_bits);
+ }
+ /* Encode all significant bytes. */
+ DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes",
+ (ulong) (end_pos - start_pos)));
+ for (start_pos=pos ; start_pos < end_pos ; start_pos++)
+ {
+ DBUG_PRINT("fields",
+ ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
+ (uchar) *start_pos,
+ hexdigits(tree->code[(uchar) *start_pos]),
+ (uint) tree->code_len[(uchar) *start_pos],
+ bindigits(tree->code[(uchar) *start_pos],
+ (uint) tree->code_len[(uchar) *start_pos])));
+ write_bits(tree->code[(uchar) *start_pos],
+ (uint) tree->code_len[(uchar) *start_pos]);
+ }
+ break;
+ case FIELD_CONSTANT:
+ case FIELD_ZERO:
+ case FIELD_CHECK:
+ DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK"));
+ start_pos=end_pos;
+ break;
+ case FIELD_INTERVALL:
+ global_count=count;
+ pos=(uchar*) tree_search(&count->int_tree, start_pos,
+ count->int_tree.custom_arg);
+ intervall=(uint) (pos - count->tree_buff)/field_length;
+ DBUG_PRINT("fields", ("FIELD_INTERVALL"));
+ DBUG_PRINT("fields", ("index: %4u code: 0x%s bits: %2u",
+ intervall, hexdigits(tree->code[intervall]),
+ (uint) tree->code_len[intervall]));
+ write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
+ start_pos=end_pos;
+ break;
+ case FIELD_BLOB:
+ {
+ ulong blob_length= _ma_calc_blob_length(field_length-
+ portable_sizeof_char_ptr,
+ start_pos);
+ /* Empty blobs are encoded with a single 1 bit. */
+ if (!blob_length)
+ {
+ DBUG_PRINT("fields", ("FIELD_BLOB empty, bits: 1"));
+ write_bits(1,1);
+ }
+ else
+ {
+ uchar *blob,*blob_end;
+ DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits: 1"));
+ write_bits(0,1);
+ /* Write the blob length. */
+ DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
+ blob_length, count->length_bits));
+ write_bits(blob_length,count->length_bits);
+ memcpy_fixed(&blob,end_pos-portable_sizeof_char_ptr,
+ sizeof(char*));
+ blob_end=blob+blob_length;
+ /* Encode the blob bytes. */
+ for ( ; blob < blob_end ; blob++)
+ {
+ DBUG_PRINT("fields",
+ ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
+ (uchar) *blob, hexdigits(tree->code[(uchar) *blob]),
+ (uint) tree->code_len[(uchar) *blob],
+ bindigits(tree->code[(uchar) *start_pos],
+ (uint)tree->code_len[(uchar) *start_pos])));
+ write_bits(tree->code[(uchar) *blob],
+ (uint) tree->code_len[(uchar) *blob]);
+ }
+ tot_blob_length+=blob_length;
+ }
+ start_pos= end_pos;
+ break;
+ }
+ case FIELD_VARCHAR:
+ {
+ uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
+ ulong col_length= (var_pack_length == 1 ?
+ (uint) *(uchar*) start_pos :
+ uint2korr(start_pos));
+ /* Empty varchar are encoded with a single 1 bit. */
+ if (!col_length)
+ {
+ DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits: 1"));
+ write_bits(1,1); /* Empty varchar */
+ }
+ else
+ {
+ uchar *end= start_pos + var_pack_length + col_length;
+ DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits: 1"));
+ write_bits(0,1);
+ /* Write the varchar length. */
+ DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u",
+ col_length, count->length_bits));
+ write_bits(col_length,count->length_bits);
+ /* Encode the varchar bytes. */
+ for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
+ {
+ DBUG_PRINT("fields",
+ ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
+ (uchar) *start_pos,
+ hexdigits(tree->code[(uchar) *start_pos]),
+ (uint) tree->code_len[(uchar) *start_pos],
+ bindigits(tree->code[(uchar) *start_pos],
+ (uint)tree->code_len[(uchar) *start_pos])));
+ write_bits(tree->code[(uchar) *start_pos],
+ (uint) tree->code_len[(uchar) *start_pos]);
+ }
+ }
+ start_pos= end_pos;
+ break;
+ }
+ case FIELD_LAST:
+ case FIELD_enum_val_count:
+ abort(); /* Impossible */
+ }
+ start_pos+=count->max_zero_fill;
+ DBUG_PRINT("fields", ("---"));
+ }
+ flush_bits();
+ length=(ulong) ((uchar*) file_buffer.pos - record_pos) - max_pack_length;
+ pack_length= _ma_save_pack_length(pack_version, record_pos, length);
+ if (pack_blob_length)
+ pack_length+= _ma_save_pack_length(pack_version,
+ record_pos + pack_length,
+ tot_blob_length);
+ DBUG_PRINT("fields", ("record: %lu length: %lu blob-length: %lu "
+ "length-bytes: %lu", (ulong) record_count, length,
+ tot_blob_length, pack_length));
+ DBUG_PRINT("fields", ("==="));
+
+ /* Correct file buffer if the header was smaller */
+ if (pack_length != max_pack_length)
+ {
+ bmove(record_pos+pack_length,record_pos+max_pack_length,length);
+ file_buffer.pos-= (max_pack_length-pack_length);
+ }
+ if (length < (ulong) min_record_length)
+ min_record_length=(uint) length;
+ if (length > (ulong) max_record_length)
+ max_record_length=(uint) length;
+ record_count++;
+ if (write_loop && record_count % WRITE_COUNT == 0)
+ {
+ VOID(printf("%lu\r", (ulong) record_count));
+ VOID(fflush(stdout));
+ }
+ }
+ else if (error != HA_ERR_RECORD_DELETED)
+ break;
+ }
+ if (error == HA_ERR_END_OF_FILE)
+ error=0;
+ else
+ {
+ VOID(fprintf(stderr, "%s: Got error %d reading records\n",
+ my_progname, error));
+ }
+ if (verbose >= 2)
+ VOID(printf("wrote %s records.\n", llstr((longlong) record_count, llbuf)));
+
+ my_afree((uchar*) record);
+ mrg->ref_length=max_pack_length;
+ mrg->min_pack_length=max_record_length ? min_record_length : 0;
+ mrg->max_pack_length=max_record_length;
+ DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0));
+}
+
+
+static char *make_new_name(char *new_name, char *old_name)
+{
+ return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
+}
+
+static char *make_old_name(char *new_name, char *old_name)
+{
+ return fn_format(new_name,old_name,"",OLD_EXT,2+4);
+}
+
+ /* rutines for bit writing buffer */
+
+static void init_file_buffer(File file, pbool read_buffer)
+{
+ file_buffer.file=file;
+ file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
+ MYF(MY_WME));
+ file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
+ file_buffer.pos_in_file=0;
+ error_on_write=0;
+ if (read_buffer)
+ {
+
+ file_buffer.pos=file_buffer.end;
+ file_buffer.bits=0;
+ }
+ else
+ {
+ file_buffer.pos=file_buffer.buffer;
+ file_buffer.bits=BITS_SAVED;
+ }
+ file_buffer.bitbucket= 0;
+}
+
+
+static int flush_buffer(ulong neaded_length)
+{
+ ulong length;
+
+ /*
+ file_buffer.end is 8 bytes lower than the real end of the buffer.
+ This is done so that the end-of-buffer condition does not need to be
+ checked for every uchar (see write_bits()). Consequently,
+ file_buffer.pos can become greater than file_buffer.end. The
+ algorithms in the other functions ensure that there will never be
+ more than 8 bytes written to the buffer without an end-of-buffer
+ check. So the buffer cannot be overrun. But we need to check for the
+ near-to-buffer-end condition to avoid a negative result, which is
+ casted to unsigned and thus becomes giant.
+ */
+ if ((file_buffer.pos < file_buffer.end) &&
+ ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
+ return 0;
+ length=(ulong) (file_buffer.pos-file_buffer.buffer);
+ file_buffer.pos=file_buffer.buffer;
+ file_buffer.pos_in_file+=length;
+ if (test_only)
+ return 0;
+ if (error_on_write|| my_write(file_buffer.file,
+ (const uchar*) file_buffer.buffer,
+ length,
+ MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
+ {
+ error_on_write=1;
+ return 1;
+ }
+
+ if (neaded_length != ~(ulong) 0 &&
+ (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
+ {
+ char *tmp;
+ neaded_length+=256; /* some margin */
+ tmp= my_realloc((char*) file_buffer.buffer, neaded_length,MYF(MY_WME));
+ if (!tmp)
+ return 1;
+ file_buffer.pos= ((uchar*) tmp +
+ (ulong) (file_buffer.pos - file_buffer.buffer));
+ file_buffer.buffer= (uchar*) tmp;
+ file_buffer.end= (uchar*) (tmp+neaded_length-8);
+ }
+ return 0;
+}
+
+
+static void end_file_buffer(void)
+{
+ my_free((uchar*) file_buffer.buffer,MYF(0));
+}
+
+ /* output `bits` low bits of `value' */
+
+static void write_bits(register ulonglong value, register uint bits)
+{
+ DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
+ (bits == 8 * sizeof(value)));
+
+ if ((file_buffer.bits-= (int) bits) >= 0)
+ {
+ file_buffer.bitbucket|= value << file_buffer.bits;
+ }
+ else
+ {
+ reg3 ulonglong bit_buffer;
+ bits= (uint) -file_buffer.bits;
+ bit_buffer= (file_buffer.bitbucket |
+ ((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
+#if BITS_SAVED == 64
+ *file_buffer.pos++= (uchar) (bit_buffer >> 56);
+ *file_buffer.pos++= (uchar) (bit_buffer >> 48);
+ *file_buffer.pos++= (uchar) (bit_buffer >> 40);
+ *file_buffer.pos++= (uchar) (bit_buffer >> 32);
+#endif
+ *file_buffer.pos++= (uchar) (bit_buffer >> 24);
+ *file_buffer.pos++= (uchar) (bit_buffer >> 16);
+ *file_buffer.pos++= (uchar) (bit_buffer >> 8);
+ *file_buffer.pos++= (uchar) (bit_buffer);
+
+ if (bits != 8 * sizeof(value))
+ value&= (((ulonglong) 1) << bits) - 1;
+ if (file_buffer.pos >= file_buffer.end)
+ VOID(flush_buffer(~ (ulong) 0));
+ file_buffer.bits=(int) (BITS_SAVED - bits);
+ file_buffer.bitbucket= value << (BITS_SAVED - bits);
+ }
+ return;
+}
+
+ /* Flush bits in bit_buffer to buffer */
+
+static void flush_bits(void)
+{
+ int bits;
+ ulonglong bit_buffer;
+
+ bits= file_buffer.bits & ~7;
+ bit_buffer= file_buffer.bitbucket >> bits;
+ bits= BITS_SAVED - bits;
+ while (bits > 0)
+ {
+ bits-= 8;
+ *file_buffer.pos++= (uchar) (bit_buffer >> bits);
+ }
+ if (file_buffer.pos >= file_buffer.end)
+ VOID(flush_buffer(~ (ulong) 0));
+ file_buffer.bits= BITS_SAVED;
+ file_buffer.bitbucket= 0;
+}
+
+
+/****************************************************************************
+** functions to handle the joined files
+****************************************************************************/
+
+static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
+ my_off_t new_length,
+ ha_checksum crc)
+{
+ MARIA_SHARE *share=isam_file->s;
+ uint options=mi_uint2korr(share->state.header.options);
+ uint key;
+ DBUG_ENTER("save_state");
+
+ options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
+ mi_int2store(share->state.header.options,options);
+ /* Save the original file type of we have to undo the packing later */
+ share->state.header.org_data_file_type= share->state.header.data_file_type;
+ share->state.header.data_file_type= COMPRESSED_RECORD;
+
+ share->state.state.data_file_length=new_length;
+ share->state.state.del=0;
+ share->state.state.empty=0;
+ share->state.dellink= HA_OFFSET_ERROR;
+ share->state.split=(ha_rows) mrg->records;
+ share->state.version=(ulong) time((time_t*) 0);
+ if (! maria_is_all_keys_active(share->state.key_map, share->base.keys))
+ {
+ /*
+ Some indexes are disabled, cannot use current key_file_length value
+ as an estimate of upper bound of index file size. Use packed data file
+ size instead.
+ */
+ share->state.state.key_file_length= new_length;
+ }
+ /*
+ If there are no disabled indexes, keep key_file_length value from
+ original file so "maria_chk -rq" can use this value (this is necessary
+ because index size cannot be easily calculated for fulltext keys)
+ */
+ maria_clear_all_keys_active(share->state.key_map);
+ for (key=0 ; key < share->base.keys ; key++)
+ share->state.key_root[key]= HA_OFFSET_ERROR;
+ share->state.key_del= HA_OFFSET_ERROR;
+ isam_file->state->checksum=crc; /* Save crc here */
+ share->changed=1; /* Force write of header */
+ share->state.open_count=0;
+ share->global_changed=0;
+ VOID(my_chsize(share->kfile.file, share->base.keystart, 0, MYF(0)));
+ if (share->base.keys)
+ isamchk_neaded=1;
+ DBUG_RETURN(_ma_state_info_write_sub(share->kfile.file,
+ &share->state, (1 + 2)));
+}
+
+
+static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
+ ha_checksum crc)
+{
+ MARIA_STATE_INFO state;
+ MARIA_HA *isam_file=mrg->file[0];
+ uint options;
+ DBUG_ENTER("save_state_mrg");
+
+ state= isam_file->s->state;
+ options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
+ HA_OPTION_READ_ONLY_DATA);
+ mi_int2store(state.header.options,options);
+ state.state.data_file_length=new_length;
+ state.state.del=0;
+ state.state.empty=0;
+ state.state.records=state.split=(ha_rows) mrg->records;
+ /* See comment above in save_state about key_file_length handling. */
+ if (mrg->src_file_has_indexes_disabled)
+ {
+ isam_file->s->state.state.key_file_length=
+ max(isam_file->s->state.state.key_file_length, new_length);
+ }
+ state.dellink= HA_OFFSET_ERROR;
+ state.version=(ulong) time((time_t*) 0);
+ maria_clear_all_keys_active(state.key_map);
+ state.state.checksum=crc;
+ if (isam_file->s->base.keys)
+ isamchk_neaded=1;
+ state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
+ DBUG_RETURN (_ma_state_info_write_sub(file,&state,1+2));
+}
+
+
+/* reset for mrg_rrnd */
+
+static void mrg_reset(PACK_MRG_INFO *mrg)
+{
+ if (mrg->current)
+ {
+ maria_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
+ mrg->current=0;
+ }
+}
+
+static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
+{
+ int error;
+ MARIA_HA *isam_info;
+ my_off_t filepos;
+
+ if (!info->current)
+ {
+ isam_info= *(info->current=info->file);
+ info->end=info->current+info->count;
+ maria_reset(isam_info);
+ maria_extra(isam_info, HA_EXTRA_CACHE, 0);
+ if ((error= maria_scan_init(isam_info)))
+ return(error);
+ }
+ else
+ isam_info= *info->current;
+
+ for (;;)
+ {
+ if (!(error= maria_scan(isam_info, buf)) ||
+ error != HA_ERR_END_OF_FILE)
+ return (error);
+ maria_scan_end(isam_info);
+ maria_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
+ if (info->current+1 == info->end)
+ return(HA_ERR_END_OF_FILE);
+ info->current++;
+ isam_info= *info->current;
+ filepos=isam_info->s->pack.header_length;
+ maria_reset(isam_info);
+ maria_extra(isam_info,HA_EXTRA_CACHE, 0);
+ if ((error= maria_scan_init(isam_info)))
+ return(error);
+ }
+}
+
+
+static int mrg_close(PACK_MRG_INFO *mrg)
+{
+ uint i;
+ int error=0;
+ DBUG_ENTER("mrg_close");
+
+ for (i=0 ; i < mrg->count ; i++)
+ error|=maria_close(mrg->file[i]);
+ if (mrg->free_file)
+ my_free((uchar*) mrg->file,MYF(0));
+ DBUG_RETURN(error);
+}
+
+
+#if !defined(DBUG_OFF)
+/*
+ Fake the counts to get big Huffman codes.
+
+ SYNOPSIS
+ fakebigcodes()
+ huff_counts A pointer to the counts array.
+ end_count A pointer past the counts array.
+
+ DESCRIPTION
+
+ Huffman coding works by removing the two least frequent values from
+ the list of values and add a new value with the sum of their
+ incidences in a loop until only one value is left. Every time a
+ value is reused for a new value, it gets one more bit for its
+ encoding. Hence, the least frequent values get the longest codes.
+
+ To get a maximum code length for a value, two of the values must
+ have an incidence of 1. As their sum is 2, the next infrequent value
+ must have at least an incidence of 2, then 4, 8, 16 and so on. This
+ means that one needs 2**n bytes (values) for a code length of n
+ bits. However, using more distinct values forces the use of longer
+ codes, or reaching the code length with less total bytes (values).
+
+ To get 64(32)-bit codes, I sort the counts by decreasing incidence.
+ I assign counts of 1 to the two most frequent values, a count of 2
+ for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All
+ the remaining values get 1. That way every possible uchar has an
+ assigned code, though not all codes are used if not all uchar values
+ are present in the column.
+
+ This strategy would work with distinct column values too, but
+ requires that at least 64(32) values are present. To make things
+ easier here, I cancel all distinct column values and force byte
+ compression for all columns.
+
+ RETURN
+ void
+*/
+
+static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count)
+{
+ HUFF_COUNTS *count;
+ my_off_t *cur_count_p;
+ my_off_t *end_count_p;
+ my_off_t **cur_sort_p;
+ my_off_t **end_sort_p;
+ my_off_t *sort_counts[256];
+ my_off_t total;
+ DBUG_ENTER("fakebigcodes");
+
+ for (count= huff_counts; count < end_count; count++)
+ {
+ /*
+ Remove distinct column values.
+ */
+ if (huff_counts->tree_buff)
+ {
+ my_free((uchar*) huff_counts->tree_buff, MYF(0));
+ delete_tree(&huff_counts->int_tree);
+ huff_counts->tree_buff= NULL;
+ DBUG_PRINT("fakebigcodes", ("freed distinct column values"));
+ }
+
+ /*
+ Sort counts by decreasing incidence.
+ */
+ cur_count_p= count->counts;
+ end_count_p= cur_count_p + 256;
+ cur_sort_p= sort_counts;
+ while (cur_count_p < end_count_p)
+ *(cur_sort_p++)= cur_count_p++;
+ (void) qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp);
+
+ /*
+ Assign faked counts.
+ */
+ cur_sort_p= sort_counts;
+#if SIZEOF_LONG_LONG > 4
+ end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1;
+#else
+ end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2;
+#endif
+ /* Most frequent value gets a faked count of 1. */
+ **(cur_sort_p++)= 1;
+ total= 1;
+ while (cur_sort_p < end_sort_p)
+ {
+ **(cur_sort_p++)= total;
+ total<<= 1;
+ }
+ /* Set the last value. */
+ **(cur_sort_p++)= --total;
+ /*
+ Set the remaining counts.
+ */
+ end_sort_p= sort_counts + 256;
+ while (cur_sort_p < end_sort_p)
+ **(cur_sort_p++)= 1;
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Compare two counts for reverse sorting.
+
+ SYNOPSIS
+ fakecmp()
+ count1 One count.
+ count2 Another count.
+
+ RETURN
+ 1 count1 < count2
+ 0 count1 == count2
+ -1 count1 > count2
+*/
+
+static int fakecmp(my_off_t **count1, my_off_t **count2)
+{
+ return ((**count1 < **count2) ? 1 :
+ (**count1 > **count2) ? -1 : 0);
+}
+#endif
diff --git a/storage/maria/maria_read_log.c b/storage/maria/maria_read_log.c
new file mode 100644
index 00000000000..a7a6370b1c4
--- /dev/null
+++ b/storage/maria/maria_read_log.c
@@ -0,0 +1,200 @@
+/* Copyright (C) 2007 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+#include "ma_recovery.h"
+#include <my_getopt.h>
+
+#define PCACHE_SIZE (1024*1024*10)
+#define LOG_FLAGS 0
+#define LOG_FILE_SIZE (1024L*1024L)
+
+static const char *load_default_groups[]= { "maria_read_log",0 };
+static void get_options(int *argc,char * * *argv);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+const char *default_dbug_option= "d:t:i:O,\\maria_read_log.trace";
+#else
+const char *default_dbug_option= "d:t:i:o,/tmp/maria_read_log.trace";
+#endif
+#endif /* DBUG_OFF */
+static my_bool opt_only_display, opt_display_and_apply;
+
+int main(int argc, char **argv)
+{
+ LSN lsn;
+ char **default_argv;
+ MY_INIT(argv[0]);
+
+ load_defaults("my", load_default_groups, &argc, &argv);
+ default_argv= argv;
+ get_options(&argc, &argv);
+
+ maria_data_root= ".";
+ maria_in_recovery= TRUE;
+
+ if (maria_init())
+ {
+ fprintf(stderr, "Can't init Maria engine (%d)\n", errno);
+ goto err;
+ }
+ /* we don't want to create a control file, it MUST exist */
+ if (ma_control_file_create_or_open())
+ {
+ fprintf(stderr, "Can't open control file (%d)\n", errno);
+ goto err;
+ }
+ if (last_logno == FILENO_IMPOSSIBLE)
+ {
+ fprintf(stderr, "Can't find any log\n");
+ goto err;
+ }
+ /* same page cache for log and data; assumes same page size... */
+ DBUG_ASSERT(maria_block_size == TRANSLOG_PAGE_SIZE);
+ if (init_pagecache(maria_pagecache, PCACHE_SIZE, 0, 0,
+ TRANSLOG_PAGE_SIZE) == 0)
+ {
+ fprintf(stderr, "Got error in init_pagecache() (errno: %d)\n", errno);
+ goto err;
+ }
+ /*
+ If log handler does not find the "last_logno" log it will return error,
+ which is good.
+ But if it finds a log and this log was crashed, it will create a new log,
+ which is useless. TODO: start log handler in read-only mode.
+ */
+ if (translog_init(".", LOG_FILE_SIZE, 50112, 0, maria_pagecache,
+ TRANSLOG_DEFAULT_FLAGS))
+ {
+ fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+ goto err;
+ }
+
+ if (opt_only_display)
+ printf("You are using --only-display, NOTHING will be written to disk\n");
+
+ /* LSN could be also --start-from-lsn=# */
+ lsn= translog_first_lsn_in_log();
+ if (lsn == LSN_ERROR)
+ {
+ fprintf(stderr, "Opening transaction log failed\n");
+ goto end;
+ }
+ if (lsn == LSN_IMPOSSIBLE)
+ {
+ fprintf(stdout, "The transaction log is empty\n");
+ }
+ fprintf(stdout, "The transaction log starts from lsn (%lu,0x%lx)\n",
+ LSN_IN_PARTS(lsn));
+
+ fprintf(stdout, "TRACE of the last maria_read_log\n");
+ if (maria_apply_log(lsn, opt_display_and_apply, stdout,
+ opt_display_and_apply, FALSE))
+ goto err;
+ fprintf(stdout, "%s: SUCCESS\n", my_progname);
+
+ goto end;
+err:
+ /* don't touch anything more, in case we hit a bug */
+ fprintf(stderr, "%s: FAILED\n", my_progname);
+ exit(1);
+end:
+ maria_end();
+ free_defaults(default_argv);
+ my_end(0);
+ exit(0);
+ return 0; /* No compiler warning */
+}
+
+
+static struct my_option my_long_options[] =
+{
+ {"help", '?', "Display this help and exit.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"only-display", 'o', "display brief info about records's header",
+ (uchar **) &opt_only_display, (uchar **) &opt_only_display, 0, GET_BOOL,
+ NO_ARG,0, 0, 0, 0, 0, 0},
+ {"display-and-apply", 'a',
+ "like --only-display but displays more info and modifies tables",
+ (uchar **) &opt_display_and_apply, (uchar **) &opt_display_and_apply, 0,
+ GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+#ifndef DBUG_OFF
+ {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
+ 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+ { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+#include <help_start.h>
+
+static void print_version(void)
+{
+ VOID(printf("%s Ver 1.0 for %s on %s\n",
+ my_progname, SYSTEM_TYPE, MACHINE_TYPE));
+ NETWARE_SET_SCREEN_MODE(1);
+}
+
+
+static void usage(void)
+{
+ print_version();
+ puts("Copyright (C) 2007 MySQL AB");
+ puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
+ puts("and you are welcome to modify and redistribute it under the GPL license\n");
+
+ puts("Display and apply log records from a MARIA transaction log");
+ puts("found in the current directory (for now)");
+ VOID(printf("\nUsage: %s OPTIONS\n", my_progname));
+ puts("You need to use one of -o or -a");
+ my_print_help(my_long_options);
+ print_defaults("my", load_default_groups);
+ my_print_variables(my_long_options);
+}
+
+#include <help_end.h>
+
+static my_bool
+get_one_option(int optid __attribute__((unused)),
+ const struct my_option *opt __attribute__((unused)),
+ char *argument __attribute__((unused)))
+{
+ switch (optid) {
+ case '?':
+ usage();
+ exit(0);
+#ifndef DBUG_OFF
+ case '#':
+ DBUG_SET_INITIAL(argument ? argument : default_dbug_option);
+ break;
+#endif
+ }
+ return 0;
+}
+
+static void get_options(int *argc,char ***argv)
+{
+ int ho_error;
+
+ my_progname= argv[0][0];
+
+ if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
+ exit(ho_error);
+
+ if ((opt_only_display + opt_display_and_apply) != 1)
+ {
+ usage();
+ exit(1);
+ }
+}
diff --git a/storage/maria/maria_rename.sh b/storage/maria/maria_rename.sh
new file mode 100755
index 00000000000..fb20e47e635
--- /dev/null
+++ b/storage/maria/maria_rename.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+replace myisam maria MYISAM MARIA MyISAM MARIA -- mysql-test/t/*maria*test mysql-test/r/*maria*result
+
+FILES=`echo sql/ha_maria.{cc,h} include/maria*h storage/maria/*.{c,h}`
+
+replace myisam maria MYISAM MARIA MyISAM MARIA myisam.h maria.h myisamdef.h maria_def.h mi_ maria_ ft_ maria_ft_ "Copyright (C) 2000" "Copyright (C) 2006" MI_ISAMINFO MARIA_INFO MI_CREATE_INFO MARIA_CREATE_INFO maria_isam_ maria_ MI_INFO MARIA_HA MI_ MARIA_ MARIACHK MARIA_CHK rt_index.h ma_rt_index.h rtree_ maria_rtree rt_key.h ma_rt_key.h rt_mbr.h ma_rt_mbr.h -- $FILES
+
+replace check_table_is_closed _ma_check_table_is_closed test_if_reopen _ma_test_if_reopen my_n_base_info_read maria_n_base_info_read update_auto_increment _ma_update_auto_increment save_pack_length _ma_save_packlength calc_pack_length _ma_calc_pack_length -- $FILES
+
+replace mi_ ma_ ft_ ma_ft_ rt_ ma_rt_ myisam maria myisamchk maria_chk myisampack maria_pack myisamlog maria_log -- storage/maria/Makefile.am
+
+#
+# Restore wrong replaces
+#
+
+replace maria_sint1korr mi_sint1korr maria_uint1korr mi_uint1korr maria_sint2korr mi_sint2korr maria_sint3korr mi_sint3korr maria_sint4korr mi_sint4korr maria_sint8korr mi_sint8korr maria_uint2korr mi_uint2korr maria_uint3korr mi_uint3korr maria_uint4korr mi_uint4korr maria_uint5korr mi_uint5korr maria_uint6korr mi_uint6korr maria_uint7korr mi_uint7korr maria_uint8korr mi_uint8korr maria_int1store mi_int1store maria_int2store mi_int2store maria_int3store mi_int3store maria_int4store mi_int4store maria_int5store mi_int5store maria_int6store mi_int6store maria_int7store mi_int7store maria_int8store mi_int8store maria_float4store mi_float4store maria_float4get mi_float4get maria_float8store mi_float8store maria_float8get mi_float8get maria_rowstore mi_rowstore maria_rowkorr mi_rowkorr maria_sizestore mi_sizestore maria_sizekorr mi_sizekorr _maria_maria_ _maria MARIA_MAX_POSSIBLE_KEY HA_MAX_POSSIBLE_KEY MARIA_MAX_KEY_BUFF HA_MAX_KEY_BUFF MARIA_MAX_KEY_SEG HA_MAX_KEY_SEG maria_ft_sintXkorr ft_sintXkorr maria_ft_intXstore ft_intXstore maria_ft_boolean_syntax ft_boolean_syntax maria_ft_min_word_len ft_min_word_len maria_ft_max_word_len ft_max_word_len -- $FILES
diff --git a/storage/maria/plug.in b/storage/maria/plug.in
new file mode 100644
index 00000000000..1ce64f6e2bb
--- /dev/null
+++ b/storage/maria/plug.in
@@ -0,0 +1,8 @@
+MYSQL_STORAGE_ENGINE(maria,, [Maria Storage Engine],
+ [Traditional transactional MySQL tables], [max,max-no-ndb])
+MYSQL_PLUGIN_DIRECTORY(maria, [storage/maria])
+MYSQL_PLUGIN_ACTIONS(maria, [AC_CONFIG_FILES(storage/maria/unittest/Makefile)])
+MYSQL_PLUGIN_STATIC(maria, [libmaria.a])
+# Maria will probably go first into max builds, not all builds,
+# so we don't declare it mandatory.
+MYSQL_PLUGIN_DEPENDS_ON_MYSQL_INTERNALS(maria, [ha_maria.cc])
diff --git a/storage/maria/tablockman.c b/storage/maria/tablockman.c
new file mode 100644
index 00000000000..eb8da1d6865
--- /dev/null
+++ b/storage/maria/tablockman.c
@@ -0,0 +1,676 @@
+/* QQ: TODO - allocate everything from dynarrays !!! (benchmark) */
+/* QQ: automatically place S instead of LS if possible */
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include <my_base.h>
+#include <hash.h>
+#include "tablockman.h"
+
+/*
+ Lock Manager for Table Locks
+
+ The code below handles locks on resources - but it is optimized for a
+ case when a number of resources is not very large, and there are many of
+ locks per resource - that is a resource is likely to be a table or a
+ database, but hardly a row in a table.
+
+ Locks belong to "lock owners". A Lock Owner is uniquely identified by a
+ 16-bit number - loid (lock owner identifier). A function loid_to_tlo must
+ be provided by the application that takes such a number as an argument
+ and returns a TABLE_LOCK_OWNER structure.
+
+ Lock levels are completely defined by three tables. Lock compatibility
+ matrix specifies which locks can be held at the same time on a resource.
+ Lock combining matrix specifies what lock level has the same behaviour as
+ a pair of two locks of given levels. getlock_result matrix simplifies
+ intention locking and lock escalation for an application, basically it
+ defines which locks are intention locks and which locks are "loose"
+ locks. It is only used to provide better diagnostics for the
+ application, lock manager itself does not differentiate between normal,
+ intention, and loose locks.
+
+ The assumptions are: few distinct resources, many locks are held at the
+ same time on one resource. Thus: a lock structure _per resource_ can be
+ rather large; a lock structure _per lock_ does not need to be very small
+ either; we need to optimize for _speed_. Operations we need are: place a
+ lock, check if a particular transaction already has a lock on this
+ resource, check if a conflicting lock exists, if yes - find who owns it.
+
+ Solution: every resource has a structure with
+ 1. Hash of latest (see the lock upgrade section below) granted locks with
+ loid as a key. Thus, checking if a given transaction has a lock on
+ this resource is O(1) operation.
+ 2. Doubly-linked lists of all granted locks - one list for every lock
+ type. Thus, checking if a conflicting lock exists is a check whether
+ an appropriate list head pointer is not null, also O(1).
+ 3. Every lock has a loid of the owner, thus checking who owns a
+ conflicting lock is also O(1).
+ 4. Deque of waiting locks. It's a deque (double-ended queue) not a fifo,
+ because for lock upgrades requests are added to the queue head, not
+ tail. This is a single place where there it gets O(N) on number
+ of locks - when a transaction wakes up from waiting on a condition,
+ it may need to scan the queue backward to the beginning to find
+ a conflicting lock. It is guaranteed though that "all transactions
+ before it" received the same - or earlier - signal. In other words a
+ transaction needs to scan all transactions before it that received the
+ signal but didn't have a chance to resume the execution yet, so
+ practically OS scheduler won't let the scan to be O(N).
+
+ Waiting: if there is a conflicting lock or if wait queue is not empty, a
+ requested lock cannot be granted at once. It is added to the end of the
+ wait queue. If a queue was empty and there is a conflicting lock - the
+ "blocker" transaction is the owner of this lock. If a queue is not empty,
+ an owner of the previous lock in the queue is the "blocker". But if the
+ previous lock is compatible with the request, then the "blocker" is the
+ transaction that the owner of the lock at the end of the queue is waiting
+ for (in other words, our lock is added to the end of the wait queue, and
+ our blocker is the same as of the lock right before us).
+
+ Lock upgrades: when a thread that has a lock on a given resource,
+ requests a new lock on the same resource and the old lock is not enough
+ to satisfy new lock requirements (which is defined by
+ lock_combining_matrix[old_lock][new_lock] != old_lock), a new lock
+ (defined by lock_combining_matrix as above) is placed. Depending on
+ other granted locks it is immediately granted or it has to wait. Here the
+ lock is added to the start of the waiting queue, not to the end. Old
+ lock, is removed from the hash, but not from the doubly-linked lists.
+ (indeed, a transaction checks "do I have a lock on this resource ?" by
+ looking in a hash, and it should find a latest lock, so old locks must be
+ removed; but a transaction checks "are there conflicting locks ?" by
+ checking doubly-linked lists, it doesn't matter if it will find an old
+ lock - if it would be removed, a new lock would be also a conflict).
+ So, a hash contains only "latest" locks - there can be only one latest
+ lock per resource per transaction. But doubly-linked lists contain all
+ locks, even "obsolete" ones, because it doesnt't hurt. Note that old
+ locks can not be freed early, in particular they stay in the
+ 'active_locks' list of a lock owner, because they may be "re-enabled"
+ on a savepoint rollback.
+
+ To better support table-row relations where one needs to lock the table
+ with an intention lock before locking the row, extended diagnostics is
+ provided. When an intention lock (presumably on a table) is granted,
+ lockman_getlock() returns one of GOT_THE_LOCK (no need to lock the row,
+ perhaps the thread already has a normal lock on this table),
+ GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE (need to lock the row, as usual),
+ GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE (only need to check
+ whether it's possible to lock the row, but no need to lock it - perhaps
+ the thread has a loose lock on this table). This is defined by
+ getlock_result[] table.
+
+ Instant duration locks are not supported. Though they're trivial to add,
+ they are normally only used on rows, not on tables. So, presumably,
+ they are not needed here.
+
+ Mutexes: there're table mutexes (LOCKED_TABLE::mutex), lock owner mutexes
+ (TABLE_LOCK_OWNER::mutex), and a pool mutex (TABLOCKMAN::pool_mutex).
+ table mutex protects operations on the table lock structures, and lock
+ owner pointers waiting_for and waiting_for_loid.
+ lock owner mutex is only used to wait on lock owner condition
+ (TABLE_LOCK_OWNER::cond), there's no need to protect owner's lock
+ structures, and only lock owner itself may access them.
+ The pool mutex protects a pool of unused locks. Note the locking order:
+ first the table mutex, then the owner mutex or a pool mutex.
+ Table mutex lock cannot be attempted when owner or pool mutex are locked.
+ No mutex lock can be attempted if owner or pool mutex are locked.
+*/
+
+/*
+ Lock compatibility matrix.
+
+ It's asymmetric. Read it as "Somebody has the lock <value in the row
+ label>, can I set the lock <value in the column label> ?"
+
+ ') Though you can take LS lock while somebody has S lock, it makes no
+ sense - it's simpler to take S lock too.
+
+ 1 - compatible
+ 0 - incompatible
+ -1 - "impossible", so that we can assert the impossibility.
+*/
+static const int lock_compatibility_matrix[10][10]=
+{ /* N S X IS IX SIX LS LX SLX LSIX */
+ { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, /* N */
+ { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* S */
+ { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* X */
+ { -1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, /* IS */
+ { -1, 0, 0, 1, 1, 0, 1, 1, 0, 1 }, /* IX */
+ { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 }, /* SIX */
+ { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* LS */
+ { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* LX */
+ { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* SLX */
+ { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 } /* LSIX */
+};
+
+/*
+ Lock combining matrix.
+
+ It's symmetric. Read it as "what lock level L is identical to the
+ set of two locks A and B"
+
+ One should never get N from it, we assert the impossibility
+*/
+static const enum lock_type lock_combining_matrix[10][10]=
+{/* N S X IS IX SIX LS LX SLX LSIX */
+ { N, N, N, N, N, N, N, N, N, N}, /* N */
+ { N, S, X, S, SIX, SIX, S, SLX, SLX, SIX}, /* S */
+ { N, X, X, X, X, X, X, X, X, X}, /* X */
+ { N, S, X, IS, IX, SIX, LS, LX, SLX, LSIX}, /* IS */
+ { N, SIX, X, IX, IX, SIX, LSIX, LX, SLX, LSIX}, /* IX */
+ { N, SIX, X, SIX, SIX, SIX, SIX, SLX, SLX, SIX}, /* SIX */
+ { N, S, X, LS, LSIX, SIX, LS, LX, SLX, LSIX}, /* LS */
+ { N, SLX, X, LX, LX, SLX, LX, LX, SLX, LX}, /* LX */
+ { N, SLX, X, SLX, SLX, SLX, SLX, SLX, SLX, SLX}, /* SLX */
+ { N, SIX, X, LSIX, LSIX, SIX, LSIX, LX, SLX, LSIX} /* LSIX */
+};
+
+/*
+ the return codes for lockman_getlock
+
+ It's asymmetric. Read it as "I have the lock <value in the row label>,
+ what value should be returned for <value in the column label> ?"
+
+ 0 means impossible combination (assert!)
+
+ Defines below help to preserve the table structure.
+ I/L/A values are self explanatory
+ x means the combination is possible (assert should not crash)
+ but it cannot happen in row locks, only in table locks (S,X),
+ or lock escalations (LS,LX)
+*/
+#define I GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE
+#define L GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE
+#define A GOT_THE_LOCK
+#define x GOT_THE_LOCK
+static const enum lockman_getlock_result getlock_result[10][10]=
+{/* N S X IS IX SIX LS LX SLX LSIX */
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* N */
+ { 0, x, 0, A, 0, 0, x, 0, 0, 0}, /* S */
+ { 0, x, x, A, A, 0, x, x, 0, 0}, /* X */
+ { 0, 0, 0, I, 0, 0, 0, 0, 0, 0}, /* IS */
+ { 0, 0, 0, I, I, 0, 0, 0, 0, 0}, /* IX */
+ { 0, x, 0, A, I, 0, x, 0, 0, 0}, /* SIX */
+ { 0, 0, 0, L, 0, 0, x, 0, 0, 0}, /* LS */
+ { 0, 0, 0, L, L, 0, x, x, 0, 0}, /* LX */
+ { 0, x, 0, A, L, 0, x, x, 0, 0}, /* SLX */
+ { 0, 0, 0, L, I, 0, x, 0, 0, 0} /* LSIX */
+};
+#undef I
+#undef L
+#undef A
+#undef x
+
+/*
+ this structure is optimized for a case when there're many locks
+ on the same resource - e.g. a table
+*/
+
+struct st_table_lock {
+ /* QQ: do we need upgraded_from ? */
+ struct st_table_lock *next_in_lo, *upgraded_from, *next, *prev;
+ struct st_locked_table *table;
+ uint16 loid;
+ uchar lock_type;
+};
+
+#define hash_insert my_hash_insert /* for consistency :) */
+
+static inline
+TABLE_LOCK *find_by_loid(LOCKED_TABLE *table, uint16 loid)
+{
+ return (TABLE_LOCK *)hash_search(& table->latest_locks,
+ (uchar *)& loid, sizeof(loid));
+}
+
+static inline
+void remove_from_wait_queue(TABLE_LOCK *lock, LOCKED_TABLE *table)
+{
+ DBUG_ASSERT(table == lock->table);
+ if (lock->prev)
+ {
+ DBUG_ASSERT(table->wait_queue_out != lock);
+ lock->prev->next= lock->next;
+ }
+ else
+ {
+ DBUG_ASSERT(table->wait_queue_out == lock);
+ table->wait_queue_out= lock->next;
+ }
+ if (lock->next)
+ {
+ DBUG_ASSERT(table->wait_queue_in != lock);
+ lock->next->prev= lock->prev;
+ }
+ else
+ {
+ DBUG_ASSERT(table->wait_queue_in == lock);
+ table->wait_queue_in= lock->prev;
+ }
+}
+
+/*
+ DESCRIPTION
+ tries to lock a resource 'table' with a lock level 'lock'.
+
+ RETURN
+ see enum lockman_getlock_result
+*/
+enum lockman_getlock_result
+tablockman_getlock(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo,
+ LOCKED_TABLE *table, enum lock_type lock)
+{
+ TABLE_LOCK *old, *new, *blocker, *blocker2;
+ TABLE_LOCK_OWNER *wait_for;
+ ulonglong deadline;
+ struct timespec timeout;
+ enum lock_type new_lock;
+ enum lockman_getlock_result res;
+ int i;
+
+ DBUG_ASSERT(lo->waiting_lock == 0);
+ DBUG_ASSERT(lo->waiting_for == 0);
+ DBUG_ASSERT(lo->waiting_for_loid == 0);
+
+ pthread_mutex_lock(& table->mutex);
+ /* do we already have a lock on this resource ? */
+ old= find_by_loid(table, lo->loid);
+
+ /* calculate the level of the upgraded lock, if yes */
+ new_lock= old ? lock_combining_matrix[old->lock_type][lock] : lock;
+
+ /* and check if old lock is enough to satisfy the new request */
+ if (old && new_lock == old->lock_type)
+ {
+ /* yes */
+ res= getlock_result[old->lock_type][lock];
+ goto ret;
+ }
+
+ /* no, placing a new lock. first - take a free lock structure from the pool */
+ pthread_mutex_lock(& lm->pool_mutex);
+ new= lm->pool;
+ if (new)
+ {
+ lm->pool= new->next;
+ pthread_mutex_unlock(& lm->pool_mutex);
+ }
+ else
+ {
+ pthread_mutex_unlock(& lm->pool_mutex);
+ new= (TABLE_LOCK *)my_malloc(sizeof(*new), MYF(MY_WME));
+ if (unlikely(!new))
+ {
+ res= NO_MEMORY_FOR_LOCK;
+ goto ret;
+ }
+ }
+
+ new->loid= lo->loid;
+ new->lock_type= new_lock;
+ new->table= table;
+
+ /* and try to place it */
+ for (new->prev= table->wait_queue_in;;)
+ {
+ wait_for= 0;
+ if (!old)
+ {
+ /* not upgrading - a lock must be added to the _end_ of the wait queue */
+ for (blocker= new->prev; blocker && !wait_for; blocker= blocker->prev)
+ {
+ TABLE_LOCK_OWNER *tmp= lm->loid_to_tlo(blocker->loid);
+
+ /* find a blocking lock */
+ DBUG_ASSERT(table->wait_queue_out);
+ DBUG_ASSERT(table->wait_queue_in);
+ if (!lock_compatibility_matrix[blocker->lock_type][lock])
+ {
+ /* found! */
+ wait_for= tmp;
+ break;
+ }
+
+ /*
+ hmm, the lock before doesn't block us, let's look one step further.
+ the condition below means:
+
+ if we never waited on a condition yet
+ OR
+ the lock before ours (blocker) waits on a lock (blocker2) that is
+ present in the hash AND and conflicts with 'blocker'
+
+ the condition after OR may fail if 'blocker2' was removed from
+ the hash, its signal woke us up, but 'blocker' itself didn't see
+ the signal yet.
+ */
+ if (!lo->waiting_lock ||
+ ((blocker2= find_by_loid(table, tmp->waiting_for_loid)) &&
+ !lock_compatibility_matrix[blocker2->lock_type]
+ [blocker->lock_type]))
+ {
+ /* but it's waiting for a real lock. we'll wait for the same lock */
+ wait_for= tmp->waiting_for;
+ /*
+ We don't really need tmp->waiting_for, as tmp->waiting_for_loid
+ is enough. waiting_for is just a local cache to avoid calling
+ loid_to_tlo().
+ But it's essensial that tmp->waiting_for pointer can ONLY
+ be dereferenced if find_by_loid() above returns a non-null
+ pointer, because a TABLE_LOCK_OWNER object that it points to
+ may've been freed when we come here after a signal.
+ In particular tmp->waiting_for_loid cannot be replaced
+ with tmp->waiting_for->loid.
+ */
+ DBUG_ASSERT(wait_for == lm->loid_to_tlo(tmp->waiting_for_loid));
+ break;
+ }
+
+ /*
+ otherwise - a lock it's waiting for doesn't exist.
+ We've no choice but to scan the wait queue backwards, looking
+ for a conflicting lock or a lock waiting for a real lock.
+ QQ is there a way to avoid this scanning ?
+ */
+ }
+ }
+
+ if (wait_for == 0)
+ {
+ /* checking for compatibility with existing locks */
+ for (blocker= 0, i= 0; i < LOCK_TYPES; i++)
+ {
+ if (table->active_locks[i] && !lock_compatibility_matrix[i+1][lock])
+ {
+ blocker= table->active_locks[i];
+ /* if the first lock in the list is our own - skip it */
+ if (blocker->loid == lo->loid)
+ blocker= blocker->next;
+ if (blocker) /* found a conflicting lock, need to wait */
+ break;
+ }
+ }
+ if (!blocker) /* free to go */
+ break;
+ wait_for= lm->loid_to_tlo(blocker->loid);
+ }
+
+ /* ok, we're here - the wait is inevitable */
+ lo->waiting_for= wait_for;
+ lo->waiting_for_loid= wait_for->loid;
+ if (!lo->waiting_lock) /* first iteration of the for() loop */
+ {
+ /* lock upgrade or new lock request ? */
+ if (old)
+ {
+ /* upgrade - add the lock to the _start_ of the wait queue */
+ new->prev= 0;
+ if ((new->next= table->wait_queue_out))
+ new->next->prev= new;
+ table->wait_queue_out= new;
+ if (!table->wait_queue_in)
+ table->wait_queue_in= table->wait_queue_out;
+ }
+ else
+ {
+ /* new lock - add the lock to the _end_ of the wait queue */
+ new->next= 0;
+ if ((new->prev= table->wait_queue_in))
+ new->prev->next= new;
+ table->wait_queue_in= new;
+ if (!table->wait_queue_out)
+ table->wait_queue_out= table->wait_queue_in;
+ }
+ lo->waiting_lock= new;
+
+ deadline= my_getsystime() + lm->lock_timeout * 10000;
+ timeout.tv_sec= deadline/10000000;
+ timeout.tv_nsec= (deadline % 10000000) * 100;
+ }
+
+ /*
+ prepare to wait.
+ we must lock blocker's mutex to wait on blocker's cond.
+ and we must release table's mutex.
+ note that blocker's mutex is locked _before_ table's mutex is released
+ */
+ pthread_mutex_lock(wait_for->mutex);
+ pthread_mutex_unlock(& table->mutex);
+
+ /* now really wait */
+ i= pthread_cond_timedwait(wait_for->cond, wait_for->mutex, & timeout);
+
+ pthread_mutex_unlock(wait_for->mutex);
+
+ if (i == ETIMEDOUT || i == ETIME)
+ {
+ /* we rely on the caller to rollback and release all locks */
+ res= LOCK_TIMEOUT;
+ goto ret2;
+ }
+
+ pthread_mutex_lock(& table->mutex);
+
+ /* ... and repeat from the beginning */
+ }
+ /* yeah! we can place the lock now */
+
+ /* remove the lock from the wait queue, if it was there */
+ if (lo->waiting_lock)
+ {
+ remove_from_wait_queue(new, table);
+ lo->waiting_lock= 0;
+ lo->waiting_for= 0;
+ lo->waiting_for_loid= 0;
+ }
+
+ /* add it to the list of all locks of this lock owner */
+ new->next_in_lo= lo->active_locks;
+ lo->active_locks= new;
+
+ /* and to the list of active locks of this lock type */
+ new->prev= 0;
+ if ((new->next= table->active_locks[new_lock-1]))
+ new->next->prev= new;
+ table->active_locks[new_lock-1]= new;
+
+ /* update the latest_locks hash */
+ if (old)
+ hash_delete(& table->latest_locks, (uchar *)old);
+ hash_insert(& table->latest_locks, (uchar *)new);
+
+ new->upgraded_from= old;
+
+ res= getlock_result[lock][lock];
+
+ret:
+ pthread_mutex_unlock(& table->mutex);
+ret2:
+ DBUG_ASSERT(res);
+ return res;
+}
+
+/*
+ DESCRIPTION
+ release all locks belonging to a transaction.
+ signal waiters to continue
+*/
+void tablockman_release_locks(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo)
+{
+ TABLE_LOCK *lock, *local_pool= 0, *local_pool_end;
+
+ /*
+ instead of adding released locks to a pool one by one, we'll link
+ them in a list and add to a pool in one short action (under a mutex)
+ */
+ local_pool_end= lo->waiting_lock ? lo->waiting_lock : lo->active_locks;
+ if (!local_pool_end)
+ return;
+
+ /* release a waiting lock, if any */
+ if ((lock= lo->waiting_lock))
+ {
+ DBUG_ASSERT(lock->loid == lo->loid);
+ pthread_mutex_lock(& lock->table->mutex);
+ remove_from_wait_queue(lock, lock->table);
+
+ /*
+ a special case: if this lock was not the last in the wait queue
+ and it's compatible with the next lock, than the next lock
+ is waiting for our blocker though really it waits for us, indirectly.
+ Signal our blocker to release this next lock (after we removed our
+ lock from the wait queue, of course).
+ */
+ /*
+ An example to clarify the above:
+ trn1> S-lock the table. Granted.
+ trn2> IX-lock the table. Added to the wait queue. trn2 waits on trn1
+ trn3> IS-lock the table. The queue is not empty, so IS-lock is added
+ to the queue. It's compatible with the waiting IX-lock, so trn3
+ waits for trn2->waiting_for, that is trn1.
+ if trn1 releases the lock it signals trn1->cond and both waiting
+ transactions are awaken. But if trn2 times out, trn3 must be notified
+ too (as IS and S locks are compatible). So trn2 must signal trn1->cond.
+ */
+ if (lock->next &&
+ lock_compatibility_matrix[lock->next->lock_type][lock->lock_type])
+ {
+ pthread_mutex_lock(lo->waiting_for->mutex);
+ pthread_cond_broadcast(lo->waiting_for->cond);
+ pthread_mutex_unlock(lo->waiting_for->mutex);
+ }
+ lo->waiting_for= 0;
+ lo->waiting_for_loid= 0;
+ pthread_mutex_unlock(& lock->table->mutex);
+
+ lock->next= local_pool;
+ local_pool= lock;
+ }
+
+ /* now release granted locks */
+ lock= lo->active_locks;
+ while (lock)
+ {
+ TABLE_LOCK *cur= lock;
+ pthread_mutex_t *mutex= & lock->table->mutex;
+ DBUG_ASSERT(cur->loid == lo->loid);
+
+ DBUG_ASSERT(lock != lock->next_in_lo);
+ lock= lock->next_in_lo;
+
+ /* TODO ? group locks by table to reduce the number of mutex locks */
+ pthread_mutex_lock(mutex);
+ hash_delete(& cur->table->latest_locks, (uchar *)cur);
+
+ if (cur->prev)
+ cur->prev->next= cur->next;
+ if (cur->next)
+ cur->next->prev= cur->prev;
+ if (cur->table->active_locks[cur->lock_type-1] == cur)
+ cur->table->active_locks[cur->lock_type-1]= cur->next;
+
+ cur->next= local_pool;
+ local_pool= cur;
+
+ pthread_mutex_unlock(mutex);
+ }
+
+ lo->waiting_lock= lo->active_locks= 0;
+
+ /*
+ okay, all locks released. now signal that we're leaving,
+ in case somebody's waiting for it
+ */
+ pthread_mutex_lock(lo->mutex);
+ pthread_cond_broadcast(lo->cond);
+ pthread_mutex_unlock(lo->mutex);
+
+ /* and push all freed locks to the lockman's pool */
+ pthread_mutex_lock(& lm->pool_mutex);
+ local_pool_end->next= lm->pool;
+ lm->pool= local_pool;
+ pthread_mutex_unlock(& lm->pool_mutex);
+}
+
+void tablockman_init(TABLOCKMAN *lm, loid_to_tlo_func *func, uint timeout)
+{
+ lm->pool= 0;
+ lm->loid_to_tlo= func;
+ lm->lock_timeout= timeout;
+ pthread_mutex_init(& lm->pool_mutex, MY_MUTEX_INIT_FAST);
+ my_getsystime(); /* ensure that my_getsystime() is initialized */
+}
+
+void tablockman_destroy(TABLOCKMAN *lm)
+{
+ while (lm->pool)
+ {
+ TABLE_LOCK *tmp= lm->pool;
+ lm->pool= tmp->next;
+ my_free((void *)tmp, MYF(0));
+ }
+ pthread_mutex_destroy(& lm->pool_mutex);
+}
+
+/*
+ initialize a LOCKED_TABLE structure
+
+ SYNOPSYS
+ lt a LOCKED_TABLE to initialize
+ initial_hash_size initial size for 'latest_locks' hash
+*/
+void tablockman_init_locked_table(LOCKED_TABLE *lt, int initial_hash_size)
+{
+ bzero(lt, sizeof(*lt));
+ pthread_mutex_init(& lt->mutex, MY_MUTEX_INIT_FAST);
+ hash_init(& lt->latest_locks, & my_charset_bin, initial_hash_size,
+ offsetof(TABLE_LOCK, loid),
+ sizeof(((TABLE_LOCK*)0)->loid), 0, 0, 0);
+}
+
+void tablockman_destroy_locked_table(LOCKED_TABLE *lt)
+{
+ int i;
+
+ DBUG_ASSERT(lt->wait_queue_out == 0);
+ DBUG_ASSERT(lt->wait_queue_in == 0);
+ DBUG_ASSERT(lt->latest_locks.records == 0);
+ for (i= 0; i<LOCK_TYPES; i++)
+ DBUG_ASSERT(lt->active_locks[i] == 0);
+
+ hash_free(& lt->latest_locks);
+ pthread_mutex_destroy(& lt->mutex);
+}
+
+#ifdef EXTRA_DEBUG
+static const char *lock2str[LOCK_TYPES+1]= {"N", "S", "X", "IS", "IX", "SIX",
+ "LS", "LX", "SLX", "LSIX"};
+
+void tablockman_print_tlo(TABLE_LOCK_OWNER *lo)
+{
+ TABLE_LOCK *lock;
+
+ printf("lo%d>", lo->loid);
+ if ((lock= lo->waiting_lock))
+ printf(" (%s.0x%lx)", lock2str[lock->lock_type], (ulong)lock->table);
+ for (lock= lo->active_locks;
+ lock && lock != lock->next_in_lo;
+ lock= lock->next_in_lo)
+ printf(" %s.0x%lx", lock2str[lock->lock_type], (ulong)lock->table);
+ if (lock && lock == lock->next_in_lo)
+ printf("!");
+ printf("\n");
+}
+#endif
+
diff --git a/storage/maria/tablockman.h b/storage/maria/tablockman.h
new file mode 100644
index 00000000000..58c852b5a21
--- /dev/null
+++ b/storage/maria/tablockman.h
@@ -0,0 +1,87 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _tablockman_h
+#define _tablockman_h
+
+/*
+ Lock levels:
+ ^^^^^^^^^^^
+
+ N - "no lock", not a lock, used sometimes internally to simplify the code
+ S - Shared
+ X - eXclusive
+ IS - Intention Shared
+ IX - Intention eXclusive
+ SIX - Shared + Intention eXclusive
+ LS - Loose Shared
+ LX - Loose eXclusive
+ SLX - Shared + Loose eXclusive
+ LSIX - Loose Shared + Intention eXclusive
+*/
+#ifndef _lockman_h
+/* QQ: TODO remove N-locks */
+enum lock_type { N, S, X, IS, IX, SIX, LS, LX, SLX, LSIX, LOCK_TYPE_LAST };
+enum lockman_getlock_result {
+ NO_MEMORY_FOR_LOCK=1, DEADLOCK, LOCK_TIMEOUT,
+ GOT_THE_LOCK,
+ GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE,
+ GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE
+};
+#endif
+
+#define LOCK_TYPES (LOCK_TYPE_LAST-1)
+
+typedef struct st_table_lock TABLE_LOCK;
+
+typedef struct st_table_lock_owner {
+ TABLE_LOCK *active_locks; /* list of active locks */
+ TABLE_LOCK *waiting_lock; /* waiting lock (one lock only) */
+ struct st_table_lock_owner *waiting_for; /* transaction we're waiting for */
+ pthread_cond_t *cond; /* transactions waiting for us, wait on 'cond' */
+ pthread_mutex_t *mutex; /* mutex is required to use 'cond' */
+ uint16 loid, waiting_for_loid; /* Lock Owner IDentifier */
+} TABLE_LOCK_OWNER;
+
+typedef struct st_locked_table {
+ pthread_mutex_t mutex; /* mutex for everything below */
+ HASH latest_locks; /* latest locks in a hash */
+ TABLE_LOCK *active_locks[LOCK_TYPES]; /* dl-list of locks per type */
+ TABLE_LOCK *wait_queue_in, *wait_queue_out; /* wait deque (double-end queue)*/
+} LOCKED_TABLE;
+
+typedef TABLE_LOCK_OWNER *loid_to_tlo_func(uint16);
+
+typedef struct {
+ pthread_mutex_t pool_mutex;
+ TABLE_LOCK *pool; /* lifo pool of free locks */
+ uint lock_timeout; /* lock timeout in milliseconds */
+ loid_to_tlo_func *loid_to_tlo; /* for mapping loid to TABLE_LOCK_OWNER */
+} TABLOCKMAN;
+
+void tablockman_init(TABLOCKMAN *, loid_to_tlo_func *, uint);
+void tablockman_destroy(TABLOCKMAN *);
+enum lockman_getlock_result tablockman_getlock(TABLOCKMAN *, TABLE_LOCK_OWNER *,
+ LOCKED_TABLE *, enum lock_type);
+void tablockman_release_locks(TABLOCKMAN *, TABLE_LOCK_OWNER *);
+void tablockman_init_locked_table(LOCKED_TABLE *, int);
+void tablockman_destroy_locked_table(LOCKED_TABLE *);
+
+#ifdef EXTRA_DEBUG
+void tablockman_print_tlo(TABLE_LOCK_OWNER *);
+#endif
+
+#endif
+
diff --git a/storage/maria/test_pack b/storage/maria/test_pack
new file mode 100755
index 00000000000..689645b1661
--- /dev/null
+++ b/storage/maria/test_pack
@@ -0,0 +1,10 @@
+silent="-s"
+suffix=""
+
+ma_test1$suffix -s ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -us test1 ; maria_chk$suffix -es test1
+ma_test1$suffix -s -S ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1 ;maria_chk$suffix -us test1 ; maria_chk$suffix -es test1
+ma_test1$suffix -s -b ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1
+ma_test1$suffix -s -w ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -ros test1 ; maria_chk$suffix -es test1
+
+ma_test2$suffix -s -t4 ; maria_pack$suffix --force -s test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -ros test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -s -u test2 ; maria_chk$suffix -sm test2
+ma_test2$suffix -s -t4 -b ; maria_pack$suffix --force -s test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -ros test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -s -u test2 ; maria_chk$suffix -sm test2
diff --git a/storage/maria/trnman.c b/storage/maria/trnman.c
new file mode 100644
index 00000000000..03d11db3b5b
--- /dev/null
+++ b/storage/maria/trnman.c
@@ -0,0 +1,743 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <m_string.h>
+#include "trnman.h"
+#include "ma_checkpoint.h"
+#include "ma_control_file.h"
+
+/*
+ status variables:
+ how many trns in the active list currently,
+ in the committed list currently, allocated since startup.
+*/
+uint trnman_active_transactions, trnman_committed_transactions,
+ trnman_allocated_transactions;
+
+/* list of active transactions in the trid order */
+static TRN active_list_min, active_list_max;
+/* list of committed transactions in the trid order */
+static TRN committed_list_min, committed_list_max;
+
+/* a counter, used to generate transaction ids */
+static TrID global_trid_generator;
+
+/* the mutex for everything above */
+static pthread_mutex_t LOCK_trn_list;
+
+/* LIFO pool of unused TRN structured for reuse */
+static TRN *pool;
+
+/* a hash for committed transactions that maps trid to a TRN structure */
+static LF_HASH trid_to_committed_trn;
+
+/* an array that maps short_trid of an active transaction to a TRN structure */
+static TRN **short_trid_to_active_trn;
+
+/* locks for short_trid_to_active_trn and pool */
+static my_atomic_rwlock_t LOCK_short_trid_to_trn, LOCK_pool;
+
+/*
+ Simple interface functions
+ QQ: if they stay so simple, should we make them inline?
+*/
+
+uint trnman_increment_locked_tables(TRN *trn)
+{
+ return trn->locked_tables++;
+}
+
+my_bool trnman_has_locked_tables(TRN *trn)
+{
+ return trn->locked_tables != 0;
+}
+
+uint trnman_decrement_locked_tables(TRN *trn)
+{
+ return --trn->locked_tables;
+}
+
+void trnman_reset_locked_tables(TRN *trn)
+{
+ trn->locked_tables= 0;
+}
+
+
+/*
+ NOTE
+ Just as short_id doubles as loid, this function doubles as
+ short_trid_to_LOCK_OWNER. See the compile-time assert below.
+*/
+
+#ifdef NOT_USED
+static TRN *short_trid_to_TRN(uint16 short_trid)
+{
+ TRN *trn;
+ compile_time_assert(offsetof(TRN, locks) == 0);
+ my_atomic_rwlock_rdlock(&LOCK_short_trid_to_trn);
+ trn= my_atomic_loadptr((void **)&short_trid_to_active_trn[short_trid]);
+ my_atomic_rwlock_rdunlock(&LOCK_short_trid_to_trn);
+ return (TRN *)trn;
+}
+#endif
+
+static uchar *trn_get_hash_key(const uchar *trn, size_t *len,
+ my_bool unused __attribute__ ((unused)))
+{
+ *len= sizeof(TrID);
+ return (uchar *) & ((*((TRN **)trn))->trid);
+}
+
+
+/**
+ @brief Initializes transaction manager.
+
+ @param initial_trid Generated TrIDs will start from initial_trid+1.
+
+ @return Operation status
+ @retval 0 OK
+ @retval !=0 Error
+*/
+
+int trnman_init(TrID initial_trid)
+{
+ DBUG_ENTER("trnman_init");
+
+ short_trid_to_active_trn= (TRN **)my_malloc(SHORT_TRID_MAX*sizeof(TRN*),
+ MYF(MY_WME|MY_ZEROFILL));
+ if (unlikely(!short_trid_to_active_trn))
+ DBUG_RETURN(1);
+ short_trid_to_active_trn--; /* min short_trid is 1 */
+
+ /*
+ Initialize lists.
+ active_list_max.min_read_from must be larger than any trid,
+ so that when an active list is empty we would could free
+ all committed list.
+ And committed_list_max itself can not be freed so
+ committed_list_max.commit_trid must not be smaller that
+ active_list_max.min_read_from
+ */
+
+ active_list_max.trid= active_list_min.trid= 0;
+ active_list_max.min_read_from= ~(ulong) 0;
+ active_list_max.next= active_list_min.prev= 0;
+ active_list_max.prev= &active_list_min;
+ active_list_min.next= &active_list_max;
+
+ committed_list_max.commit_trid= ~(ulong) 0;
+ committed_list_max.next= committed_list_min.prev= 0;
+ committed_list_max.prev= &committed_list_min;
+ committed_list_min.next= &committed_list_max;
+
+ trnman_active_transactions= 0;
+ trnman_committed_transactions= 0;
+ trnman_allocated_transactions= 0;
+
+ pool= 0;
+ global_trid_generator= initial_trid;
+ lf_hash_init(&trid_to_committed_trn, sizeof(TRN*), LF_HASH_UNIQUE,
+ 0, 0, trn_get_hash_key, 0);
+ DBUG_PRINT("info", ("pthread_mutex_init LOCK_trn_list"));
+ pthread_mutex_init(&LOCK_trn_list, MY_MUTEX_INIT_FAST);
+ my_atomic_rwlock_init(&LOCK_short_trid_to_trn);
+ my_atomic_rwlock_init(&LOCK_pool);
+
+#ifdef NOT_USED
+ lockman_init(&maria_lockman, (loid_to_lo_func *)&short_trid_to_TRN, 10000);
+#endif
+
+ DBUG_RETURN(0);
+}
+
+/*
+ NOTE
+ this could only be called in the "idle" state - no transaction can be
+ running. See asserts below.
+*/
+void trnman_destroy()
+{
+ DBUG_ENTER("trnman_destroy");
+
+ if (short_trid_to_active_trn == NULL) /* trnman already destroyed */
+ DBUG_VOID_RETURN;
+ DBUG_ASSERT(trid_to_committed_trn.count == 0);
+ DBUG_ASSERT(trnman_active_transactions == 0);
+ DBUG_ASSERT(trnman_committed_transactions == 0);
+ DBUG_ASSERT(active_list_max.prev == &active_list_min);
+ DBUG_ASSERT(active_list_min.next == &active_list_max);
+ DBUG_ASSERT(committed_list_max.prev == &committed_list_min);
+ DBUG_ASSERT(committed_list_min.next == &committed_list_max);
+ while (pool)
+ {
+ TRN *trn= pool;
+ pool= pool->next;
+ DBUG_ASSERT(trn->locks.mutex == 0);
+ DBUG_ASSERT(trn->locks.cond == 0);
+ my_free((void *)trn, MYF(0));
+ }
+ lf_hash_destroy(&trid_to_committed_trn);
+ DBUG_PRINT("info", ("pthread_mutex_destroy LOCK_trn_list"));
+ pthread_mutex_destroy(&LOCK_trn_list);
+ my_atomic_rwlock_destroy(&LOCK_short_trid_to_trn);
+ my_atomic_rwlock_destroy(&LOCK_pool);
+ my_free((void *)(short_trid_to_active_trn+1), MYF(0));
+ short_trid_to_active_trn= NULL;
+#ifdef NOT_USED
+ lockman_destroy(&maria_lockman);
+#endif
+ DBUG_VOID_RETURN;
+}
+
+/*
+ NOTE
+ TrID is limited to 6 bytes. Initial value of the generator
+ is set by the recovery code - being read from the last checkpoint
+ (or 1 on a first run).
+*/
+static TrID new_trid()
+{
+ DBUG_ENTER("new_trid");
+ DBUG_ASSERT(global_trid_generator < 0xffffffffffffLL);
+ DBUG_PRINT("info", ("safe_mutex_assert_owner LOCK_trn_list"));
+ safe_mutex_assert_owner(&LOCK_trn_list);
+ DBUG_RETURN(++global_trid_generator);
+}
+
+static void set_short_trid(TRN *trn)
+{
+ int i= (global_trid_generator + (intptr)trn) * 312089 % SHORT_TRID_MAX + 1;
+ for ( ; !trn->short_id ; i= 1)
+ {
+ my_atomic_rwlock_wrlock(&LOCK_short_trid_to_trn);
+ for ( ; i <= SHORT_TRID_MAX; i++) /* the range is [1..SHORT_TRID_MAX] */
+ {
+ void *tmp= NULL;
+ if (short_trid_to_active_trn[i] == NULL &&
+ my_atomic_casptr((void **)&short_trid_to_active_trn[i], &tmp, trn))
+ {
+ trn->short_id= i;
+ break;
+ }
+ }
+ my_atomic_rwlock_wrunlock(&LOCK_short_trid_to_trn);
+ }
+}
+
+/*
+ DESCRIPTION
+ start a new transaction, allocate and initialize transaction object
+ mutex and cond will be used for lock waits
+*/
+
+TRN *trnman_new_trn(pthread_mutex_t *mutex, pthread_cond_t *cond,
+ void *stack_end)
+{
+ TRN *trn;
+ DBUG_ENTER("trnman_new_trn");
+
+ /*
+ we have a mutex, to do simple things under it - allocate a TRN,
+ increment trnman_active_transactions, set trn->min_read_from.
+
+ Note that all the above is fast. generating short_trid may be slow,
+ as it involves scanning a large array - so it's done outside of the
+ mutex.
+ */
+
+ DBUG_PRINT("info", ("pthread_mutex_lock LOCK_trn_list"));
+ pthread_mutex_lock(&LOCK_trn_list);
+
+ /* Allocating a new TRN structure */
+ trn= pool;
+ /*
+ Popping an unused TRN from the pool
+ (ABA isn't possible, we're behind a mutex
+ */
+ my_atomic_rwlock_wrlock(&LOCK_pool);
+ while (trn && !my_atomic_casptr((void **)&pool, (void **)&trn,
+ (void *)trn->next))
+ /* no-op */;
+ my_atomic_rwlock_wrunlock(&LOCK_pool);
+
+ /* Nothing in the pool ? Allocate a new one */
+ if (!trn)
+ {
+ /*
+ trn should be completely initalized at create time to allow
+ one to keep a known state on it.
+ (Like redo_lns, which is assumed to be 0 at start of row handling
+ and reset to zero before end of row handling)
+ */
+ trn= (TRN *)my_malloc(sizeof(TRN), MYF(MY_WME | MY_ZEROFILL));
+ if (unlikely(!trn))
+ {
+ DBUG_PRINT("info", ("pthread_mutex_unlock LOCK_trn_list"));
+ pthread_mutex_unlock(&LOCK_trn_list);
+ return 0;
+ }
+ trnman_allocated_transactions++;
+ }
+ trn->pins= lf_hash_get_pins(&trid_to_committed_trn, stack_end);
+ if (!trn->pins)
+ {
+ trnman_free_trn(trn);
+ return 0;
+ }
+
+ trnman_active_transactions++;
+
+ trn->min_read_from= active_list_min.next->trid;
+
+ trn->trid= new_trid();
+ trn->short_id= 0;
+
+ trn->next= &active_list_max;
+ trn->prev= active_list_max.prev;
+ active_list_max.prev= trn->prev->next= trn;
+ DBUG_PRINT("info", ("pthread_mutex_unlock LOCK_trn_list"));
+ pthread_mutex_unlock(&LOCK_trn_list);
+
+ if (unlikely(!trn->min_read_from))
+ trn->min_read_from= trn->trid;
+
+ trn->commit_trid= 0;
+ trn->rec_lsn= trn->undo_lsn= trn->first_undo_lsn= 0;
+
+ trn->locks.mutex= mutex;
+ trn->locks.cond= cond;
+ trn->locks.waiting_for= 0;
+ trn->locks.all_locks= 0;
+#ifdef NOT_USED
+ trn->locks.pins= lf_alloc_get_pins(&maria_lockman.alloc);
+#endif
+
+ trn->locked_tables= 0;
+
+ /*
+ only after the following function TRN is considered initialized,
+ so it must be done the last
+ */
+ set_short_trid(trn);
+
+ DBUG_RETURN(trn);
+}
+
+/*
+ remove a trn from the active list.
+ if necessary - move to committed list and set commit_trid
+
+ NOTE
+ Locks are released at the end. In particular, after placing the
+ transaction in commit list, and after setting commit_trid. It's
+ important, as commit_trid affects visibility. Locks don't affect
+ anything they simply delay execution of other threads - they could be
+ released arbitrarily late. In other words, when locks are released it
+ serves as a start banner for other threads, they start to run. So
+ everything they may need must be ready at that point.
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+int trnman_end_trn(TRN *trn, my_bool commit)
+{
+ int res= 1;
+ TRN *free_me= 0;
+ LF_PINS *pins= trn->pins;
+ DBUG_ENTER("trnman_end_trn");
+
+ DBUG_ASSERT(trn->rec_lsn == 0);
+ /* if a rollback, all UNDO records should have been executed */
+ DBUG_ASSERT(commit || trn->undo_lsn == 0);
+ DBUG_PRINT("info", ("pthread_mutex_lock LOCK_trn_list"));
+ pthread_mutex_lock(&LOCK_trn_list);
+
+ /* remove from active list */
+ trn->next->prev= trn->prev;
+ trn->prev->next= trn->next;
+
+ /*
+ if trn was the oldest active transaction, now that it goes away there
+ may be committed transactions in the list which no active transaction
+ needs to bother about - clean up the committed list
+ */
+ if (trn->prev == &active_list_min)
+ {
+ uint free_me_count;
+ TRN *t;
+ for (t= committed_list_min.next, free_me_count= 0;
+ t->commit_trid < active_list_min.next->min_read_from;
+ t= t->next, free_me_count++) /* no-op */;
+
+ DBUG_ASSERT((t != committed_list_min.next && free_me_count > 0) ||
+ (t == committed_list_min.next && free_me_count == 0));
+ /* found transactions committed before the oldest active one */
+ if (t != committed_list_min.next)
+ {
+ free_me= committed_list_min.next;
+ committed_list_min.next= t;
+ t->prev->next= 0;
+ t->prev= &committed_list_min;
+ trnman_committed_transactions-= free_me_count;
+ }
+ }
+
+ /*
+ if transaction is committed and it was not the only active transaction -
+ add it to the committed list (which is used for read-from relation)
+ */
+ if (commit && active_list_min.next != &active_list_max)
+ {
+ trn->commit_trid= global_trid_generator;
+ trn->next= &committed_list_max;
+ trn->prev= committed_list_max.prev;
+ trnman_committed_transactions++;
+
+ res= lf_hash_insert(&trid_to_committed_trn, pins, &trn);
+ /*
+ By going on with life is res<0, we let other threads block on
+ our rows (because they will never see us committed in
+ trid_to_committed_trn) until they timeout. Though correct, this is not a
+ good situation:
+ - if connection reconnects and wants to check if its rows have been
+ committed, it will not be able to do that (it will just lock on them) so
+ connection stays permanently in doubt
+ - internal structures trid_to_committed_trn and committed_list are
+ desynchronized.
+ So we should take Maria down immediately, the two problems being
+ automatically solved at restart.
+ */
+ DBUG_ASSERT(res <= 0);
+ }
+ if (res)
+ {
+ /*
+ res == 1 means the condition in the if() above
+ was false.
+ res == -1 means lf_hash_insert failed
+ */
+ trn->next= free_me;
+ free_me= trn;
+ }
+ else
+ {
+ committed_list_max.prev= trn->prev->next= trn;
+ }
+ trnman_active_transactions--;
+ DBUG_PRINT("info", ("pthread_mutex_unlock LOCK_trn_list"));
+ pthread_mutex_unlock(&LOCK_trn_list);
+
+ /* the rest is done outside of a critical section */
+#ifdef NOT_USED
+ lockman_release_locks(&maria_lockman, &trn->locks);
+#endif
+ trn->locks.mutex= 0;
+ trn->locks.cond= 0;
+ my_atomic_rwlock_rdlock(&LOCK_short_trid_to_trn);
+ my_atomic_storeptr((void **)&short_trid_to_active_trn[trn->short_id], 0);
+ my_atomic_rwlock_rdunlock(&LOCK_short_trid_to_trn);
+
+ /*
+ we, under the mutex, removed going-in-free_me transactions from the
+ active and committed lists, thus nobody else may see them when it scans
+ those lists, and thus nobody may want to free them. Now we don't
+ need a mutex to access free_me list
+ */
+ /* QQ: send them to the purge thread */
+ while (free_me)
+ {
+ TRN *t= free_me;
+ free_me= free_me->next;
+
+ /*
+ ignore OOM here. it's harmless, and there's nothing we could do, anyway
+ */
+ (void)lf_hash_delete(&trid_to_committed_trn, pins, &t->trid, sizeof(TrID));
+
+ trnman_free_trn(t);
+ }
+
+ lf_hash_put_pins(pins);
+#ifdef NOT_USED
+ lf_pinbox_put_pins(trn->locks.pins);
+#endif
+
+ DBUG_RETURN(res < 0);
+}
+
+/*
+ free a trn (add to the pool, that is)
+ note - we can never really free() a TRN if there's at least one other
+ running transaction - see, e.g., how lock waits are implemented in
+ lockman.c
+ The same is true for other lock-free data structures too. We may need some
+ kind of FLUSH command to reset them all - ensuring that no transactions are
+ running. It may even be called automatically on checkpoints if no
+ transactions are running.
+*/
+void trnman_free_trn(TRN *trn)
+{
+ TRN *tmp= pool;
+
+ my_atomic_rwlock_wrlock(&LOCK_pool);
+ do
+ {
+ /*
+ without this volatile cast gcc-3.4.4 moved the assignment
+ down after the loop at -O2
+ */
+ *(TRN * volatile *)&(trn->next)= tmp;
+ } while (!my_atomic_casptr((void **)&pool, (void **)&tmp, trn));
+ my_atomic_rwlock_wrunlock(&LOCK_pool);
+}
+
+/*
+ NOTE
+ here we access the hash in a lock-free manner.
+ It's safe, a 'found' TRN can never be freed/reused before we access it.
+ In fact, it cannot be freed before 'trn' ends, because a 'found' TRN
+ can only be removed from the hash when:
+ found->commit_trid < ALL (trn->min_read_from)
+ that is, at least
+ found->commit_trid < trn->min_read_from
+ but
+ found->trid >= trn->min_read_from
+ and
+ found->commit_trid > found->trid
+
+ RETURN
+ 1 can
+ 0 cannot
+ -1 error (OOM)
+*/
+int trnman_can_read_from(TRN *trn, TrID trid)
+{
+ TRN **found;
+ my_bool can;
+ LF_REQUIRE_PINS(3);
+
+ if (trid < trn->min_read_from)
+ return 1; /* can read */
+ if (trid > trn->trid)
+ return 0; /* cannot read */
+
+ found= lf_hash_search(&trid_to_committed_trn, trn->pins, &trid, sizeof(trid));
+ if (found == NULL)
+ return 0; /* not in the hash of committed transactions = cannot read */
+ if (found == MY_ERRPTR)
+ return -1;
+
+ can= (*found)->commit_trid < trn->trid;
+ lf_hash_search_unpin(trn->pins);
+ return can;
+}
+
+/* TODO: the stubs below are waiting for savepoints to be implemented */
+
+void trnman_new_statement(TRN *trn __attribute__ ((unused)))
+{
+}
+
+void trnman_rollback_statement(TRN *trn __attribute__ ((unused)))
+{
+}
+
+
+/**
+ @brief Allocates buffers and stores in them some info about transactions
+
+ Does the allocation because the caller cannot know the size itself.
+ Memory freeing is to be done by the caller (if the "str" member of the
+ LEX_STRING is not NULL).
+ The caller has the intention of doing checkpoints.
+
+ @param[out] str_act pointer to where the allocated buffer,
+ and its size, will be put; buffer will be filled
+ with info about active transactions
+ @param[out] str_com pointer to where the allocated buffer,
+ and its size, will be put; buffer will be filled
+ with info about committed transactions
+ @param[out] min_first_undo_lsn pointer to where the minimum
+ first_undo_lsn of all transactions will be put
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool trnman_collect_transactions(LEX_STRING *str_act, LEX_STRING *str_com,
+ LSN *min_rec_lsn, LSN *min_first_undo_lsn)
+{
+ my_bool error;
+ TRN *trn;
+ char *ptr;
+ uint stored_transactions= 0;
+ LSN minimum_rec_lsn= LSN_MAX, minimum_first_undo_lsn= LSN_MAX;
+ DBUG_ENTER("trnman_collect_transactions");
+
+ DBUG_ASSERT((NULL == str_act->str) && (NULL == str_com->str));
+
+ /* validate the use of read_non_atomic() in general: */
+ compile_time_assert((sizeof(LSN) == 8) && (sizeof(LSN_WITH_FLAGS) == 8));
+ pthread_mutex_lock(&LOCK_trn_list);
+ str_act->length= 2 + /* number of active transactions */
+ LSN_STORE_SIZE + /* minimum of their rec_lsn */
+ (2 + /* short id */
+ 6 + /* long id */
+ LSN_STORE_SIZE + /* undo_lsn */
+#ifdef MARIA_VERSIONING /* not enabled yet */
+ LSN_STORE_SIZE + /* undo_purge_lsn */
+#endif
+ LSN_STORE_SIZE /* first_undo_lsn */
+ ) * trnman_active_transactions;
+ str_com->length= 4 + /* number of committed transactions */
+ (6 + /* long id */
+#ifdef MARIA_VERSIONING /* not enabled yet */
+ LSN_STORE_SIZE + /* undo_purge_lsn */
+#endif
+ LSN_STORE_SIZE /* first_undo_lsn */
+ ) * trnman_committed_transactions;
+ if ((NULL == (str_act->str= my_malloc(str_act->length, MYF(MY_WME)))) ||
+ (NULL == (str_com->str= my_malloc(str_com->length, MYF(MY_WME)))))
+ goto err;
+ /* First, the active transactions */
+ ptr= str_act->str + 2 + LSN_STORE_SIZE;
+ for (trn= active_list_min.next; trn != &active_list_max; trn= trn->next)
+ {
+ /*
+ trns with a short trid of 0 are not even initialized, we can ignore
+ them. trns with undo_lsn==0 have done no writes, we can ignore them
+ too. XID not needed now.
+ */
+ uint sid;
+ LSN rec_lsn, undo_lsn, first_undo_lsn;
+ if ((sid= trn->short_id) == 0)
+ {
+ /*
+ Not even inited, has done nothing. Or it is the
+ dummy_transaction_object, which does only non-transactional
+ immediate-sync operations (CREATE/DROP/RENAME/REPAIR TABLE), and so
+ can be forgotten for Checkpoint.
+ */
+ continue;
+ }
+ /* needed for low-water mark calculation */
+ if (((rec_lsn= lsn_read_non_atomic(trn->rec_lsn)) > 0) &&
+ (cmp_translog_addr(rec_lsn, minimum_rec_lsn) < 0))
+ minimum_rec_lsn= rec_lsn;
+ /*
+ trn may have logged REDOs but not yet UNDO, that's why we read rec_lsn
+ before deciding to ignore if undo_lsn==0.
+ */
+ if ((undo_lsn= trn->undo_lsn) == 0) /* trn can be forgotten */
+ continue;
+ stored_transactions++;
+ int2store(ptr, sid);
+ ptr+= 2;
+ int6store(ptr, trn->trid);
+ ptr+= 6;
+ lsn_store(ptr, undo_lsn); /* needed for rollback */
+ ptr+= LSN_STORE_SIZE;
+ /* needed for low-water mark calculation */
+ if (((first_undo_lsn= lsn_read_non_atomic(trn->first_undo_lsn)) > 0) &&
+ (cmp_translog_addr(first_undo_lsn, minimum_first_undo_lsn) < 0))
+ minimum_first_undo_lsn= first_undo_lsn;
+ lsn_store(ptr, first_undo_lsn);
+ ptr+= LSN_STORE_SIZE;
+#ifdef MARIA_VERSIONING /* not enabled yet */
+ /* to know where purging should start (last delete of this trn) */
+ lsn_store(ptr, trn->undo_purge_lsn);
+ ptr+= LSN_STORE_SIZE;
+#endif
+ /**
+ @todo RECOVERY: add a comment explaining why we can dirtily read some
+ vars, inspired by the text of "assumption 8" in WL#3072
+ */
+ }
+ str_act->length= ptr - str_act->str; /* as we maybe over-estimated */
+ ptr= str_act->str;
+ DBUG_PRINT("info",("collected %u active transactions",
+ (uint)stored_transactions));
+ int2store(ptr, stored_transactions);
+ ptr+= 2;
+ /* this LSN influences how REDOs for any page can be ignored by Recovery */
+ lsn_store(ptr, minimum_rec_lsn);
+ /* one day there will also be a list of prepared transactions */
+ /* do the same for committed ones */
+ ptr= str_com->str;
+ int4store(ptr, trnman_committed_transactions);
+ ptr+= 4;
+ DBUG_PRINT("info",("collected %u committed transactions",
+ (uint)trnman_committed_transactions));
+ for (trn= committed_list_min.next; trn != &committed_list_max;
+ trn= trn->next)
+ {
+ LSN first_undo_lsn;
+ int6store(ptr, trn->trid);
+ ptr+= 6;
+#ifdef MARIA_VERSIONING /* not enabled yet */
+ lsn_store(ptr, trn->undo_purge_lsn);
+ ptr+= LSN_STORE_SIZE;
+#endif
+ first_undo_lsn= LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn);
+ if (cmp_translog_addr(first_undo_lsn, minimum_first_undo_lsn) < 0)
+ minimum_first_undo_lsn= first_undo_lsn;
+ lsn_store(ptr, first_undo_lsn);
+ ptr+= LSN_STORE_SIZE;
+ }
+ /*
+ TODO: if we see there exists no transaction (active and committed) we can
+ tell the lock-free structures to do some freeing (my_free()).
+ */
+ error= 0;
+ *min_rec_lsn= minimum_rec_lsn;
+ *min_first_undo_lsn= minimum_first_undo_lsn;
+ goto end;
+err:
+ error= 1;
+end:
+ pthread_mutex_unlock(&LOCK_trn_list);
+ DBUG_RETURN(error);
+}
+
+
+TRN *trnman_recreate_trn_from_recovery(uint16 shortid, TrID longid)
+{
+ TrID old_trid_generator= global_trid_generator;
+ TRN *trn;
+ DBUG_ASSERT(maria_in_recovery && !maria_multi_threaded);
+ if (unlikely((trn= trnman_new_trn(NULL, NULL, NULL)) == NULL))
+ return NULL;
+ /* deallocate excessive allocations of trnman_new_trn() */
+ global_trid_generator= old_trid_generator;
+ set_if_bigger(global_trid_generator, longid);
+ short_trid_to_active_trn[trn->short_id]= 0;
+ DBUG_ASSERT(short_trid_to_active_trn[shortid] == NULL);
+ short_trid_to_active_trn[shortid]= trn;
+ trn->trid= longid;
+ trn->short_id= shortid;
+ return trn;
+}
+
+
+TRN *trnman_get_any_trn()
+{
+ TRN *trn= active_list_min.next;
+ return (trn != &active_list_max) ? trn : NULL;
+}
diff --git a/storage/maria/trnman.h b/storage/maria/trnman.h
new file mode 100644
index 00000000000..fce02d9ab89
--- /dev/null
+++ b/storage/maria/trnman.h
@@ -0,0 +1,59 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _trnman_h
+#define _trnman_h
+
+C_MODE_START
+
+#include <lf.h>
+#include "lockman.h"
+#include "trnman_public.h"
+#include "ma_loghandler_lsn.h"
+
+/*
+ trid - 6 uchar transaction identifier. Assigned when a transaction
+ is created. Transaction can always be identified by its trid,
+ even after transaction has ended.
+
+ short_trid - 2-byte transaction identifier, identifies a running
+ transaction, is reassigned when transaction ends.
+*/
+
+/*
+ short transaction id is at the same time its identifier
+ for a lock manager - its lock owner identifier (loid)
+*/
+
+#define short_id locks.loid
+
+struct st_transaction
+{
+ LOCK_OWNER locks; /* must be the first! see short_trid_to_TRN() */
+ LF_PINS *pins;
+ TrID trid, min_read_from, commit_trid;
+ TRN *next, *prev;
+ LSN rec_lsn, undo_lsn;
+ LSN_WITH_FLAGS first_undo_lsn;
+ uint locked_tables;
+ /* Note! if locks.loid is 0, trn is NOT initialized */
+};
+
+#define TRANSACTION_LOGGED_LONG_ID ULL(0x8000000000000000)
+
+C_MODE_END
+
+#endif
+
diff --git a/storage/maria/trnman_public.h b/storage/maria/trnman_public.h
new file mode 100644
index 00000000000..97b492c3a57
--- /dev/null
+++ b/storage/maria/trnman_public.h
@@ -0,0 +1,60 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+
+/*
+ External definitions for trnman.h
+ We need to split this into two files as gcc 4.1.2 gives error if it tries
+ to include my_atomic.h in C++ code.
+*/
+
+#ifndef _trnman_public_h
+#define _trnman_public_h
+
+#include "ma_loghandler_lsn.h"
+
+C_MODE_START
+typedef uint64 TrID; /* our TrID is 6 bytes */
+typedef struct st_transaction TRN;
+
+#define SHORT_TRID_MAX 65535
+
+extern uint trnman_active_transactions, trnman_allocated_transactions;
+extern TRN dummy_transaction_object;
+
+int trnman_init(TrID);
+void trnman_destroy(void);
+TRN *trnman_new_trn(pthread_mutex_t *, pthread_cond_t *, void *);
+int trnman_end_trn(TRN *trn, my_bool commit);
+#define trnman_commit_trn(T) trnman_end_trn(T, TRUE)
+#define trnman_abort_trn(T) trnman_end_trn(T, FALSE)
+#define trnman_rollback_trn(T) trnman_end_trn(T, FALSE)
+void trnman_free_trn(TRN *trn);
+int trnman_can_read_from(TRN *trn, TrID trid);
+void trnman_new_statement(TRN *trn);
+void trnman_rollback_statement(TRN *trn);
+my_bool trnman_collect_transactions(LEX_STRING *str_act, LEX_STRING *str_com,
+ LSN *min_rec_lsn,
+ LSN *min_first_undo_lsn);
+
+uint trnman_increment_locked_tables(TRN *trn);
+uint trnman_decrement_locked_tables(TRN *trn);
+my_bool trnman_has_locked_tables(TRN *trn);
+void trnman_reset_locked_tables(TRN *trn);
+TRN *trnman_recreate_trn_from_recovery(uint16 shortid, TrID longid);
+TRN *trnman_get_any_trn();
+
+C_MODE_END
+#endif
diff --git a/storage/maria/unittest/Makefile.am b/storage/maria/unittest/Makefile.am
new file mode 100644
index 00000000000..4631b436b0b
--- /dev/null
+++ b/storage/maria/unittest/Makefile.am
@@ -0,0 +1,97 @@
+# Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+AM_CPPFLAGS = @ZLIB_INCLUDES@ -I$(top_builddir)/include \
+ -I$(top_srcdir)/include -I$(top_srcdir)/unittest/mytap
+INCLUDES = @ZLIB_INCLUDES@ -I$(top_builddir)/include \
+ -I$(top_srcdir)/include -I$(top_srcdir)/unittest/mytap
+
+# Only reason to link with libmyisam.a here is that it's where some fulltext
+# pieces are (but soon we'll remove fulltext dependencies from Maria).
+LDADD= $(top_builddir)/unittest/mytap/libmytap.a \
+ $(top_builddir)/storage/maria/libmaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+noinst_PROGRAMS = ma_control_file-t trnman-t lockman2-t \
+ ma_pagecache_single_1k-t ma_pagecache_single_8k-t \
+ ma_pagecache_single_64k-t-big \
+ ma_pagecache_consist_1k-t-big \
+ ma_pagecache_consist_64k-t-big \
+ ma_pagecache_consist_1kHC-t-big \
+ ma_pagecache_consist_64kHC-t-big \
+ ma_pagecache_consist_1kRD-t-big \
+ ma_pagecache_consist_64kRD-t-big \
+ ma_pagecache_consist_1kWR-t-big \
+ ma_pagecache_consist_64kWR-t-big \
+ ma_test_loghandler-t \
+ ma_test_loghandler_multigroup-t \
+ ma_test_loghandler_multithread-t \
+ ma_test_loghandler_pagecache-t \
+ ma_test_loghandler_long-t-big \
+ ma_test_loghandler_noflush-t \
+ ma_test_loghandler_first_lsn-t \
+ ma_test_loghandler_max_lsn-t \
+ ma_test_loghandler_purge-t
+
+ma_test_loghandler_t_SOURCES = ma_test_loghandler-t.c ma_maria_log_cleanup.c
+ma_test_loghandler_multigroup_t_SOURCES = ma_test_loghandler_multigroup-t.c ma_maria_log_cleanup.c
+ma_test_loghandler_multithread_t_SOURCES = ma_test_loghandler_multithread-t.c ma_maria_log_cleanup.c
+ma_test_loghandler_pagecache_t_SOURCES = ma_test_loghandler_pagecache-t.c ma_maria_log_cleanup.c
+ma_test_loghandler_long_t_big_SOURCES = ma_test_loghandler-t.c ma_maria_log_cleanup.c
+ma_test_loghandler_long_t_big_CPPFLAGS = -DLONG_LOG_TEST
+ma_test_loghandler_noflush_t_SOURCES = ma_test_loghandler_noflush-t.c ma_maria_log_cleanup.c
+ma_test_loghandler_first_lsn_t_SOURCES = ma_test_loghandler_first_lsn-t.c ma_maria_log_cleanup.c
+ma_test_loghandler_max_lsn_t_SOURCES = ma_test_loghandler_max_lsn-t.c ma_maria_log_cleanup.c
+ma_test_loghandler_purge_t_SOURCES = ma_test_loghandler_purge-t.c ma_maria_log_cleanup.c
+
+ma_pagecache_single_src = ma_pagecache_single.c test_file.c test_file.h
+ma_pagecache_consist_src = ma_pagecache_consist.c test_file.c test_file.h
+ma_pagecache_common_cppflags = -DEXTRA_DEBUG -DPAGECACHE_DEBUG -DMAIN
+
+ma_pagecache_single_1k_t_SOURCES = $(ma_pagecache_single_src)
+ma_pagecache_single_8k_t_SOURCES = $(ma_pagecache_single_src)
+ma_pagecache_single_64k_t_big_SOURCES = $(ma_pagecache_single_src)
+ma_pagecache_single_1k_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=1024
+ma_pagecache_single_8k_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=8192
+ma_pagecache_single_64k_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=65536
+
+ma_pagecache_consist_1k_t_big_SOURCES = $(ma_pagecache_consist_src)
+ma_pagecache_consist_1k_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=1024
+ma_pagecache_consist_64k_t_big_SOURCES = $(ma_pagecache_consist_src)
+ma_pagecache_consist_64k_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=65536
+
+ma_pagecache_consist_1kHC_t_big_SOURCES = $(ma_pagecache_consist_src)
+ma_pagecache_consist_1kHC_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=1024 -DTEST_HIGH_CONCURENCY
+ma_pagecache_consist_64kHC_t_big_SOURCES = $(ma_pagecache_consist_src)
+ma_pagecache_consist_64kHC_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=65536 -DTEST_HIGH_CONCURENCY
+
+ma_pagecache_consist_1kRD_t_big_SOURCES = $(ma_pagecache_consist_src)
+ma_pagecache_consist_1kRD_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=1024 -DTEST_READERS
+ma_pagecache_consist_64kRD_t_big_SOURCES = $(ma_pagecache_consist_src)
+ma_pagecache_consist_64kRD_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=65536 -DTEST_READERS
+
+ma_pagecache_consist_1kWR_t_big_SOURCES = $(ma_pagecache_consist_src)
+ma_pagecache_consist_1kWR_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=1024 -DTEST_WRITERS
+ma_pagecache_consist_64kWR_t_big_SOURCES = $(ma_pagecache_consist_src)
+ma_pagecache_consist_64kWR_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=65536 -DTEST_WRITERS
+
+# the generic lock manager may not be used in the end and lockman1-t crashes,
+# so we don't build lockman-t and lockman1-t
+CLEANFILES = maria_log_control page_cache_test_file_1 \
+ maria_log.????????
+
diff --git a/storage/maria/unittest/lockman-t.c b/storage/maria/unittest/lockman-t.c
new file mode 100644
index 00000000000..8c0f71175e7
--- /dev/null
+++ b/storage/maria/unittest/lockman-t.c
@@ -0,0 +1,309 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ lockman for row and table locks
+*/
+
+/* #define EXTRA_VERBOSE */
+
+#include <tap.h>
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <my_atomic.h>
+#include <lf.h>
+#include "../lockman.h"
+
+#define Nlos 100
+LOCK_OWNER loarray[Nlos];
+pthread_mutex_t mutexes[Nlos];
+pthread_cond_t conds[Nlos];
+LOCKMAN lockman;
+
+#ifndef EXTRA_VERBOSE
+#define print_lockhash(X) /* no-op */
+#define DIAG(X) /* no-op */
+#else
+#define DIAG(X) diag X
+#endif
+
+LOCK_OWNER *loid2lo(uint16 loid)
+{
+ return loarray+loid-1;
+}
+
+#define unlock_all(O) diag("lo" #O "> release all locks"); \
+ lockman_release_locks(&lockman, loid2lo(O));print_lockhash(&lockman)
+#define test_lock(O, R, L, S, RES) \
+ ok(lockman_getlock(&lockman, loid2lo(O), R, L) == RES, \
+ "lo" #O "> " S "lock resource " #R " with " #L "-lock"); \
+ print_lockhash(&lockman)
+#define lock_ok_a(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK)
+#define lock_ok_i(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE)
+#define lock_ok_l(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE)
+#define lock_conflict(O, R, L) \
+ test_lock(O, R, L, "cannot ", DIDNT_GET_THE_LOCK);
+
+void test_lockman_simple()
+{
+ /* simple */
+ lock_ok_a(1, 1, S);
+ lock_ok_i(2, 2, IS);
+ lock_ok_i(1, 2, IX);
+ /* lock escalation */
+ lock_ok_a(1, 1, X);
+ lock_ok_i(2, 2, IX);
+ /* failures */
+ lock_conflict(2, 1, X);
+ unlock_all(2);
+ lock_ok_a(1, 2, S);
+ lock_ok_a(1, 2, IS);
+ lock_ok_a(1, 2, LS);
+ lock_ok_i(1, 3, IX);
+ lock_ok_a(2, 3, LS);
+ lock_ok_i(1, 3, IX);
+ lock_ok_l(2, 3, IS);
+ unlock_all(1);
+ unlock_all(2);
+
+ lock_ok_i(1, 1, IX);
+ lock_conflict(2, 1, S);
+ lock_ok_a(1, 1, LS);
+ unlock_all(1);
+ unlock_all(2);
+
+ lock_ok_i(1, 1, IX);
+ lock_ok_a(2, 1, LS);
+ lock_ok_a(1, 1, LS);
+ lock_ok_i(1, 1, IX);
+ lock_ok_i(3, 1, IS);
+ unlock_all(1);
+ unlock_all(2);
+ unlock_all(3);
+
+ lock_ok_i(1, 4, IS);
+ lock_ok_i(2, 4, IS);
+ lock_ok_i(3, 4, IS);
+ lock_ok_a(3, 4, LS);
+ lock_ok_i(4, 4, IS);
+ lock_conflict(4, 4, IX);
+ lock_conflict(2, 4, IX);
+ lock_ok_a(1, 4, LS);
+ unlock_all(1);
+ unlock_all(2);
+ unlock_all(3);
+ unlock_all(4);
+
+ lock_ok_i(1, 1, IX);
+ lock_ok_i(2, 1, IX);
+ lock_conflict(1, 1, S);
+ lock_conflict(2, 1, X);
+ unlock_all(1);
+ unlock_all(2);
+}
+
+int rt_num_threads;
+int litmus;
+int thread_number= 0, timeouts= 0;
+void run_test(const char *test, pthread_handler handler, int n, int m)
+{
+ pthread_t *threads;
+ ulonglong now= my_getsystime();
+ int i;
+
+ thread_number= timeouts= 0;
+ litmus= 0;
+
+ threads= (pthread_t *)my_malloc(sizeof(void *)*n, MYF(0));
+ if (!threads)
+ {
+ diag("Out of memory");
+ abort();
+ }
+
+ diag("Running %s with %d threads, %d iterations... ", test, n, m);
+ rt_num_threads= n;
+ for (i= 0; i < n ; i++)
+ if (pthread_create(threads+i, 0, handler, &m))
+ {
+ diag("Could not create thread");
+ abort();
+ }
+ for (i= 0 ; i < n ; i++)
+ pthread_join(threads[i], 0);
+ now= my_getsystime()-now;
+ ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus);
+ my_free((void*)threads, MYF(0));
+}
+
+pthread_mutex_t rt_mutex;
+int Nrows= 100;
+int Ntables= 10;
+int table_lock_ratio= 10;
+enum lock_type lock_array[6]= {S, X, LS, LX, IS, IX};
+char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"};
+char *res2str[4]= {
+ "DIDN'T GET THE LOCK",
+ "GOT THE LOCK",
+ "GOT THE LOCK NEED TO LOCK A SUBRESOURCE",
+ "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"};
+pthread_handler_t test_lockman(void *arg)
+{
+ int m= (*(int *)arg);
+ uint x, loid, row, table, res, locklevel, timeout= 0;
+ LOCK_OWNER *lo;
+
+ pthread_mutex_lock(&rt_mutex);
+ loid= ++thread_number;
+ pthread_mutex_unlock(&rt_mutex);
+ lo= loid2lo(loid);
+
+ for (x= ((int)(intptr)(&m)); m > 0; m--)
+ {
+ x= (x*3628273133 + 1500450271) % 9576890767; /* three prime numbers */
+ row= x % Nrows + Ntables;
+ table= row % Ntables;
+ locklevel= (x/Nrows) & 3;
+ if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0)
+ { /* table lock */
+ res= lockman_getlock(&lockman, lo, table, lock_array[locklevel]);
+ DIAG(("loid %2d, table %d, lock %s, res %s", loid, table,
+ lock2str[locklevel], res2str[res]));
+ if (res == DIDNT_GET_THE_LOCK)
+ {
+ lockman_release_locks(&lockman, lo);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ }
+ DBUG_ASSERT(res == GOT_THE_LOCK);
+ }
+ else
+ { /* row lock */
+ locklevel&= 1;
+ res= lockman_getlock(&lockman, lo, table, lock_array[locklevel + 4]);
+ DIAG(("loid %2d, row %d, lock %s, res %s", loid, row,
+ lock2str[locklevel+4], res2str[res]));
+ switch (res)
+ {
+ case DIDNT_GET_THE_LOCK:
+ lockman_release_locks(&lockman, lo);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ case GOT_THE_LOCK:
+ continue;
+ case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE:
+ /* not implemented, so take a regular lock */
+ case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE:
+ res= lockman_getlock(&lockman, lo, row, lock_array[locklevel]);
+ DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row,
+ lock2str[locklevel], res2str[res]));
+ if (res == DIDNT_GET_THE_LOCK)
+ {
+ lockman_release_locks(&lockman, lo);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ }
+ DBUG_ASSERT(res == GOT_THE_LOCK);
+ continue;
+ default:
+ DBUG_ASSERT(0);
+ }
+ }
+ }
+
+ lockman_release_locks(&lockman, lo);
+
+ pthread_mutex_lock(&rt_mutex);
+ rt_num_threads--;
+ timeouts+= timeout;
+ if (!rt_num_threads)
+ diag("number of timeouts: %d", timeouts);
+ pthread_mutex_unlock(&rt_mutex);
+
+ return 0;
+}
+
+int main()
+{
+ int i;
+
+ my_init();
+ pthread_mutex_init(&rt_mutex, 0);
+
+ plan(35);
+
+ if (my_atomic_initialize())
+ return exit_status();
+
+
+ lockman_init(&lockman, &loid2lo, 50);
+
+ for (i= 0; i < Nlos; i++)
+ {
+ loarray[i].pins= lf_alloc_get_pins(&lockman.alloc);
+ loarray[i].all_locks= 0;
+ loarray[i].waiting_for= 0;
+ pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST);
+ pthread_cond_init (&conds[i], 0);
+ loarray[i].mutex= &mutexes[i];
+ loarray[i].cond= &conds[i];
+ loarray[i].loid= i+1;
+ }
+
+ test_lockman_simple();
+
+#define CYCLES 10000
+#define THREADS Nlos /* don't change this line */
+
+ /* mixed load, stress-test with random locks */
+ Nrows= 100;
+ Ntables= 10;
+ table_lock_ratio= 10;
+ run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES);
+
+ /* "real-life" simulation - many rows, no table locks */
+ Nrows= 1000000;
+ Ntables= 10;
+ table_lock_ratio= 0;
+ run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10);
+
+ for (i= 0; i < Nlos; i++)
+ {
+ lockman_release_locks(&lockman, &loarray[i]);
+ pthread_mutex_destroy(loarray[i].mutex);
+ pthread_cond_destroy(loarray[i].cond);
+ lf_pinbox_put_pins(loarray[i].pins);
+ }
+
+ {
+ ulonglong now= my_getsystime();
+ lockman_destroy(&lockman);
+ now= my_getsystime()-now;
+ diag("lockman_destroy: %g secs", ((double)now)/1e7);
+ }
+
+ pthread_mutex_destroy(&rt_mutex);
+ my_end(0);
+ return exit_status();
+}
+
diff --git a/storage/maria/unittest/lockman1-t.c b/storage/maria/unittest/lockman1-t.c
new file mode 100644
index 00000000000..41a1f0fd2f4
--- /dev/null
+++ b/storage/maria/unittest/lockman1-t.c
@@ -0,0 +1,335 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ lockman for row locks, tablockman for table locks
+*/
+
+/* #define EXTRA_VERBOSE */
+
+#include <tap.h>
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <my_atomic.h>
+#include <lf.h>
+#include "../lockman.h"
+#include "../tablockman.h"
+
+#define Nlos 100
+#define Ntbls 10
+LOCK_OWNER loarray[Nlos];
+TABLE_LOCK_OWNER loarray1[Nlos];
+pthread_mutex_t mutexes[Nlos];
+pthread_cond_t conds[Nlos];
+LOCKED_TABLE ltarray[Ntbls];
+LOCKMAN lockman;
+TABLOCKMAN tablockman;
+
+#ifndef EXTRA_VERBOSE
+#define print_lo1(X) /* no-op */
+#define DIAG(X) /* no-op */
+#else
+#define DIAG(X) diag X
+#endif
+
+LOCK_OWNER *loid2lo(uint16 loid)
+{
+ return loarray+loid-1;
+}
+TABLE_LOCK_OWNER *loid2lo1(uint16 loid)
+{
+ return loarray1+loid-1;
+}
+
+#define unlock_all(O) diag("lo" #O "> release all locks"); \
+ tablockman_release_locks(&tablockman, loid2lo1(O));
+#define test_lock(O, R, L, S, RES) \
+ ok(tablockman_getlock(&tablockman, loid2lo1(O), &ltarray[R], L) == RES, \
+ "lo" #O "> " S "lock resource " #R " with " #L "-lock"); \
+ print_lo1(loid2lo1(O));
+#define lock_ok_a(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK)
+#define lock_ok_i(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE)
+#define lock_ok_l(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE)
+#define lock_conflict(O, R, L) \
+ test_lock(O, R, L, "cannot ", LOCK_TIMEOUT);
+
+void test_tablockman_simple()
+{
+ /* simple */
+ lock_ok_a(1, 1, S);
+ lock_ok_i(2, 2, IS);
+ lock_ok_i(1, 2, IX);
+ /* lock escalation */
+ lock_ok_a(1, 1, X);
+ lock_ok_i(2, 2, IX);
+ /* failures */
+ lock_conflict(2, 1, X);
+ unlock_all(2);
+ lock_ok_a(1, 2, S);
+ lock_ok_a(1, 2, IS);
+ lock_ok_a(1, 2, LS);
+ lock_ok_i(1, 3, IX);
+ lock_ok_a(2, 3, LS);
+ lock_ok_i(1, 3, IX);
+ lock_ok_l(2, 3, IS);
+ unlock_all(1);
+ unlock_all(2);
+
+ lock_ok_i(1, 1, IX);
+ lock_conflict(2, 1, S);
+ lock_ok_a(1, 1, LS);
+ unlock_all(1);
+ unlock_all(2);
+
+ lock_ok_i(1, 1, IX);
+ lock_ok_a(2, 1, LS);
+ lock_ok_a(1, 1, LS);
+ lock_ok_i(1, 1, IX);
+ lock_ok_i(3, 1, IS);
+ unlock_all(1);
+ unlock_all(2);
+ unlock_all(3);
+
+ lock_ok_i(1, 4, IS);
+ lock_ok_i(2, 4, IS);
+ lock_ok_i(3, 4, IS);
+ lock_ok_a(3, 4, LS);
+ lock_ok_i(4, 4, IS);
+ lock_conflict(4, 4, IX);
+ lock_conflict(2, 4, IX);
+ lock_ok_a(1, 4, LS);
+ unlock_all(1);
+ unlock_all(2);
+ unlock_all(3);
+ unlock_all(4);
+
+ lock_ok_i(1, 1, IX);
+ lock_ok_i(2, 1, IX);
+ lock_conflict(1, 1, S);
+ lock_conflict(2, 1, X);
+ unlock_all(1);
+ unlock_all(2);
+}
+
+int rt_num_threads;
+int litmus;
+int thread_number= 0, timeouts= 0;
+void run_test(const char *test, pthread_handler handler, int n, int m)
+{
+ pthread_t *threads;
+ ulonglong now= my_getsystime();
+ int i;
+
+ thread_number= timeouts= 0;
+ litmus= 0;
+
+ threads= (pthread_t *)my_malloc(sizeof(void *)*n, MYF(0));
+ if (!threads)
+ {
+ diag("Out of memory");
+ abort();
+ }
+
+ diag("Running %s with %d threads, %d iterations... ", test, n, m);
+ rt_num_threads= n;
+ for (i= 0; i < n ; i++)
+ if (pthread_create(threads+i, 0, handler, &m))
+ {
+ diag("Could not create thread");
+ abort();
+ }
+ for (i= 0 ; i < n ; i++)
+ pthread_join(threads[i], 0);
+ now= my_getsystime()-now;
+ ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus);
+ my_free((void*)threads, MYF(0));
+}
+
+pthread_mutex_t rt_mutex;
+int Nrows= 100;
+int Ntables= 10;
+int table_lock_ratio= 10;
+enum lock_type lock_array[6]= {S, X, LS, LX, IS, IX};
+char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"};
+char *res2str[]= {
+ "DIDN'T GET THE LOCK",
+ "OUT OF MEMORY",
+ "DEADLOCK",
+ "LOCK TIMEOUT",
+ "GOT THE LOCK",
+ "GOT THE LOCK NEED TO LOCK A SUBRESOURCE",
+ "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"};
+pthread_handler_t test_lockman(void *arg)
+{
+ int m= (*(int *)arg);
+ uint x, loid, row, table, res, locklevel, timeout= 0;
+ LOCK_OWNER *lo; TABLE_LOCK_OWNER *lo1; DBUG_ASSERT(Ntables <= Ntbls);
+
+ pthread_mutex_lock(&rt_mutex);
+ loid= ++thread_number;
+ pthread_mutex_unlock(&rt_mutex);
+ lo= loid2lo(loid); lo1= loid2lo1(loid);
+
+ for (x= ((int)(intptr)(&m)); m > 0; m--)
+ {
+ x= (x*3628273133 + 1500450271) % 9576890767; /* three prime numbers */
+ row= x % Nrows + Ntables;
+ table= row % Ntables;
+ locklevel= (x/Nrows) & 3;
+ if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0)
+ { /* table lock */
+ res= tablockman_getlock(&tablockman, lo1, ltarray+table, lock_array[locklevel]);
+ DIAG(("loid %2d, table %d, lock %s, res %s", loid, table,
+ lock2str[locklevel], res2str[res]));
+ if (res < GOT_THE_LOCK)
+ {
+ lockman_release_locks(&lockman, lo); tablockman_release_locks(&tablockman, lo1);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ }
+ DBUG_ASSERT(res == GOT_THE_LOCK);
+ }
+ else
+ { /* row lock */
+ locklevel&= 1;
+ res= tablockman_getlock(&tablockman, lo1, ltarray+table, lock_array[locklevel + 4]);
+ DIAG(("loid %2d, row %d, lock %s, res %s", loid, row,
+ lock2str[locklevel+4], res2str[res]));
+ switch (res)
+ {
+ case GOT_THE_LOCK:
+ continue;
+ case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE:
+ /* not implemented, so take a regular lock */
+ case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE:
+ res= lockman_getlock(&lockman, lo, row, lock_array[locklevel]);
+ DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row,
+ lock2str[locklevel], res2str[res]));
+ if (res == DIDNT_GET_THE_LOCK)
+ {
+ lockman_release_locks(&lockman, lo);
+ tablockman_release_locks(&tablockman, lo1);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ }
+ DBUG_ASSERT(res == GOT_THE_LOCK);
+ continue;
+ default:
+ lockman_release_locks(&lockman, lo); tablockman_release_locks(&tablockman, lo1);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ }
+ }
+ }
+
+ lockman_release_locks(&lockman, lo);
+ tablockman_release_locks(&tablockman, lo1);
+
+ pthread_mutex_lock(&rt_mutex);
+ rt_num_threads--;
+ timeouts+= timeout;
+ if (!rt_num_threads)
+ diag("number of timeouts: %d", timeouts);
+ pthread_mutex_unlock(&rt_mutex);
+
+ return 0;
+}
+
+int main()
+{
+ int i;
+
+ my_init();
+ pthread_mutex_init(&rt_mutex, 0);
+
+ plan(35);
+
+ if (my_atomic_initialize())
+ return exit_status();
+
+
+ lockman_init(&lockman, &loid2lo, 50);
+ tablockman_init(&tablockman, &loid2lo1, 50);
+
+ for (i= 0; i < Nlos; i++)
+ {
+ pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST);
+ pthread_cond_init (&conds[i], 0);
+
+ loarray[i].pins= lf_alloc_get_pins(&lockman.alloc);
+ loarray[i].all_locks= 0;
+ loarray[i].waiting_for= 0;
+ loarray[i].mutex= &mutexes[i];
+ loarray[i].cond= &conds[i];
+ loarray[i].loid= i+1;
+
+ loarray1[i].active_locks= 0;
+ loarray1[i].waiting_lock= 0;
+ loarray1[i].waiting_for= 0;
+ loarray1[i].mutex= &mutexes[i];
+ loarray1[i].cond= &conds[i];
+ loarray1[i].loid= i+1;
+ }
+
+ for (i= 0; i < Ntbls; i++)
+ {
+ tablockman_init_locked_table(ltarray+i, Nlos);
+ }
+
+ test_tablockman_simple();
+
+#define CYCLES 10000
+#define THREADS Nlos /* don't change this line */
+
+ /* mixed load, stress-test with random locks */
+ Nrows= 100;
+ Ntables= 10;
+ table_lock_ratio= 10;
+ run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES);
+
+ /* "real-life" simulation - many rows, no table locks */
+ Nrows= 1000000;
+ Ntables= 10;
+ table_lock_ratio= 0;
+ run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10);
+
+ for (i= 0; i < Nlos; i++)
+ {
+ lockman_release_locks(&lockman, &loarray[i]);
+ pthread_mutex_destroy(loarray[i].mutex);
+ pthread_cond_destroy(loarray[i].cond);
+ lf_pinbox_put_pins(loarray[i].pins);
+ }
+
+ {
+ ulonglong now= my_getsystime();
+ lockman_destroy(&lockman);
+ now= my_getsystime()-now;
+ diag("lockman_destroy: %g secs", ((double)now)/1e7);
+ }
+
+ pthread_mutex_destroy(&rt_mutex);
+ my_end(0);
+ return exit_status();
+}
+
diff --git a/storage/maria/unittest/lockman2-t.c b/storage/maria/unittest/lockman2-t.c
new file mode 100644
index 00000000000..01af1a03d22
--- /dev/null
+++ b/storage/maria/unittest/lockman2-t.c
@@ -0,0 +1,361 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ tablockman for row and table locks
+*/
+
+/* #define EXTRA_VERBOSE */
+
+#include <tap.h>
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <my_atomic.h>
+#include <lf.h>
+#include "../tablockman.h"
+
+#define Nlos 100
+#define Ntbls 110
+TABLE_LOCK_OWNER loarray1[Nlos];
+pthread_mutex_t mutexes[Nlos];
+pthread_cond_t conds[Nlos];
+LOCKED_TABLE ltarray[Ntbls];
+TABLOCKMAN tablockman;
+
+#ifndef EXTRA_VERBOSE
+#define print_lo1(X) /* no-op */
+#define DIAG(X) /* no-op */
+#else
+#define DIAG(X) diag X
+#endif
+
+TABLE_LOCK_OWNER *loid2lo1(uint16 loid)
+{
+ return loarray1+loid-1;
+}
+
+#define unlock_all(O) diag("lo" #O "> release all locks"); \
+ tablockman_release_locks(&tablockman, loid2lo1(O));
+#define test_lock(O, R, L, S, RES) \
+ ok(tablockman_getlock(&tablockman, loid2lo1(O), &ltarray[R], L) == RES, \
+ "lo" #O "> " S "lock resource " #R " with " #L "-lock"); \
+ print_lo1(loid2lo1(O));
+#define lock_ok_a(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK)
+#define lock_ok_i(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE)
+#define lock_ok_l(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE)
+#define lock_conflict(O, R, L) \
+ test_lock(O, R, L, "cannot ", LOCK_TIMEOUT);
+
+void test_tablockman_simple()
+{
+ /* simple */
+ lock_ok_a(1, 1, S);
+ lock_ok_i(2, 2, IS);
+ lock_ok_i(1, 2, IX);
+ /* lock escalation */
+ lock_ok_a(1, 1, X);
+ lock_ok_i(2, 2, IX);
+ /* failures */
+ lock_conflict(2, 1, X);
+ unlock_all(2);
+ lock_ok_a(1, 2, S);
+ lock_ok_a(1, 2, IS);
+ lock_ok_a(1, 2, LS);
+ lock_ok_i(1, 3, IX);
+ lock_ok_a(2, 3, LS);
+ lock_ok_i(1, 3, IX);
+ lock_ok_l(2, 3, IS);
+ unlock_all(1);
+ unlock_all(2);
+
+ lock_ok_i(1, 1, IX);
+ lock_conflict(2, 1, S);
+ lock_ok_a(1, 1, LS);
+ unlock_all(1);
+ unlock_all(2);
+
+ lock_ok_i(1, 1, IX);
+ lock_ok_a(2, 1, LS);
+ lock_ok_a(1, 1, LS);
+ lock_ok_i(1, 1, IX);
+ lock_ok_i(3, 1, IS);
+ unlock_all(1);
+ unlock_all(2);
+ unlock_all(3);
+
+ lock_ok_i(1, 4, IS);
+ lock_ok_i(2, 4, IS);
+ lock_ok_i(3, 4, IS);
+ lock_ok_a(3, 4, LS);
+ lock_ok_i(4, 4, IS);
+ lock_conflict(4, 4, IX);
+ lock_conflict(2, 4, IX);
+ lock_ok_a(1, 4, LS);
+ unlock_all(1);
+ unlock_all(2);
+ unlock_all(3);
+ unlock_all(4);
+
+ lock_ok_i(1, 1, IX);
+ lock_ok_i(2, 1, IX);
+ lock_conflict(1, 1, S);
+ lock_conflict(2, 1, X);
+ unlock_all(1);
+ unlock_all(2);
+
+ lock_ok_i(1, 1, IS);
+ lock_conflict(2, 1, X);
+ lock_conflict(3, 1, IS);
+ unlock_all(1);
+ unlock_all(2);
+ unlock_all(3);
+
+ lock_ok_a(1, 1, S);
+ lock_conflict(2, 1, IX);
+ lock_conflict(3, 1, IS);
+ unlock_all(1);
+ unlock_all(2);
+ unlock_all(3);
+}
+
+int rt_num_threads;
+int litmus;
+int thread_number= 0, timeouts= 0;
+void run_test(const char *test, pthread_handler handler, int n, int m)
+{
+ pthread_t *threads;
+ ulonglong now= my_getsystime();
+ int i;
+
+ thread_number= timeouts= 0;
+ litmus= 0;
+
+ threads= (pthread_t *)my_malloc(sizeof(void *)*n, MYF(0));
+ if (!threads)
+ {
+ diag("Out of memory");
+ abort();
+ }
+
+ diag("Running %s with %d threads, %d iterations... ", test, n, m);
+ rt_num_threads= n;
+ for (i= 0; i < n ; i++)
+ if (pthread_create(threads+i, 0, handler, &m))
+ {
+ diag("Could not create thread");
+ abort();
+ }
+ for (i= 0 ; i < n ; i++)
+ pthread_join(threads[i], 0);
+ now= my_getsystime()-now;
+ ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus);
+ my_free((void*)threads, MYF(0));
+}
+
+static void reinit_tlo(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo)
+{
+#ifdef NOT_USED_YET
+ TABLE_LOCK_OWNER backup= *lo;
+#endif
+
+ tablockman_release_locks(lm, lo);
+#ifdef NOT_USED_YET
+ pthread_mutex_destroy(lo->mutex);
+ pthread_cond_destroy(lo->cond);
+ bzero(lo, sizeof(*lo));
+
+ lo->mutex= backup.mutex;
+ lo->cond= backup.cond;
+ lo->loid= backup.loid;
+ pthread_mutex_init(lo->mutex, MY_MUTEX_INIT_FAST);
+ pthread_cond_init(lo->cond, 0);
+#endif
+}
+
+pthread_mutex_t rt_mutex;
+int Nrows= 100;
+int Ntables= 10;
+int table_lock_ratio= 10;
+enum lock_type lock_array[6]= {S, X, LS, LX, IS, IX};
+const char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"};
+const char *res2str[]= {
+ 0,
+ "OUT OF MEMORY",
+ "DEADLOCK",
+ "LOCK TIMEOUT",
+ "GOT THE LOCK",
+ "GOT THE LOCK NEED TO LOCK A SUBRESOURCE",
+ "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"};
+
+pthread_handler_t test_lockman(void *arg)
+{
+ int m= (*(int *)arg);
+ uint x, loid, row, table, res, locklevel, timeout= 0;
+ TABLE_LOCK_OWNER *lo1;
+ DBUG_ASSERT(Ntables <= Ntbls);
+ DBUG_ASSERT(Nrows + Ntables <= Ntbls);
+
+ pthread_mutex_lock(&rt_mutex);
+ loid= ++thread_number;
+ pthread_mutex_unlock(&rt_mutex);
+ lo1= loid2lo1(loid);
+
+ for (x= ((int)(intptr)(&m)); m > 0; m--)
+ {
+ /* three prime numbers */
+ x= (uint) ((x*LL(3628273133) + LL(1500450271)) % LL(9576890767));
+ row= x % Nrows + Ntables;
+ table= row % Ntables;
+ locklevel= (x/Nrows) & 3;
+ if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0)
+ {
+ /* table lock */
+ res= tablockman_getlock(&tablockman, lo1, ltarray+table,
+ lock_array[locklevel]);
+ DIAG(("loid %2d, table %d, lock %s, res %s", loid, table,
+ lock2str[locklevel], res2str[res]));
+ if (res < GOT_THE_LOCK)
+ {
+ reinit_tlo(&tablockman, lo1);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ }
+ DBUG_ASSERT(res == GOT_THE_LOCK);
+ }
+ else
+ { /* row lock */
+ locklevel&= 1;
+ res= tablockman_getlock(&tablockman, lo1, ltarray+table, lock_array[locklevel + 4]);
+ DIAG(("loid %2d, row %d, lock %s, res %s", loid, row,
+ lock2str[locklevel+4], res2str[res]));
+ switch (res)
+ {
+ case GOT_THE_LOCK:
+ continue;
+ case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE:
+ /* not implemented, so take a regular lock */
+ case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE:
+ res= tablockman_getlock(&tablockman, lo1, ltarray+row, lock_array[locklevel]);
+ DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row,
+ lock2str[locklevel], res2str[res]));
+ if (res < GOT_THE_LOCK)
+ {
+ reinit_tlo(&tablockman, lo1);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ }
+ DBUG_ASSERT(res == GOT_THE_LOCK);
+ continue;
+ default:
+ reinit_tlo(&tablockman, lo1);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ }
+ }
+ }
+
+ reinit_tlo(&tablockman, lo1);
+
+ pthread_mutex_lock(&rt_mutex);
+ rt_num_threads--;
+ timeouts+= timeout;
+ if (!rt_num_threads)
+ diag("number of timeouts: %d", timeouts);
+ pthread_mutex_unlock(&rt_mutex);
+
+ return 0;
+}
+
+int main()
+{
+ int i;
+
+ my_init();
+ pthread_mutex_init(&rt_mutex, 0);
+
+ plan(40);
+
+ if (my_atomic_initialize())
+ return exit_status();
+
+
+ tablockman_init(&tablockman, &loid2lo1, 50);
+
+ for (i= 0; i < Nlos; i++)
+ {
+ pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST);
+ pthread_cond_init (&conds[i], 0);
+
+ loarray1[i].active_locks= 0;
+ loarray1[i].waiting_lock= 0;
+ loarray1[i].waiting_for= 0;
+ loarray1[i].mutex= &mutexes[i];
+ loarray1[i].cond= &conds[i];
+ loarray1[i].loid= i+1;
+ }
+
+ for (i= 0; i < Ntbls; i++)
+ {
+ tablockman_init_locked_table(ltarray+i, Nlos);
+ }
+
+ test_tablockman_simple();
+
+#define CYCLES 10000
+#define THREADS Nlos /* don't change this line */
+
+ /* mixed load, stress-test with random locks */
+ Nrows= 100;
+ Ntables= 10;
+ table_lock_ratio= 10;
+ run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES);
+#if 0
+ /* "real-life" simulation - many rows, no table locks */
+ Nrows= 1000000;
+ Ntables= 10;
+ table_lock_ratio= 0;
+ run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10);
+#endif
+ for (i= 0; i < Nlos; i++)
+ {
+ tablockman_release_locks(&tablockman, &loarray1[i]);
+ pthread_mutex_destroy(loarray1[i].mutex);
+ pthread_cond_destroy(loarray1[i].cond);
+ }
+
+ {
+ ulonglong now= my_getsystime();
+ for (i= 0; i < Ntbls; i++)
+ {
+ tablockman_destroy_locked_table(ltarray+i);
+ }
+ tablockman_destroy(&tablockman);
+ now= my_getsystime()-now;
+ diag("lockman_destroy: %g secs", ((double)now)/1e7);
+ }
+
+ pthread_mutex_destroy(&rt_mutex);
+ my_end(0);
+ return exit_status();
+}
+
diff --git a/storage/maria/unittest/ma_control_file-t.c b/storage/maria/unittest/ma_control_file-t.c
new file mode 100644
index 00000000000..a7472361dad
--- /dev/null
+++ b/storage/maria/unittest/ma_control_file-t.c
@@ -0,0 +1,447 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Unit test of the control file module of the Maria engine WL#3234 */
+
+/*
+ Note that it is not possible to test the durability of the write (can't
+ pull the plug programmatically :)
+*/
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <tap.h>
+
+#ifndef WITH_MARIA_STORAGE_ENGINE
+/*
+ If Maria is not compiled in, normally we don't come to building this test.
+*/
+#error "Maria engine is not compiled in, test cannot be built"
+#endif
+
+#include "maria.h"
+#include "../../../storage/maria/maria_def.h"
+#include <my_getopt.h>
+
+char file_name[FN_REFLEN];
+
+/* The values we'll set and expect the control file module to return */
+LSN expect_checkpoint_lsn;
+uint32 expect_logno;
+
+static int delete_file(myf my_flags);
+/*
+ Those are test-specific wrappers around the module's API functions: after
+ calling the module's API functions they perform checks on the result.
+*/
+static int close_file(); /* wraps ma_control_file_end */
+static int create_or_open_file(); /* wraps ma_control_file_open_or_create */
+static int write_file(); /* wraps ma_control_file_write_and_force */
+
+/* Tests */
+static int test_one_log();
+static int test_five_logs();
+static int test_3_checkpoints_and_2_logs();
+static int test_binary_content();
+static int test_start_stop();
+static int test_2_open_and_2_close();
+static int test_bad_magic_string();
+static int test_bad_checksum();
+static int test_bad_size();
+
+/* Utility */
+static int verify_module_values_match_expected();
+static int verify_module_values_are_impossible();
+static void usage();
+static void get_options(int argc, char *argv[]);
+
+/*
+ If "expr" is FALSE, this macro will make the function print a diagnostic
+ message and immediately return 1.
+ This is inspired from assert() but does not crash the binary (sometimes we
+ may want to see how other tests go even if one fails).
+ RET_ERR means "return error".
+*/
+
+#define RET_ERR_UNLESS(expr) \
+ {if (!(expr)) {diag("line %d: failure: '%s'", __LINE__, #expr); return 1;}}
+
+
+int main(int argc,char *argv[])
+{
+ MY_INIT(argv[0]);
+ maria_data_root= ".";
+
+ plan(9);
+
+ diag("Unit tests for control file");
+
+ get_options(argc,argv);
+
+ diag("Deleting control file at startup, if there is an old one");
+ RET_ERR_UNLESS(0 == delete_file(0)); /* if fails, can't continue */
+
+ diag("Tests of normal conditions");
+ ok(0 == test_one_log(), "test of creating one log");
+ ok(0 == test_five_logs(), "test of creating five logs");
+ ok(0 == test_3_checkpoints_and_2_logs(),
+ "test of creating three checkpoints and two logs");
+ ok(0 == test_binary_content(), "test of the binary content of the file");
+ ok(0 == test_start_stop(), "test of multiple starts and stops");
+ diag("Tests of abnormal conditions");
+ ok(0 == test_2_open_and_2_close(),
+ "test of two open and two close (strange call sequence)");
+ ok(0 == test_bad_magic_string(), "test of bad magic string");
+ ok(0 == test_bad_checksum(), "test of bad checksum");
+ ok(0 == test_bad_size(), "test of too small/big file");
+
+ return exit_status();
+}
+
+
+static int delete_file(myf my_flags)
+{
+ RET_ERR_UNLESS(fn_format(file_name, CONTROL_FILE_BASE_NAME,
+ maria_data_root, "", MYF(MY_WME)) != NullS);
+ /*
+ Maybe file does not exist, ignore error.
+ The error will however be printed on stderr.
+ */
+ my_delete(file_name, my_flags);
+ expect_checkpoint_lsn= LSN_IMPOSSIBLE;
+ expect_logno= FILENO_IMPOSSIBLE;
+
+ return 0;
+}
+
+/*
+ Verifies that global values last_checkpoint_lsn and last_logno (belonging
+ to the module) match what we expect.
+*/
+static int verify_module_values_match_expected()
+{
+ RET_ERR_UNLESS(last_logno == expect_logno);
+ RET_ERR_UNLESS(last_checkpoint_lsn ==
+ expect_checkpoint_lsn);
+ return 0;
+}
+
+
+/*
+ Verifies that global values last_checkpoint_lsn and last_logno (belonging
+ to the module) are impossible (this is used when the file has been closed).
+*/
+static int verify_module_values_are_impossible()
+{
+ RET_ERR_UNLESS(last_logno == FILENO_IMPOSSIBLE);
+ RET_ERR_UNLESS(last_checkpoint_lsn ==
+ LSN_IMPOSSIBLE);
+ return 0;
+}
+
+
+static int close_file()
+{
+ /* Simulate shutdown */
+ ma_control_file_end();
+ /* Verify amnesia */
+ RET_ERR_UNLESS(verify_module_values_are_impossible() == 0);
+ return 0;
+}
+
+static int create_or_open_file()
+{
+ RET_ERR_UNLESS(ma_control_file_create_or_open(TRUE) == CONTROL_FILE_OK);
+ /* Check that the module reports expected information */
+ RET_ERR_UNLESS(verify_module_values_match_expected() == 0);
+ return 0;
+}
+
+static int write_file(const LSN checkpoint_lsn,
+ uint32 logno,
+ uint objs_to_write)
+{
+ RET_ERR_UNLESS(ma_control_file_write_and_force(checkpoint_lsn, logno,
+ objs_to_write) == 0);
+ /* Check that the module reports expected information */
+ RET_ERR_UNLESS(verify_module_values_match_expected() == 0);
+ return 0;
+}
+
+static int test_one_log()
+{
+ uint objs_to_write;
+
+ RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK);
+ objs_to_write= CONTROL_FILE_UPDATE_ONLY_LOGNO;
+ expect_logno= 123;
+ RET_ERR_UNLESS(write_file(LSN_IMPOSSIBLE,
+ expect_logno,
+ objs_to_write) == 0);
+ RET_ERR_UNLESS(close_file() == 0);
+ return 0;
+}
+
+static int test_five_logs()
+{
+ uint objs_to_write;
+ uint i;
+
+ RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK);
+ objs_to_write= CONTROL_FILE_UPDATE_ONLY_LOGNO;
+ expect_logno= 100;
+ for (i= 0; i<5; i++)
+ {
+ expect_logno*= 3;
+ RET_ERR_UNLESS(write_file(LSN_IMPOSSIBLE, expect_logno,
+ objs_to_write) == 0);
+ }
+ RET_ERR_UNLESS(close_file() == 0);
+ return 0;
+}
+
+static int test_3_checkpoints_and_2_logs()
+{
+ uint objs_to_write;
+ /*
+ Simulate one checkpoint, one log creation, two checkpoints, one
+ log creation.
+ */
+ RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK);
+ objs_to_write= CONTROL_FILE_UPDATE_ONLY_LSN;
+ expect_checkpoint_lsn= MAKE_LSN(5, 10000);
+ RET_ERR_UNLESS(write_file(expect_checkpoint_lsn,
+ expect_logno, objs_to_write) == 0);
+
+ objs_to_write= CONTROL_FILE_UPDATE_ONLY_LOGNO;
+ expect_logno= 17;
+ RET_ERR_UNLESS(write_file(expect_checkpoint_lsn,
+ expect_logno, objs_to_write) == 0);
+
+ objs_to_write= CONTROL_FILE_UPDATE_ONLY_LSN;
+ expect_checkpoint_lsn= MAKE_LSN(17, 20000);
+ RET_ERR_UNLESS(write_file(expect_checkpoint_lsn,
+ expect_logno, objs_to_write) == 0);
+
+ objs_to_write= CONTROL_FILE_UPDATE_ONLY_LSN;
+ expect_checkpoint_lsn= MAKE_LSN(17, 45000);
+ RET_ERR_UNLESS(write_file(expect_checkpoint_lsn,
+ expect_logno, objs_to_write) == 0);
+
+ objs_to_write= CONTROL_FILE_UPDATE_ONLY_LOGNO;
+ expect_logno= 19;
+ RET_ERR_UNLESS(write_file(expect_checkpoint_lsn,
+ expect_logno, objs_to_write) == 0);
+ RET_ERR_UNLESS(close_file() == 0);
+ return 0;
+}
+
+static int test_binary_content()
+{
+ uint i;
+ int fd;
+
+ /*
+ TEST4: actually check by ourselves the content of the file.
+ Note that constants (offsets) are hard-coded here, precisely to prevent
+ someone from changing them in the control file module and breaking
+ backward-compatibility.
+ TODO: when we reach the format-freeze state, we may even just do a
+ comparison with a raw binary string, to not depend on any uint4korr
+ future change/breakage.
+ */
+
+ char buffer[23];
+ RET_ERR_UNLESS((fd= my_open(file_name,
+ O_BINARY | O_RDWR,
+ MYF(MY_WME))) >= 0);
+ RET_ERR_UNLESS(my_read(fd, buffer, 23, MYF(MY_FNABP | MY_WME)) == 0);
+ RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+ RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK);
+ i= uint3korr(buffer+12);
+ RET_ERR_UNLESS(i == LSN_FILE_NO(last_checkpoint_lsn));
+ i= uint4korr(buffer+15);
+ RET_ERR_UNLESS(i == LSN_OFFSET(last_checkpoint_lsn));
+ i= uint4korr(buffer+19);
+ RET_ERR_UNLESS(i == last_logno);
+ RET_ERR_UNLESS(close_file() == 0);
+ return 0;
+}
+
+static int test_start_stop()
+{
+ /* TEST5: Simulate start/nothing/stop/start/nothing/stop/start */
+
+ RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+ RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+ RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+ return 0;
+}
+
+static int test_2_open_and_2_close()
+{
+ RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+ RET_ERR_UNLESS(close_file() == 0);
+ return 0;
+}
+
+
+static int test_bad_magic_string()
+{
+ char buffer[4];
+ int fd;
+
+ RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+
+ /* Corrupt magic string */
+ RET_ERR_UNLESS((fd= my_open(file_name,
+ O_BINARY | O_RDWR,
+ MYF(MY_WME))) >= 0);
+ RET_ERR_UNLESS(my_pread(fd, buffer, 4, 0, MYF(MY_FNABP | MY_WME)) == 0);
+ RET_ERR_UNLESS(my_pwrite(fd, "papa", 4, 0, MYF(MY_FNABP | MY_WME)) == 0);
+
+ /* Check that control file module sees the problem */
+ RET_ERR_UNLESS(ma_control_file_create_or_open(TRUE) ==
+ CONTROL_FILE_BAD_MAGIC_STRING);
+ /* Restore magic string */
+ RET_ERR_UNLESS(my_pwrite(fd, buffer, 4, 0, MYF(MY_FNABP | MY_WME)) == 0);
+ RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+ RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+ return 0;
+}
+
+static int test_bad_checksum()
+{
+ char buffer[4];
+ int fd;
+
+ RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+
+ /* Corrupt checksum */
+ RET_ERR_UNLESS((fd= my_open(file_name,
+ O_BINARY | O_RDWR,
+ MYF(MY_WME))) >= 0);
+ RET_ERR_UNLESS(my_pread(fd, buffer, 1, 8, MYF(MY_FNABP | MY_WME)) == 0);
+ buffer[0]+= 3; /* mangle checksum */
+ RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 8, MYF(MY_FNABP | MY_WME)) == 0);
+ /* Check that control file module sees the problem */
+ RET_ERR_UNLESS(ma_control_file_create_or_open(TRUE) ==
+ CONTROL_FILE_BAD_CHECKSUM);
+ /* Restore checksum */
+ buffer[0]-= 3;
+ RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 4, MYF(MY_FNABP | MY_WME)) == 0);
+ RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+
+ return 0;
+}
+
+
+static int test_bad_size()
+{
+ char buffer[]="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+ int fd;
+
+ /* A too short file */
+ RET_ERR_UNLESS(delete_file(MYF(MY_WME)) == 0);
+ RET_ERR_UNLESS((fd= my_open(file_name,
+ O_BINARY | O_RDWR | O_CREAT,
+ MYF(MY_WME))) >= 0);
+ RET_ERR_UNLESS(my_write(fd, buffer, 10, MYF(MY_FNABP | MY_WME)) == 0);
+ /* Check that control file module sees the problem */
+ RET_ERR_UNLESS(ma_control_file_create_or_open(TRUE) ==
+ CONTROL_FILE_TOO_SMALL);
+ RET_ERR_UNLESS(my_write(fd, buffer, 30, MYF(MY_FNABP | MY_WME)) == 0);
+ /* Check that control file module sees the problem */
+ RET_ERR_UNLESS(ma_control_file_create_or_open(TRUE) == CONTROL_FILE_TOO_BIG);
+ RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+
+ /* Leave a correct control file */
+ RET_ERR_UNLESS(delete_file(MYF(MY_WME)) == 0);
+ RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+
+ return 0;
+}
+
+
+static struct my_option my_long_options[] =
+{
+#ifndef DBUG_OFF
+ {"debug", '#', "Debug log.",
+ 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+ {"help", '?', "Display help and exit",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"version", 'V', "Print version number and exit",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+
+static void version()
+{
+ printf("ma_control_file_test: unit test for the control file "
+ "module of the Maria storage engine. Ver 1.0 \n");
+}
+
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+ char *argument)
+{
+ switch(optid) {
+ case 'V':
+ version();
+ exit(0);
+ case '#':
+ DBUG_PUSH (argument);
+ break;
+ case '?':
+ version();
+ usage();
+ exit(0);
+ }
+ return 0;
+}
+
+
+/* Read options */
+
+static void get_options(int argc, char *argv[])
+{
+ int ho_error;
+
+ if ((ho_error=handle_options(&argc, &argv, my_long_options,
+ get_one_option)))
+ exit(ho_error);
+
+ return;
+} /* get options */
+
+
+static void usage()
+{
+ printf("Usage: %s [options]\n\n", my_progname);
+ my_print_help(my_long_options);
+ my_print_variables(my_long_options);
+}
diff --git a/storage/maria/unittest/ma_maria_log_cleanup.c b/storage/maria/unittest/ma_maria_log_cleanup.c
new file mode 100644
index 00000000000..c5917764b9b
--- /dev/null
+++ b/storage/maria/unittest/ma_maria_log_cleanup.c
@@ -0,0 +1,45 @@
+#include "../maria_def.h"
+#include <my_dir.h>
+
+my_bool maria_log_remove()
+{
+ MY_DIR *dirp;
+ uint i;
+ MY_STAT stat_buff;
+ char file_name[FN_REFLEN];
+
+ /* Removes control file */
+ if (fn_format(file_name, CONTROL_FILE_BASE_NAME,
+ maria_data_root, "", MYF(MY_WME)) == NullS)
+ return 1;
+ if (my_stat(file_name, &stat_buff, MYF(0)) &&
+ my_delete(file_name, MYF(MY_WME)) != 0)
+ return 1;
+
+ /* Finds and removes transaction log files */
+ if (!(dirp = my_dir(maria_data_root, MYF(MY_DONT_SORT))))
+ return 1;
+
+ for (i= 0; i < dirp->number_off_files; i++)
+ {
+ char *file= dirp->dir_entry[i].name;
+ if (strncmp(file, "maria_log.", 10) == 0 &&
+ file[10] >= '0' && file[10] <= '9' &&
+ file[11] >= '0' && file[11] <= '9' &&
+ file[12] >= '0' && file[12] <= '9' &&
+ file[13] >= '0' && file[13] <= '9' &&
+ file[14] >= '0' && file[14] <= '9' &&
+ file[15] >= '0' && file[15] <= '9' &&
+ file[16] >= '0' && file[16] <= '9' &&
+ file[17] >= '0' && file[17] <= '9' &&
+ file[18] == '\0')
+ {
+ if (fn_format(file_name, file,
+ maria_data_root, "", MYF(MY_WME)) == NullS ||
+ my_delete(file_name, MYF(MY_WME)) != 0)
+ return 1;
+ }
+ }
+ return 0;
+}
+
diff --git a/storage/maria/unittest/ma_pagecache_consist.c b/storage/maria/unittest/ma_pagecache_consist.c
new file mode 100644
index 00000000000..54491a09c3b
--- /dev/null
+++ b/storage/maria/unittest/ma_pagecache_consist.c
@@ -0,0 +1,459 @@
+/*
+ TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in
+ my_atomic-t.c (see BUG#22320).
+ Use diag() instead of fprintf(stderr). Use ok() and plan().
+*/
+
+#include <tap.h>
+#include <my_sys.h>
+#include <m_string.h>
+#include "test_file.h"
+#include <tap.h>
+
+#define PCACHE_SIZE (PAGE_SIZE*1024*8)
+
+#ifndef DBUG_OFF
+static const char* default_dbug_option;
+#endif
+
+static char *file1_name= (char*)"page_cache_test_file_1";
+static PAGECACHE_FILE file1;
+static pthread_cond_t COND_thread_count;
+static pthread_mutex_t LOCK_thread_count;
+static uint thread_count;
+static PAGECACHE pagecache;
+
+#ifdef TEST_HIGH_CONCURENCY
+static uint number_of_readers= 10;
+static uint number_of_writers= 20;
+static uint number_of_tests= 30000;
+static uint record_length_limit= PAGE_SIZE/200;
+static uint number_of_pages= 20;
+static uint flush_divider= 1000;
+#else /*TEST_HIGH_CONCURENCY*/
+#ifdef TEST_READERS
+static uint number_of_readers= 10;
+static uint number_of_writers= 1;
+static uint number_of_tests= 30000;
+static uint record_length_limit= PAGE_SIZE/200;
+static uint number_of_pages= 20;
+static uint flush_divider= 1000;
+#else /*TEST_READERS*/
+#ifdef TEST_WRITERS
+static uint number_of_readers= 0;
+static uint number_of_writers= 10;
+static uint number_of_tests= 30000;
+static uint record_length_limit= PAGE_SIZE/200;
+static uint number_of_pages= 20;
+static uint flush_divider= 1000;
+#else /*TEST_WRITERS*/
+static uint number_of_readers= 10;
+static uint number_of_writers= 10;
+static uint number_of_tests= 50000;
+static uint record_length_limit= PAGE_SIZE/200;
+static uint number_of_pages= 20000;
+static uint flush_divider= 1000;
+#endif /*TEST_WRITERS*/
+#endif /*TEST_READERS*/
+#endif /*TEST_HIGH_CONCURENCY*/
+
+
+/*
+ Get pseudo-random length of the field in (0;limit)
+
+ SYNOPSYS
+ get_len()
+ limit limit for generated value
+
+ RETURN
+ length where length >= 0 & length < limit
+*/
+
+static uint get_len(uint limit)
+{
+ uint32 rec_len;
+ do
+ {
+ rec_len= random() /
+ (RAND_MAX / limit);
+ } while (rec_len >= limit || rec_len == 0);
+ return rec_len;
+}
+
+
+/* check page consistency */
+uint check_page(uchar *buff, ulong offset, int page_locked, int page_no,
+ int tag)
+{
+ uint end= sizeof(uint);
+ uint num= *((uint *)buff);
+ uint i;
+ DBUG_ENTER("check_page");
+
+ for (i= 0; i < num; i++)
+ {
+ uint len= *((uint *)(buff + end));
+ uint j;
+ end+= sizeof(uint) + sizeof(uint);
+ if (len + end > PAGE_SIZE)
+ {
+ diag("incorrect field header #%u by offset %lu\n", i, offset + end);
+ goto err;
+ }
+ for(j= 0; j < len; j++)
+ {
+ if (buff[end + j] != (uchar)((i+1) % 256))
+ {
+ diag("incorrect %lu byte\n", offset + end + j);
+ goto err;
+ }
+ }
+ end+= len;
+ }
+ for(i= end; i < PAGE_SIZE; i++)
+ {
+ if (buff[i] != 0)
+ {
+ int h;
+ DBUG_PRINT("err",
+ ("byte %lu (%lu + %u), page %u (%s, end: %u, recs: %u, tag: %d) should be 0\n",
+ offset + i, offset, i, page_no,
+ (page_locked ? "locked" : "unlocked"),
+ end, num, tag));
+ diag("byte %lu (%lu + %u), page %u (%s, end: %u, recs: %u, tag: %d) should be 0\n",
+ offset + i, offset, i, page_no,
+ (page_locked ? "locked" : "unlocked"),
+ end, num, tag);
+ h= my_open("wrong_page", O_CREAT | O_TRUNC | O_RDWR, MYF(0));
+ my_pwrite(h, (uchar*) buff, PAGE_SIZE, 0, MYF(0));
+ my_close(h, MYF(0));
+ goto err;
+ }
+ }
+ DBUG_RETURN(end);
+err:
+ DBUG_PRINT("err", ("try to flush"));
+ if (page_locked)
+ {
+ pagecache_delete(&pagecache, &file1, page_no,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED, 1);
+ }
+ else
+ {
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_RELEASE);
+ }
+ exit(1);
+}
+
+void put_rec(uchar *buff, uint end, uint len, uint tag)
+{
+ uint i;
+ uint num= *((uint *)buff);
+ if (!len)
+ len= 1;
+ if (end + sizeof(uint)*2 + len > PAGE_SIZE)
+ return;
+ *((uint *)(buff + end))= len;
+ end+= sizeof(uint);
+ *((uint *)(buff + end))= tag;
+ end+= sizeof(uint);
+ num++;
+ *((uint *)buff)= num;
+ *((uint*)(buff + end))= len;
+ for (i= end; i < (len + end); i++)
+ {
+ buff[i]= (uchar) num % 256;
+ }
+}
+
+/*
+ Recreate and reopen a file for test
+
+ SYNOPSIS
+ reset_file()
+ file File to reset
+ file_name Path (and name) of file which should be reset
+*/
+
+void reset_file(PAGECACHE_FILE file, char *file_name)
+{
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_RELEASE);
+ if (my_close(file1.file, MYF(0)) != 0)
+ {
+ diag("Got error during %s closing from close() (errno: %d)\n",
+ file_name, errno);
+ exit(1);
+ }
+ my_delete(file_name, MYF(0));
+ if ((file.file= my_open(file_name,
+ O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
+ {
+ diag("Got error during %s creation from open() (errno: %d)\n",
+ file_name, errno);
+ exit(1);
+ }
+}
+
+
+void reader(int num)
+{
+ unsigned char *buffr= malloc(PAGE_SIZE);
+ uint i;
+
+ for (i= 0; i < number_of_tests; i++)
+ {
+ uint page= get_len(number_of_pages);
+ pagecache_read(&pagecache, &file1, page, 3, (char*)buffr,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ 0);
+ check_page(buffr, page * PAGE_SIZE, 0, page, -num);
+ if (i % 500 == 0)
+ printf("reader%d: %d\n", num, i);
+
+ }
+ printf("reader%d: done\n", num);
+ free(buffr);
+}
+
+
+void writer(int num)
+{
+ unsigned char *buffr= malloc(PAGE_SIZE);
+ uint i;
+
+ for (i= 0; i < number_of_tests; i++)
+ {
+ uint end;
+ uint page= get_len(number_of_pages);
+ pagecache_read(&pagecache, &file1, page, 3, (char*)buffr,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_WRITE,
+ 0);
+ end= check_page(buffr, page * PAGE_SIZE, 1, page, num);
+ put_rec(buffr, end, get_len(record_length_limit), num);
+ pagecache_write(&pagecache, &file1, page, 3, (char*)buffr,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_WRITE_UNLOCK,
+ PAGECACHE_UNPIN,
+ PAGECACHE_WRITE_DELAY,
+ 0);
+
+ if (i % flush_divider == 0)
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+ if (i % 500 == 0)
+ printf("writer%d: %d\n", num, i);
+ }
+ printf("writer%d: done\n", num);
+ free(buffr);
+}
+
+
+static void *test_thread_reader(void *arg)
+{
+ int param=*((int*) arg);
+
+ my_thread_init();
+ DBUG_ENTER("test_reader");
+ DBUG_PRINT("enter", ("param: %d", param));
+
+ reader(param);
+
+ DBUG_PRINT("info", ("Thread %s ended\n", my_thread_name()));
+ pthread_mutex_lock(&LOCK_thread_count);
+ thread_count--;
+ VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */
+ pthread_mutex_unlock(&LOCK_thread_count);
+ free((uchar*) arg);
+ my_thread_end();
+ DBUG_RETURN(0);
+}
+
+static void *test_thread_writer(void *arg)
+{
+ int param=*((int*) arg);
+
+ my_thread_init();
+ DBUG_ENTER("test_writer");
+ DBUG_PRINT("enter", ("param: %d", param));
+
+ writer(param);
+
+ DBUG_PRINT("info", ("Thread %s ended\n", my_thread_name()));
+ pthread_mutex_lock(&LOCK_thread_count);
+ thread_count--;
+ VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */
+ pthread_mutex_unlock(&LOCK_thread_count);
+ free((uchar*) arg);
+ my_thread_end();
+ DBUG_RETURN(0);
+}
+
+int main(int argc __attribute__((unused)),
+ char **argv __attribute__((unused)))
+{
+ pthread_t tid;
+ pthread_attr_t thr_attr;
+ int *param, error, pagen;
+
+ MY_INIT(argv[0]);
+
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+ default_dbug_option= "d:t:i:O,\\test_pagecache_consist.trace";
+#else
+ default_dbug_option= "d:t:i:o,/tmp/test_pagecache_consist.trace";
+#endif
+ if (argc > 1)
+ {
+ DBUG_SET(default_dbug_option);
+ DBUG_SET_INITIAL(default_dbug_option);
+ }
+#endif
+
+
+ DBUG_ENTER("main");
+ DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name()));
+ if ((file1.file= my_open(file1_name,
+ O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
+ {
+ fprintf(stderr, "Got error during file1 creation from open() (errno: %d)\n",
+ errno);
+ exit(1);
+ }
+ DBUG_PRINT("info", ("file1: %d", file1.file));
+ if (chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO) != 0)
+ {
+ fprintf(stderr, "Got error during file1 chmod() (errno: %d)\n",
+ errno);
+ exit(1);
+ }
+ my_pwrite(file1.file, "test file", 9, 0, MYF(0));
+
+ if ((error= pthread_cond_init(&COND_thread_count, NULL)))
+ {
+ fprintf(stderr, "COND_thread_count: %d from pthread_cond_init (errno: %d)\n",
+ error, errno);
+ exit(1);
+ }
+ if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST)))
+ {
+ fprintf(stderr, "LOCK_thread_count: %d from pthread_cond_init (errno: %d)\n",
+ error, errno);
+ exit(1);
+ }
+
+ if ((error= pthread_attr_init(&thr_attr)))
+ {
+ fprintf(stderr,"Got error: %d from pthread_attr_init (errno: %d)\n",
+ error,errno);
+ exit(1);
+ }
+ if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED)))
+ {
+ fprintf(stderr,
+ "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n",
+ error,errno);
+ exit(1);
+ }
+
+#ifdef HAVE_THR_SETCONCURRENCY
+ VOID(thr_setconcurrency(2));
+#endif
+
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ PAGE_SIZE)) == 0)
+ {
+ fprintf(stderr,"Got error: init_pagecache() (errno: %d)\n",
+ errno);
+ exit(1);
+ }
+ DBUG_PRINT("info", ("Page cache %d pages", pagen));
+ {
+ unsigned char *buffr= malloc(PAGE_SIZE);
+ uint i;
+ memset(buffr, '\0', PAGE_SIZE);
+ for (i= 0; i < number_of_pages; i++)
+ {
+ pagecache_write(&pagecache, &file1, i, 3, (char*)buffr,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0);
+ }
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+ free(buffr);
+ }
+ if ((error= pthread_mutex_lock(&LOCK_thread_count)))
+ {
+ fprintf(stderr,"LOCK_thread_count: %d from pthread_mutex_lock (errno: %d)\n",
+ error,errno);
+ exit(1);
+ }
+ while (number_of_readers != 0 || number_of_writers != 0)
+ {
+ if (number_of_readers != 0)
+ {
+ param=(int*) malloc(sizeof(int));
+ *param= number_of_readers;
+ if ((error= pthread_create(&tid, &thr_attr, test_thread_reader,
+ (void*) param)))
+ {
+ fprintf(stderr,"Got error: %d from pthread_create (errno: %d)\n",
+ error,errno);
+ exit(1);
+ }
+ thread_count++;
+ number_of_readers--;
+ }
+ if (number_of_writers != 0)
+ {
+ param=(int*) malloc(sizeof(int));
+ *param= number_of_writers;
+ if ((error= pthread_create(&tid, &thr_attr, test_thread_writer,
+ (void*) param)))
+ {
+ fprintf(stderr,"Got error: %d from pthread_create (errno: %d)\n",
+ error,errno);
+ exit(1);
+ }
+ thread_count++;
+ number_of_writers--;
+ }
+ }
+ DBUG_PRINT("info", ("Thread started"));
+ pthread_mutex_unlock(&LOCK_thread_count);
+
+ pthread_attr_destroy(&thr_attr);
+
+ /* wait finishing */
+ if ((error= pthread_mutex_lock(&LOCK_thread_count)))
+ fprintf(stderr,"LOCK_thread_count: %d from pthread_mutex_lock\n",error);
+ while (thread_count)
+ {
+ if ((error= pthread_cond_wait(&COND_thread_count,&LOCK_thread_count)))
+ fprintf(stderr,"COND_thread_count: %d from pthread_cond_wait\n",error);
+ }
+ if ((error= pthread_mutex_unlock(&LOCK_thread_count)))
+ fprintf(stderr,"LOCK_thread_count: %d from pthread_mutex_unlock\n",error);
+ DBUG_PRINT("info", ("thread ended"));
+
+ end_pagecache(&pagecache, 1);
+ DBUG_PRINT("info", ("Page cache ended"));
+
+ if (my_close(file1.file, MYF(0)) != 0)
+ {
+ fprintf(stderr, "Got error during file1 closing from close() (errno: %d)\n",
+ errno);
+ exit(1);
+ }
+ /*my_delete(file1_name, MYF(0));*/
+ my_end(0);
+
+ DBUG_PRINT("info", ("file1 (%d) closed", file1.file));
+
+ DBUG_PRINT("info", ("Program end"));
+
+ DBUG_RETURN(exit_status());
+}
diff --git a/storage/maria/unittest/ma_pagecache_single.c b/storage/maria/unittest/ma_pagecache_single.c
new file mode 100644
index 00000000000..8add95e8a36
--- /dev/null
+++ b/storage/maria/unittest/ma_pagecache_single.c
@@ -0,0 +1,588 @@
+/*
+ TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in
+ my_atomic-t.c (see BUG#22320).
+ Use diag() instead of fprintf(stderr).
+*/
+#include <tap.h>
+#include <my_sys.h>
+#include <m_string.h>
+#include "test_file.h"
+#include <tap.h>
+
+#define PCACHE_SIZE (PAGE_SIZE*1024*10)
+
+#ifndef DBUG_OFF
+static const char* default_dbug_option;
+#endif
+
+static char *file1_name= (char*)"page_cache_test_file_1";
+static PAGECACHE_FILE file1;
+static pthread_cond_t COND_thread_count;
+static pthread_mutex_t LOCK_thread_count;
+static uint thread_count;
+static PAGECACHE pagecache;
+
+/*
+ File contance descriptors
+*/
+static struct file_desc simple_read_write_test_file[]=
+{
+ {PAGE_SIZE, '\1'},
+ { 0, 0}
+};
+static struct file_desc simple_read_change_write_read_test_file[]=
+{
+ {PAGE_SIZE/2, '\65'},
+ {PAGE_SIZE/2, '\1'},
+ { 0, 0}
+};
+static struct file_desc simple_pin_test_file1[]=
+{
+ {PAGE_SIZE*2, '\1'},
+ { 0, 0}
+};
+static struct file_desc simple_pin_test_file2[]=
+{
+ {PAGE_SIZE/2, '\1'},
+ {PAGE_SIZE/2, (unsigned char)129},
+ {PAGE_SIZE, '\1'},
+ { 0, 0}
+};
+static struct file_desc simple_delete_forget_test_file[]=
+{
+ {PAGE_SIZE, '\1'},
+ { 0, 0}
+};
+static struct file_desc simple_delete_flush_test_file[]=
+{
+ {PAGE_SIZE, '\2'},
+ { 0, 0}
+};
+
+
+/*
+ Recreate and reopen a file for test
+
+ SYNOPSIS
+ reset_file()
+ file File to reset
+ file_name Path (and name) of file which should be reset
+*/
+
+void reset_file(PAGECACHE_FILE file, char *file_name)
+{
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_RELEASE);
+ if (my_close(file1.file, MYF(0)) != 0)
+ {
+ diag("Got error during %s closing from close() (errno: %d)\n",
+ file_name, errno);
+ exit(1);
+ }
+ my_delete(file_name, MYF(0));
+ if ((file.file= my_open(file_name,
+ O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
+ {
+ diag("Got error during %s creation from open() (errno: %d)\n",
+ file_name, errno);
+ exit(1);
+ }
+}
+
+/*
+ Write then read page, check file on disk
+*/
+
+int simple_read_write_test()
+{
+ unsigned char *buffw= malloc(PAGE_SIZE);
+ unsigned char *buffr= malloc(PAGE_SIZE);
+ int res;
+ DBUG_ENTER("simple_read_write_test");
+ bfill(buffw, PAGE_SIZE, '\1');
+ pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0);
+ pagecache_read(&pagecache, &file1, 0, 3, (char*)buffr,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ 0);
+ ok((res= test(memcmp(buffr, buffw, PAGE_SIZE) == 0)),
+ "Simple write-read page ");
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+ ok((res&= test(test_file(file1, file1_name, PAGE_SIZE, PAGE_SIZE,
+ simple_read_write_test_file))),
+ "Simple write-read page file");
+ if (res)
+ reset_file(file1, file1_name);
+ free(buffw);
+ free(buffr);
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Prepare page, then read (and lock), change (write new value and unlock),
+ then check the page in the cache and on the disk
+*/
+int simple_read_change_write_read_test()
+{
+ unsigned char *buffw= malloc(PAGE_SIZE);
+ unsigned char *buffr= malloc(PAGE_SIZE);
+ int res;
+ DBUG_ENTER("simple_read_change_write_read_test");
+ /* prepare the file */
+ bfill(buffw, PAGE_SIZE, '\1');
+ pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0);
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+ /* test */
+ pagecache_read(&pagecache, &file1, 0, 3, (char*)buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_WRITE,
+ 0);
+ bfill(buffw, PAGE_SIZE/2, '\65');
+ pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_WRITE_UNLOCK,
+ PAGECACHE_UNPIN,
+ PAGECACHE_WRITE_DELAY,
+ 0);
+
+ pagecache_read(&pagecache, &file1, 0, 3, (char*)buffr,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ 0);
+ ok((res= test(memcmp(buffr, buffw, PAGE_SIZE) == 0)),
+ "Simple read-change-write-read page ");
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+ ok((res&= test(test_file(file1, file1_name, PAGE_SIZE, PAGE_SIZE,
+ simple_read_change_write_read_test_file))),
+ "Simple read-change-write-read page file");
+ if (res)
+ reset_file(file1, file1_name);
+ free(buffw);
+ free(buffr);
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Prepare page, read page 0 (and pin) then write page 1 and page 0.
+ Flush the file (shold flush only page 1 and return 1 (page 0 is
+ still pinned).
+ Check file on the disk.
+ Unpin and flush.
+ Check file on the disk.
+*/
+int simple_pin_test()
+{
+ unsigned char *buffw= malloc(PAGE_SIZE);
+ unsigned char *buffr= malloc(PAGE_SIZE);
+ int res;
+ DBUG_ENTER("simple_pin_test");
+ /* prepare the file */
+ bfill(buffw, PAGE_SIZE, '\1');
+ pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0);
+ /* test */
+ if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+ {
+ diag("error in flush_pagecache_blocks\n");
+ exit(1);
+ }
+ pagecache_read(&pagecache, &file1, 0, 3, (char*)buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_WRITE,
+ 0);
+ pagecache_write(&pagecache, &file1, 1, 3, (char*)buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0);
+ bfill(buffw + PAGE_SIZE/2, PAGE_SIZE/2, ((unsigned char) 129));
+ pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_WRITE_TO_READ,
+ PAGECACHE_PIN_LEFT_PINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0);
+ /*
+ We have to get error because one page of the file is pinned,
+ other page should be flushed
+ */
+ if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+ {
+ diag("Did not get error in flush_pagecache_blocks\n");
+ res= 0;
+ goto err;
+ }
+ ok((res= test(test_file(file1, file1_name, PAGE_SIZE*2, PAGE_SIZE*2,
+ simple_pin_test_file1))),
+ "Simple pin page file with pin");
+ pagecache_unlock(&pagecache,
+ &file1,
+ 0,
+ PAGECACHE_LOCK_READ_UNLOCK,
+ PAGECACHE_UNPIN,
+ 0, 0);
+ if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+ {
+ diag("Got error in flush_pagecache_blocks\n");
+ res= 0;
+ goto err;
+ }
+ ok((res&= test(test_file(file1, file1_name, PAGE_SIZE*2, PAGE_SIZE,
+ simple_pin_test_file2))),
+ "Simple pin page result file");
+ if (res)
+ reset_file(file1, file1_name);
+err:
+ free(buffw);
+ free(buffr);
+ DBUG_RETURN(res);
+}
+
+/*
+ Prepare page, write new value, then delete page from cache without flush,
+ on the disk should be page with old content written during preparation
+*/
+
+int simple_delete_forget_test()
+{
+ unsigned char *buffw= malloc(PAGE_SIZE);
+ unsigned char *buffr= malloc(PAGE_SIZE);
+ int res;
+ DBUG_ENTER("simple_delete_forget_test");
+ /* prepare the file */
+ bfill(buffw, PAGE_SIZE, '\1');
+ pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0);
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+ /* test */
+ bfill(buffw, PAGE_SIZE, '\2');
+ pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0);
+ pagecache_delete(&pagecache, &file1, 0,
+ PAGECACHE_LOCK_WRITE, 0);
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+ ok((res= test(test_file(file1, file1_name, PAGE_SIZE, PAGE_SIZE,
+ simple_delete_forget_test_file))),
+ "Simple delete-forget page file");
+ if (res)
+ reset_file(file1, file1_name);
+ free(buffw);
+ free(buffr);
+ DBUG_RETURN(res);
+}
+
+/*
+ Prepare page with locking, write new content to the page,
+ delete page with flush and on existing lock,
+ check that page on disk contain new value.
+*/
+
+int simple_delete_flush_test()
+{
+ unsigned char *buffw= malloc(PAGE_SIZE);
+ unsigned char *buffr= malloc(PAGE_SIZE);
+ int res;
+ DBUG_ENTER("simple_delete_flush_test");
+ /* prepare the file */
+ bfill(buffw, PAGE_SIZE, '\1');
+ pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_WRITE,
+ PAGECACHE_PIN,
+ PAGECACHE_WRITE_DELAY,
+ 0);
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+ /* test */
+ bfill(buffw, PAGE_SIZE, '\2');
+ pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ PAGECACHE_PIN_LEFT_PINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0);
+ pagecache_delete(&pagecache, &file1, 0,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED, 1);
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+ ok((res= test(test_file(file1, file1_name, PAGE_SIZE, PAGE_SIZE,
+ simple_delete_flush_test_file))),
+ "Simple delete-forget page file");
+ if (res)
+ reset_file(file1, file1_name);
+ free(buffw);
+ free(buffr);
+ DBUG_RETURN(res);
+}
+
+
+/*
+ write then read file bigger then cache
+*/
+
+int simple_big_test()
+{
+ unsigned char *buffw= (unsigned char *)malloc(PAGE_SIZE);
+ unsigned char *buffr= (unsigned char *)malloc(PAGE_SIZE);
+ struct file_desc *desc=
+ (struct file_desc *)malloc((PCACHE_SIZE/(PAGE_SIZE/2) + 1) *
+ sizeof(struct file_desc));
+ int res, i;
+ DBUG_ENTER("simple_big_test");
+ /* prepare the file twice larger then cache */
+ for (i= 0; i < PCACHE_SIZE/(PAGE_SIZE/2); i++)
+ {
+ bfill(buffw, PAGE_SIZE, (unsigned char) (i & 0xff));
+ desc[i].length= PAGE_SIZE;
+ desc[i].content= (i & 0xff);
+ pagecache_write(&pagecache, &file1, i, 3, (char*)buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0);
+ }
+ desc[i].length= 0;
+ desc[i].content= '\0';
+ ok(1, "Simple big file write");
+ /* check written pages sequentally read */
+ for (i= 0; i < PCACHE_SIZE/(PAGE_SIZE/2); i++)
+ {
+ int j;
+ pagecache_read(&pagecache, &file1, i, 3, (char*)buffr,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ 0);
+ for(j= 0; j < PAGE_SIZE; j++)
+ {
+ if (buffr[j] != (i & 0xff))
+ {
+ diag("simple_big_test seq: page %u byte %u mismatch\n", i, j);
+ return 0;
+ }
+ }
+ }
+ ok(1, "Simple big file sequential read");
+ /* chack random reads */
+ for (i= 0; i < PCACHE_SIZE/(PAGE_SIZE); i++)
+ {
+ int j, page;
+ page= rand() % (PCACHE_SIZE/(PAGE_SIZE/2));
+ pagecache_read(&pagecache, &file1, page, 3, (char*)buffr,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ 0);
+ for(j= 0; j < PAGE_SIZE; j++)
+ {
+ if (buffr[j] != (page & 0xff))
+ {
+ diag("simple_big_test rnd: page %u byte %u mismatch\n", page, j);
+ return 0;
+ }
+ }
+ }
+ ok(1, "Simple big file random read");
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+
+ ok((res= test(test_file(file1, file1_name, PCACHE_SIZE*2, PAGE_SIZE,
+ desc))),
+ "Simple big file");
+ if (res)
+ reset_file(file1, file1_name);
+ free(buffw);
+ free(buffr);
+ DBUG_RETURN(res);
+}
+/*
+ Thread function
+*/
+
+static void *test_thread(void *arg)
+{
+#ifndef DBUG_OFF
+ int param= *((int*) arg);
+#endif
+
+ my_thread_init();
+ DBUG_ENTER("test_thread");
+
+ DBUG_PRINT("enter", ("param: %d", param));
+
+ if (!simple_read_write_test() ||
+ !simple_read_change_write_read_test() ||
+ !simple_pin_test() ||
+ !simple_delete_forget_test() ||
+ !simple_delete_flush_test())
+ exit(1);
+
+ SKIP_BIG_TESTS(4)
+ {
+ if (!simple_big_test())
+ exit(1);
+ }
+
+ DBUG_PRINT("info", ("Thread %s ended\n", my_thread_name()));
+ pthread_mutex_lock(&LOCK_thread_count);
+ thread_count--;
+ VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */
+ pthread_mutex_unlock(&LOCK_thread_count);
+ free((uchar*) arg);
+ my_thread_end();
+ DBUG_RETURN(0);
+}
+
+
+int main(int argc __attribute__((unused)),
+ char **argv __attribute__((unused)))
+{
+ pthread_t tid;
+ pthread_attr_t thr_attr;
+ int *param, error, pagen;
+
+ MY_INIT(argv[0]);
+
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+ default_dbug_option= "d:t:i:O,\\test_pagecache_single.trace";
+#else
+ default_dbug_option= "d:t:i:o,/tmp/test_pagecache_single.trace";
+#endif
+ if (argc > 1)
+ {
+ DBUG_SET(default_dbug_option);
+ DBUG_SET_INITIAL(default_dbug_option);
+ }
+#endif
+
+
+ DBUG_ENTER("main");
+ DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name()));
+ if ((file1.file= my_open(file1_name,
+ O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
+ {
+ fprintf(stderr, "Got error during file1 creation from open() (errno: %d)\n",
+ errno);
+ exit(1);
+ }
+ DBUG_PRINT("info", ("file1: %d", file1.file));
+ if (chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO) != 0)
+ {
+ fprintf(stderr, "Got error during file1 chmod() (errno: %d)\n",
+ errno);
+ exit(1);
+ }
+ my_pwrite(file1.file, "test file", 9, 0, MYF(0));
+
+ if ((error= pthread_cond_init(&COND_thread_count, NULL)))
+ {
+ fprintf(stderr, "Got error: %d from pthread_cond_init (errno: %d)\n",
+ error, errno);
+ exit(1);
+ }
+ if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST)))
+ {
+ fprintf(stderr, "Got error: %d from pthread_cond_init (errno: %d)\n",
+ error, errno);
+ exit(1);
+ }
+
+ if ((error= pthread_attr_init(&thr_attr)))
+ {
+ fprintf(stderr,"Got error: %d from pthread_attr_init (errno: %d)\n",
+ error,errno);
+ exit(1);
+ }
+ if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED)))
+ {
+ fprintf(stderr,
+ "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n",
+ error,errno);
+ exit(1);
+ }
+
+#ifdef HAVE_THR_SETCONCURRENCY
+ VOID(thr_setconcurrency(2));
+#endif
+
+ plan(12);
+
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ PAGE_SIZE)) == 0)
+ {
+ fprintf(stderr,"Got error: init_pagecache() (errno: %d)\n",
+ errno);
+ exit(1);
+ }
+ DBUG_PRINT("info", ("Page cache %d pages", pagen));
+
+ if ((error=pthread_mutex_lock(&LOCK_thread_count)))
+ {
+ fprintf(stderr,"Got error: %d from pthread_mutex_lock (errno: %d)\n",
+ error,errno);
+ exit(1);
+ }
+ param=(int*) malloc(sizeof(int));
+ *param= 1;
+ if ((error= pthread_create(&tid, &thr_attr, test_thread, (void*) param)))
+ {
+ fprintf(stderr,"Got error: %d from pthread_create (errno: %d)\n",
+ error,errno);
+ exit(1);
+ }
+ thread_count++;
+ DBUG_PRINT("info", ("Thread started"));
+ pthread_mutex_unlock(&LOCK_thread_count);
+
+ pthread_attr_destroy(&thr_attr);
+
+ if ((error= pthread_mutex_lock(&LOCK_thread_count)))
+ fprintf(stderr,"Got error: %d from pthread_mutex_lock\n",error);
+ while (thread_count)
+ {
+ if ((error= pthread_cond_wait(&COND_thread_count,&LOCK_thread_count)))
+ fprintf(stderr,"Got error: %d from pthread_cond_wait\n",error);
+ }
+ if ((error= pthread_mutex_unlock(&LOCK_thread_count)))
+ fprintf(stderr,"Got error: %d from pthread_mutex_unlock\n",error);
+ DBUG_PRINT("info", ("thread ended"));
+
+ end_pagecache(&pagecache, 1);
+ DBUG_PRINT("info", ("Page cache ended"));
+
+ if (my_close(file1.file, MYF(0)) != 0)
+ {
+ fprintf(stderr, "Got error during file1 closing from close() (errno: %d)\n",
+ errno);
+ exit(1);
+ }
+ /*my_delete(file1_name, MYF(0));*/
+ my_end(0);
+
+ DBUG_PRINT("info", ("file1 (%d) closed", file1.file));
+
+ DBUG_PRINT("info", ("Program end"));
+
+ DBUG_RETURN(exit_status());
+}
diff --git a/storage/maria/unittest/ma_test_loghandler-t.c b/storage/maria/unittest/ma_test_loghandler-t.c
new file mode 100644
index 00000000000..6ea45f80433
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler-t.c
@@ -0,0 +1,617 @@
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+static TRN *trn= &dummy_transaction_object;
+
+#define PCACHE_SIZE (1024*1024*10)
+
+#define LONG_BUFFER_SIZE (100 * 1024)
+
+#ifdef LONG_LOG_TEST
+#define LOG_FLAGS 0
+#define LOG_FILE_SIZE (1024L*1024L)
+#define ITERATIONS (1600*4)
+
+#else
+#define LOG_FLAGS (TRANSLOG_SECTOR_PROTECTION | TRANSLOG_PAGE_CRC)
+#define LOG_FILE_SIZE (1024L*1024L*3L)
+#define ITERATIONS 1600
+#endif
+
+/*
+#define LOG_FLAGS 0
+#define LOG_FILE_SIZE 1024L*1024L*1024L
+#define ITERATIONS 181000
+*/
+
+/*
+#define LOG_FLAGS 0
+#define LOG_FILE_SIZE 1024L*1024L*3L
+#define ITERATIONS 1600
+*/
+
+/*
+#define LOG_FLAGS 0
+#define LOG_FILE_SIZE 1024L*1024L*100L
+#define ITERATIONS 65000
+*/
+
+/*
+ Check that the buffer filled correctly
+
+ SYNOPSIS
+ check_content()
+ ptr Pointer to the buffer
+ length length of the buffer
+
+ RETURN
+ 0 - OK
+ 1 - Error
+*/
+
+static my_bool check_content(uchar *ptr, ulong length)
+{
+ ulong i;
+ uchar buff[2];
+ for (i= 0; i < length; i++)
+ {
+ if (i % 2 == 0)
+ int2store(buff, i >> 1);
+ if (ptr[i] != buff[i % 2])
+ {
+ fprintf(stderr, "Byte # %lu is %x instead of %x",
+ i, (uint) ptr[i], (uint) buff[i % 2]);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+
+/*
+ Report OK for read operation
+
+ SYNOPSIS
+ read_ok()
+ rec the record header
+*/
+
+void read_ok(TRANSLOG_HEADER_BUFFER *rec)
+{
+ char buff[80];
+ snprintf(buff, sizeof(buff), "read record type: %u LSN: (%lu,0x%lx)",
+ rec->type, LSN_IN_PARTS(rec->lsn));
+ ok(1, buff);
+}
+
+/*
+ Read whole record content, and check content (put with offset)
+
+ SYNOPSIS
+ read_and_check_content()
+ rec The record header buffer
+ buffer The buffer to read the record in
+ skip Skip this number of bytes ot the record content
+
+ RETURN
+ 0 - OK
+ 1 - Error
+*/
+
+static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec,
+ uchar *buffer, uint skip)
+{
+ DBUG_ASSERT(rec->record_length < LONG_BUFFER_SIZE * 2 + 7 * 2 + 2);
+ if (translog_read_record(rec->lsn, 0, rec->record_length, buffer, NULL) !=
+ rec->record_length)
+ return 1;
+ return check_content(buffer + skip, rec->record_length - skip);
+}
+
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+ uint32 i;
+ uint32 rec_len;
+ uint pagen;
+ uchar long_tr_id[6];
+ uchar lsn_buff[23]=
+ {
+ 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA,
+ 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA,
+ 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55
+ };
+ uchar long_buffer[LONG_BUFFER_SIZE * 2 + LSN_STORE_SIZE * 2 + 2];
+ PAGECACHE pagecache;
+ LSN lsn, lsn_base, first_lsn;
+ TRANSLOG_HEADER_BUFFER rec;
+ LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 3];
+ struct st_translog_scanner_data scanner;
+ int rc;
+
+ MY_INIT(argv[0]);
+
+ bzero(&pagecache, sizeof(pagecache));
+ maria_data_root= ".";
+ if (maria_log_remove())
+ exit(1);
+
+ for (i= 0; i < (LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); i+= 2)
+ {
+ int2store(long_buffer + i, (i >> 1));
+ /* long_buffer[i]= (i & 0xFF); */
+ }
+
+ bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+ default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+ default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+ if (argc > 1)
+ {
+ DBUG_SET(default_dbug_option);
+ DBUG_SET_INITIAL(default_dbug_option);
+ }
+#endif
+
+ if (ma_control_file_create_or_open(TRUE))
+ {
+ fprintf(stderr, "Can't init control file (%d)\n", errno);
+ exit(1);
+ }
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ TRANSLOG_PAGE_SIZE)) == 0)
+ {
+ fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+ exit(1);
+ }
+ if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, LOG_FLAGS))
+ {
+ fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+ translog_destroy();
+ exit(1);
+ }
+ example_loghandler_init();
+ /* Suppressing of automatic record writing */
+ trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+ plan(((ITERATIONS - 1) * 4 + 1)*2 + ITERATIONS - 1);
+
+ srandom(122334817L);
+
+ long_tr_id[5]= 0xff;
+
+ int4store(long_tr_id, 0);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ trn->short_id= 0;
+ trn->first_undo_lsn= TRANSACTION_LOGGED_LONG_ID;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ trn, NULL,
+ 6, TRANSLOG_INTERNAL_PARTS + 1, parts, NULL))
+ {
+ fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+ translog_destroy();
+ ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+ lsn_base= first_lsn= lsn;
+
+ for (i= 1; i < ITERATIONS; i++)
+ {
+ trn->short_id= i % 0xFFFF;
+ if (i % 2)
+ {
+ lsn_store(lsn_buff, lsn_base);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)lsn_buff;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE;
+ /* check auto-count feature */
+ parts[TRANSLOG_INTERNAL_PARTS + 1].str= NULL;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].length= 0;
+ if (translog_write_record(&lsn, LOGREC_FIXED_RECORD_1LSN_EXAMPLE,
+ trn, NULL, LSN_STORE_SIZE, 0, parts, NULL))
+ {
+ fprintf(stderr, "1 Can't write reference defore record #%lu\n",
+ (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE");
+ lsn_store(lsn_buff, lsn_base);
+ if ((rec_len= random() / (RAND_MAX / (LONG_BUFFER_SIZE + 1))) < 12)
+ rec_len= 12;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)lsn_buff;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].str= (char*)long_buffer;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len;
+ /* check record length auto-counting */
+ if (translog_write_record(&lsn,
+ LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE,
+ trn, NULL, 0, TRANSLOG_INTERNAL_PARTS + 2,
+ parts, NULL))
+ {
+ fprintf(stderr, "1 Can't write var reference defore record #%lu\n",
+ (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE");
+ }
+ else
+ {
+ lsn_store(lsn_buff, lsn_base);
+ lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)lsn_buff;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 23;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_2LSN_EXAMPLE,
+ trn, NULL,
+ 23, TRANSLOG_INTERNAL_PARTS + 1, parts, NULL))
+ {
+ fprintf(stderr, "0 Can't write reference defore record #%lu\n",
+ (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE");
+ lsn_store(lsn_buff, lsn_base);
+ lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn);
+ if ((rec_len= random() / (RAND_MAX / (LONG_BUFFER_SIZE + 1))) < 19)
+ rec_len= 19;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)lsn_buff;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 14;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].str= (char*)long_buffer;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len;
+ if (translog_write_record(&lsn,
+ LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE,
+ trn, NULL, 14 + rec_len,
+ TRANSLOG_INTERNAL_PARTS + 2, parts, NULL))
+ {
+ fprintf(stderr, "0 Can't write var reference defore record #%lu\n",
+ (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE");
+ }
+ int4store(long_tr_id, i);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ trn, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL))
+ {
+ fprintf(stderr, "Can't write record #%lu\n", (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+
+ lsn_base= lsn;
+
+ if ((rec_len= random() / (RAND_MAX / (LONG_BUFFER_SIZE + 1))) < 9)
+ rec_len= 9;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_buffer;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= rec_len;
+ if (translog_write_record(&lsn,
+ LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE,
+ trn, NULL, rec_len,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL))
+ {
+ fprintf(stderr, "Can't write variable record #%lu\n", (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE");
+ if (translog_flush(lsn))
+ {
+ fprintf(stderr, "Can't flush #%lu\n", (ulong) i);
+ translog_destroy();
+ ok(0, "flush");
+ exit(1);
+ }
+ ok(1, "flush");
+ }
+
+ srandom(122334817L);
+
+ rc= 1;
+
+ {
+ int len= translog_read_record_header(first_lsn, &rec);
+ if (len == RECHEADER_READ_ERROR)
+ {
+ fprintf(stderr, "translog_read_record_header failed (%d)\n", errno);
+ goto err;
+ }
+ if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 ||
+ rec.record_length != 6 || uint4korr(rec.header) != 0 ||
+ ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF ||
+ first_lsn != rec.lsn)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
+ "data read(0)\n"
+ "type %u, strid %u, len %u, i: %u, 4: %u 5: %u, "
+ "lsn(%lu,0x%lx)\n",
+ (uint) rec.type, (uint) rec.short_trid, (uint) rec.record_length,
+ (uint) uint4korr(rec.header), (uint) rec.header[4],
+ (uint) rec.header[5],
+ LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ read_ok(&rec);
+ translog_free_record_header(&rec);
+ lsn= first_lsn;
+ if (translog_init_scanner(first_lsn, 1, &scanner))
+ {
+ fprintf(stderr, "scanner init failed\n");
+ goto err;
+ }
+ for (i= 1;; i++)
+ {
+ len= translog_read_next_record_header(&scanner, &rec);
+ if (len == RECHEADER_READ_ERROR)
+ {
+ fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n",
+ i, errno);
+ goto err;
+ }
+ if (len == RECHEADER_READ_EOF)
+ {
+ if (i != ITERATIONS)
+ {
+ fprintf(stderr, "EOL met at iteration %u instead of %u\n",
+ i, ITERATIONS);
+ goto err;
+ }
+ break;
+ }
+ if (i % 2)
+ {
+ LSN ref;
+ ref= lsn_korr(rec.header);
+ if (rec.type != LOGREC_FIXED_RECORD_1LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != 7 || ref != lsn)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_1LSN_EXAMPLE "
+ "data read(%d) "
+ "type: %u strid: %u len: %u"
+ "ref: (%lu,0x%lx) (%lu,0x%lx) "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (uint) rec.record_length,
+ LSN_IN_PARTS(ref), LSN_IN_PARTS(lsn),
+ LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ }
+ else
+ {
+ LSN ref1, ref2;
+ ref1= lsn_korr(rec.header);
+ ref2= lsn_korr(rec.header + LSN_STORE_SIZE);
+ if (rec.type != LOGREC_FIXED_RECORD_2LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != 23 ||
+ ref1 != lsn ||
+ ref2 != first_lsn ||
+ ((uchar)rec.header[22]) != 0x55 ||
+ ((uchar)rec.header[21]) != 0xAA ||
+ ((uchar)rec.header[20]) != 0x55 ||
+ ((uchar)rec.header[19]) != 0xAA ||
+ ((uchar)rec.header[18]) != 0x55 ||
+ ((uchar)rec.header[17]) != 0xAA ||
+ ((uchar)rec.header[16]) != 0x55 ||
+ ((uchar)rec.header[15]) != 0xAA ||
+ ((uchar)rec.header[14]) != 0x55)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_2LSN_EXAMPLE "
+ "data read(%d) "
+ "type %u, strid %u, len %u, ref1(%lu,0x%lx), "
+ "ref2(%lu,0x%lx) %x%x%x%x%x%x%x%x%x "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (uint) rec.record_length,
+ LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2),
+ (uint) rec.header[14], (uint) rec.header[15],
+ (uint) rec.header[16], (uint) rec.header[17],
+ (uint) rec.header[18], (uint) rec.header[19],
+ (uint) rec.header[20], (uint) rec.header[21],
+ (uint) rec.header[22],
+ LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ }
+ read_ok(&rec);
+ translog_free_record_header(&rec);
+
+ len= translog_read_next_record_header(&scanner, &rec);
+ if (len == RECHEADER_READ_ERROR)
+ {
+ fprintf(stderr, "1-%d translog_read_next_record_header (var) "
+ "failed (%d)\n", i, errno);
+ goto err;
+ }
+ if (len == RECHEADER_READ_EOF)
+ {
+ fprintf(stderr, "EOL met at the middle of iteration (first var) %u "
+ "instead of beginning of %u\n", i, ITERATIONS);
+ goto err;
+ }
+ if (i % 2)
+ {
+ LSN ref;
+ ref= lsn_korr(rec.header);
+ if ((rec_len= random() / (RAND_MAX / (LONG_BUFFER_SIZE + 1))) < 12)
+ rec_len= 12;
+ if (rec.type != LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != rec_len + LSN_STORE_SIZE ||
+ len != 12 || ref != lsn ||
+ check_content(rec.header + LSN_STORE_SIZE, len - LSN_STORE_SIZE))
+ {
+ fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE "
+ "data read(%d)"
+ "type %u (%d), strid %u (%d), len %lu, %lu + 7 (%d), "
+ "hdr len: %u (%d), "
+ "ref(%lu,0x%lx), lsn(%lu,0x%lx) (%d), content: %d\n",
+ i, (uint) rec.type,
+ rec.type != LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE,
+ (uint) rec.short_trid,
+ rec.short_trid != (i % 0xFFFF),
+ (ulong) rec.record_length, (ulong) rec_len,
+ rec.record_length != rec_len + LSN_STORE_SIZE,
+ (uint) len,
+ len != 12,
+ LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn),
+ (len != 12 || ref != lsn),
+ check_content(rec.header + LSN_STORE_SIZE,
+ len - LSN_STORE_SIZE));
+ goto err;
+ }
+ if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE))
+ {
+ fprintf(stderr,
+ "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE "
+ "in whole rec read lsn(%lu,0x%lx)\n",
+ LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ }
+ else
+ {
+ LSN ref1, ref2;
+ ref1= lsn_korr(rec.header);
+ ref2= lsn_korr(rec.header + LSN_STORE_SIZE);
+ if ((rec_len= random() / (RAND_MAX / (LONG_BUFFER_SIZE + 1))) < 19)
+ rec_len= 19;
+ if (rec.type != LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != rec_len + LSN_STORE_SIZE * 2 ||
+ len != 19 ||
+ ref1 != lsn ||
+ ref2 != first_lsn ||
+ check_content(rec.header + LSN_STORE_SIZE * 2,
+ len - LSN_STORE_SIZE * 2))
+ {
+ fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+ "data read(%d) "
+ "type %u, strid %u, len %lu != %lu + 14, hdr len: %d, "
+ "ref1(%lu,0x%lx), ref2(%lu,0x%lx), "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (ulong) rec.record_length, (ulong) rec_len,
+ len, LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2),
+ LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE * 2))
+ {
+ fprintf(stderr,
+ "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+ "in whole rec read lsn(%lu,0x%lx)\n",
+ LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ }
+ read_ok(&rec);
+ translog_free_record_header(&rec);
+
+ len= translog_read_next_record_header(&scanner, &rec);
+ if (len == RECHEADER_READ_ERROR)
+ {
+ fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n",
+ i, errno);
+ goto err;
+ }
+ if (len == RECHEADER_READ_EOF)
+ {
+ fprintf(stderr, "EOL met at the middle of iteration %u "
+ "instead of beginning of %u\n", i, ITERATIONS);
+ goto err;
+ }
+ if (rec.type != LOGREC_FIXED_RECORD_0LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != 6 || uint4korr(rec.header) != i ||
+ ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
+ "data read(%d)\n"
+ "type %u, strid %u, len %u, i: %u, 4: %u 5: %u "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (uint) rec.record_length,
+ (uint) uint4korr(rec.header), (uint) rec.header[4],
+ (uint) rec.header[5],
+ LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ lsn= rec.lsn;
+ read_ok(&rec);
+ translog_free_record_header(&rec);
+
+ len= translog_read_next_record_header(&scanner, &rec);
+ if ((rec_len= random() / (RAND_MAX / (LONG_BUFFER_SIZE + 1))) < 9)
+ rec_len= 9;
+ if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != rec_len ||
+ len != 9 || check_content(rec.header, (uint)len))
+ {
+ fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE "
+ "data read(%d) "
+ "type %u, strid %u, len %lu != %lu, hdr len: %d, "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (ulong) rec.record_length, (ulong) rec_len,
+ len, LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ if (read_and_check_content(&rec, long_buffer, 0))
+ {
+ fprintf(stderr,
+ "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+ "in whole rec read lsn(%lu,0x%lx)\n",
+ LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ read_ok(&rec);
+ translog_free_record_header(&rec);
+ }
+ }
+
+ rc= 0;
+err:
+ if (rc)
+ ok(0, "read record");
+ translog_destroy();
+ end_pagecache(&pagecache, 1);
+ ma_control_file_end();
+
+ if (maria_log_remove())
+ exit(1);
+
+ return(test(exit_status()));
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c b/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c
new file mode 100644
index 00000000000..28233ae04cb
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c
@@ -0,0 +1,147 @@
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+
+#define PCACHE_SIZE (1024*1024*10)
+#define PCACHE_PAGE TRANSLOG_PAGE_SIZE
+#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512)
+#define LOG_FLAGS 0
+
+static char *first_translog_file= (char*)"maria_log.00000001";
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+ uint pagen;
+ uchar long_tr_id[6];
+ PAGECACHE pagecache;
+ LSN lsn, first_lsn, theor_lsn;
+ MY_STAT st;
+ LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+
+ MY_INIT(argv[0]);
+
+ plan(2);
+
+ bzero(&pagecache, sizeof(pagecache));
+ maria_data_root= ".";
+ if (maria_log_remove())
+ exit(1);
+ /* be sure that we have no logs in the directory*/
+ if (my_stat(CONTROL_FILE_BASE_NAME, &st, MYF(0)))
+ my_delete(CONTROL_FILE_BASE_NAME, MYF(0));
+ if (my_stat(first_translog_file, &st, MYF(0)))
+ my_delete(first_translog_file, MYF(0));
+
+ bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+ default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+ default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+ if (argc > 1)
+ {
+ DBUG_SET(default_dbug_option);
+ DBUG_SET_INITIAL(default_dbug_option);
+ }
+#endif
+
+ if (ma_control_file_create_or_open(TRUE))
+ {
+ fprintf(stderr, "Can't init control file (%d)\n", errno);
+ exit(1);
+ }
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ PCACHE_PAGE)) == 0)
+ {
+ fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+ exit(1);
+ }
+ if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, LOG_FLAGS))
+ {
+ fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+ translog_destroy();
+ exit(1);
+ }
+ example_loghandler_init();
+ /* Suppressing of automatic record writing */
+ dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+ theor_lsn= translog_first_theoretical_lsn();
+ if (theor_lsn == 1)
+ {
+ fprintf(stderr, "Error reading the first log file.");
+ translog_destroy();
+ exit(1);
+ }
+ if (theor_lsn == LSN_IMPOSSIBLE)
+ {
+ fprintf(stderr, "There is no first log file.");
+ translog_destroy();
+ exit(1);
+ }
+ first_lsn= translog_first_lsn_in_log();
+ if (first_lsn != LSN_IMPOSSIBLE)
+ {
+ fprintf(stderr, "Incorrect first lsn response (%lu,0x%lx).",
+ LSN_IN_PARTS(first_lsn));
+ translog_destroy();
+ exit(1);
+ }
+ ok(1, "Empty log response");
+
+
+ int4store(long_tr_id, 0);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL))
+ {
+ fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+ translog_destroy();
+ exit(1);
+ }
+
+ theor_lsn= translog_first_theoretical_lsn();
+ if (theor_lsn == 1)
+ {
+ fprintf(stderr, "Error reading the first log file\n");
+ translog_destroy();
+ exit(1);
+ }
+ if (theor_lsn == LSN_IMPOSSIBLE)
+ {
+ fprintf(stderr, "There is no first log file\n");
+ translog_destroy();
+ exit(1);
+ }
+ first_lsn= translog_first_lsn_in_log();
+ if (first_lsn != theor_lsn)
+ {
+ fprintf(stderr, "Incorrect first lsn: (%lu,0x%lx) "
+ " theoretical first: (%lu,0x%lx)\n",
+ LSN_IN_PARTS(first_lsn), LSN_IN_PARTS(theor_lsn));
+ translog_destroy();
+ exit(1);
+ }
+
+ ok(1, "Full log response");
+
+ translog_destroy();
+ end_pagecache(&pagecache, 1);
+ ma_control_file_end();
+ if (maria_log_remove())
+ exit(1);
+ exit(0);
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c b/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c
new file mode 100644
index 00000000000..d6f0bde7a8e
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c
@@ -0,0 +1,140 @@
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+
+#define PCACHE_SIZE (1024*1024*10)
+#define PCACHE_PAGE TRANSLOG_PAGE_SIZE
+#define LOG_FILE_SIZE (4*1024L*1024L)
+#define LOG_FLAGS 0
+
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+ ulong i;
+ uint pagen;
+ uchar long_tr_id[6];
+ PAGECACHE pagecache;
+ LSN lsn, max_lsn, last_lsn= LSN_IMPOSSIBLE;
+ LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+
+ MY_INIT(argv[0]);
+
+ plan(2);
+
+ bzero(&pagecache, sizeof(pagecache));
+ maria_data_root= ".";
+ if (maria_log_remove())
+ exit(1);
+
+ bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+ default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+ default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+ if (argc > 1)
+ {
+ DBUG_SET(default_dbug_option);
+ DBUG_SET_INITIAL(default_dbug_option);
+ }
+#endif
+
+ if (ma_control_file_create_or_open(TRUE))
+ {
+ fprintf(stderr, "Can't init control file (%d)\n", errno);
+ exit(1);
+ }
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ PCACHE_PAGE)) == 0)
+ {
+ fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+ exit(1);
+ }
+ if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, LOG_FLAGS))
+ {
+ fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+ translog_destroy();
+ exit(1);
+ }
+ example_loghandler_init();
+ /* Suppressing of automatic record writing */
+ dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+ max_lsn= translog_get_file_max_lsn_stored(1);
+ if (max_lsn == 1)
+ {
+ fprintf(stderr, "Error reading the first log file.");
+ translog_destroy();
+ exit(1);
+ }
+ if (max_lsn != LSN_IMPOSSIBLE)
+ {
+ fprintf(stderr, "Incorrect first lsn response (%lu,0x%lx).",
+ LSN_IN_PARTS(max_lsn));
+ translog_destroy();
+ exit(1);
+ }
+ ok(1, "Empty log response");
+
+
+ /* write more then 1 file */
+ int4store(long_tr_id, 0);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ for(i= 0; i < LOG_FILE_SIZE/6; i++)
+ {
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL))
+ {
+ fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+ translog_destroy();
+ exit(1);
+ }
+ if (LSN_FILE_NO(lsn) == 1)
+ last_lsn= lsn;
+ }
+
+
+ max_lsn= translog_get_file_max_lsn_stored(1);
+ if (max_lsn == 1)
+ {
+ fprintf(stderr, "Error reading the first log file\n");
+ translog_destroy();
+ exit(1);
+ }
+ if (max_lsn == LSN_IMPOSSIBLE)
+ {
+ fprintf(stderr, "Isn't first file still finished?!!\n");
+ translog_destroy();
+ exit(1);
+ }
+ if (max_lsn != last_lsn)
+ {
+ fprintf(stderr, "Incorrect max lsn: (%lu,0x%lx) "
+ " last lsn on first file: (%lu,0x%lx)\n",
+ LSN_IN_PARTS(max_lsn), LSN_IN_PARTS(last_lsn));
+ translog_destroy();
+ exit(1);
+ }
+
+ ok(1, "First file max LSN");
+
+ translog_destroy();
+ end_pagecache(&pagecache, 1);
+ ma_control_file_end();
+ if (maria_log_remove())
+ exit(1);
+ exit(0);
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_multigroup-t.c b/storage/maria/unittest/ma_test_loghandler_multigroup-t.c
new file mode 100644
index 00000000000..d5f00bdb6fd
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_multigroup-t.c
@@ -0,0 +1,641 @@
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+static TRN *trn= &dummy_transaction_object;
+
+#define PCACHE_SIZE (1024*1024*10)
+
+#define LONG_BUFFER_SIZE ((1024L*1024L*1024L) + (1024L*1024L*512))
+
+#define MIN_REC_LENGTH (1024L*1024L + 1024L*512L + 1)
+
+#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512)
+#define ITERATIONS 2
+/*#define ITERATIONS 63 */
+
+/*
+#define LOG_FILE_SIZE 1024L*1024L*3L
+#define ITERATIONS 1600
+*/
+/*
+#define LOG_FILE_SIZE 1024L*1024L*100L
+#define ITERATIONS 65000
+*/
+
+
+/*
+ Check that the buffer filled correctly
+
+ SYNOPSIS
+ check_content()
+ ptr Pointer to the buffer
+ length length of the buffer
+
+ RETURN
+ 0 - OK
+ 1 - Error
+*/
+
+static my_bool check_content(uchar *ptr, ulong length)
+{
+ ulong i;
+ uchar buff[4];
+ DBUG_ENTER("check_content");
+ for (i= 0; i < length; i++)
+ {
+ if (i % 4 == 0)
+ int4store(buff, (i >> 2));
+ if (ptr[i] != buff[i % 4])
+ {
+ fprintf(stderr, "Byte # %lu is %x instead of %x",
+ i, (uint) ptr[i], (uint) buff[i % 4]);
+ DBUG_DUMP("mem", ptr +(ulong) (i > 16 ? i - 16 : 0),
+ (i > 16 ? 16 : i) + (i + 16 < length ? 16 : length - i));
+ DBUG_RETURN(1);
+ }
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Read whole record content, and check content (put with offset)
+
+ SYNOPSIS
+ read_and_check_content()
+ rec The record header buffer
+ buffer The buffer to read the record in
+ skip Skip this number of bytes ot the record content
+
+ RETURN
+ 0 - OK
+ 1 - Error
+*/
+
+static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec,
+ uchar *buffer, uint skip)
+{
+ int res= 0;
+ translog_size_t len;
+ DBUG_ENTER("read_and_check_content");
+ DBUG_ASSERT(rec->record_length < LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2);
+ if ((len= translog_read_record(rec->lsn, 0, rec->record_length,
+ buffer, NULL)) != rec->record_length)
+ {
+ fprintf(stderr, "Requested %lu byte, read %lu\n",
+ (ulong) rec->record_length, (ulong) len);
+ res= 1;
+ }
+ res|= check_content(buffer + skip, rec->record_length - skip);
+ DBUG_RETURN(res);
+}
+
+
+static uint32 get_len()
+{
+ uint32 rec_len;
+ do
+ {
+ rec_len= random() /
+ (RAND_MAX / (LONG_BUFFER_SIZE - MIN_REC_LENGTH - 1)) + MIN_REC_LENGTH;
+ } while (rec_len >= LONG_BUFFER_SIZE);
+ return rec_len;
+}
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+ uint32 i;
+ uint32 rec_len;
+ uint pagen;
+ uchar long_tr_id[6];
+ uchar lsn_buff[23]=
+ {
+ 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA,
+ 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA,
+ 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55
+ };
+ uchar *long_buffer= malloc(LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2);
+ PAGECACHE pagecache;
+ LSN lsn, lsn_base, first_lsn;
+ TRANSLOG_HEADER_BUFFER rec;
+ LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 2];
+ struct st_translog_scanner_data scanner;
+ int rc;
+
+ MY_INIT(argv[0]);
+
+ bzero(&pagecache, sizeof(pagecache));
+ maria_data_root= ".";
+ if (maria_log_remove())
+ exit(1);
+
+ {
+ uchar buff[4];
+ for (i= 0; i < (LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); i++)
+ {
+ if (i % 4 == 0)
+ int4store(buff, (i >> 2));
+ long_buffer[i]= buff[i % 4];
+ }
+ }
+
+ bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+ default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+ default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+ if (argc > 1)
+ {
+ DBUG_SET(default_dbug_option);
+ DBUG_SET_INITIAL(default_dbug_option);
+ }
+#endif
+
+ if (ma_control_file_create_or_open(TRUE))
+ {
+ fprintf(stderr, "Can't init control file (%d)\n", errno);
+ exit(1);
+ }
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ TRANSLOG_PAGE_SIZE)) == 0)
+ {
+ fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+ exit(1);
+ }
+ if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, 0))
+ {
+ fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+ translog_destroy();
+ exit(1);
+ }
+ example_loghandler_init();
+ /* Suppressing of automatic record writing */
+ trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+ plan(((ITERATIONS - 1) * 4 + 1) * 2);
+
+ srandom(122334817L);
+
+ long_tr_id[5]= 0xff;
+
+ int4store(long_tr_id, 0);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ trn->short_id= 0;
+ trn->first_undo_lsn= TRANSACTION_LOGGED_LONG_ID;
+ if (translog_write_record(&lsn, LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ trn, NULL,
+ 6, TRANSLOG_INTERNAL_PARTS + 1, parts, NULL))
+ {
+ fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+ translog_destroy();
+ ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+ lsn_base= first_lsn= lsn;
+
+ for (i= 1; i < ITERATIONS; i++)
+ {
+ if (i % 2)
+ {
+ lsn_store(lsn_buff, lsn_base);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)lsn_buff;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE;
+ trn->short_id= i % 0xFFFF;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_1LSN_EXAMPLE,
+ trn, NULL,
+ LSN_STORE_SIZE,
+ TRANSLOG_INTERNAL_PARTS + 1, parts, NULL))
+ {
+ fprintf(stderr, "1 Can't write reference before record #%lu\n",
+ (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE");
+ lsn_store(lsn_buff, lsn_base);
+ rec_len= get_len();
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)lsn_buff;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].str= (char*)long_buffer;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len;
+ trn->short_id= i % 0xFFFF;
+ if (translog_write_record(&lsn,
+ LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE,
+ trn, NULL, LSN_STORE_SIZE + rec_len,
+ TRANSLOG_INTERNAL_PARTS + 2,
+ parts, NULL))
+ {
+ fprintf(stderr, "1 Can't write var reference before record #%lu\n",
+ (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE");
+ }
+ else
+ {
+ lsn_store(lsn_buff, lsn_base);
+ lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn);
+ parts[TRANSLOG_INTERNAL_PARTS + 1].str= (char*)lsn_buff;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].length= 23;
+ trn->short_id= i % 0xFFFF;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_2LSN_EXAMPLE,
+ trn, NULL, 23,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL))
+ {
+ fprintf(stderr, "0 Can't write reference before record #%lu\n",
+ (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE");
+ lsn_store(lsn_buff, lsn_base);
+ lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn);
+ rec_len= get_len();
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)lsn_buff;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE * 2;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].str= (char*)long_buffer;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len;
+ trn->short_id= i % 0xFFFF;
+ if (translog_write_record(&lsn,
+ LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE,
+ trn, NULL, LSN_STORE_SIZE * 2 + rec_len,
+ TRANSLOG_INTERNAL_PARTS + 2,
+ parts, NULL))
+ {
+ fprintf(stderr, "0 Can't write var reference before record #%lu\n",
+ (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE");
+ }
+ int4store(long_tr_id, i);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ trn->short_id= i % 0xFFFF;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ trn, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1, parts, NULL))
+ {
+ fprintf(stderr, "Can't write record #%lu\n", (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+
+ lsn_base= lsn;
+
+ rec_len= get_len();
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_buffer;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= rec_len;
+ trn->short_id= i % 0xFFFF;
+ if (translog_write_record(&lsn,
+ LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE,
+ trn, NULL, rec_len,
+ TRANSLOG_INTERNAL_PARTS + 1, parts, NULL))
+ {
+ fprintf(stderr, "Can't write variable record #%lu\n", (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE");
+ }
+
+ translog_destroy();
+ end_pagecache(&pagecache, 1);
+ ma_control_file_end();
+
+ if (ma_control_file_create_or_open(TRUE))
+ {
+ fprintf(stderr, "pass2: Can't init control file (%d)\n", errno);
+ exit(1);
+ }
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ TRANSLOG_PAGE_SIZE)) == 0)
+ {
+ fprintf(stderr, "pass2: Got error: init_pagecache() (errno: %d)\n", errno);
+ exit(1);
+ }
+ if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, 0))
+ {
+ fprintf(stderr, "pass2: Can't init loghandler (%d)\n", errno);
+ translog_destroy();
+ exit(1);
+ }
+ example_loghandler_init();
+
+ srandom(122334817L);
+
+ rc= 1;
+
+ {
+ int len= translog_read_record_header(first_lsn, &rec);
+ if (len == RECHEADER_READ_ERROR)
+ {
+ fprintf(stderr, "translog_read_record_header failed (%d)\n", errno);
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 ||
+ rec.record_length != 6 || uint4korr(rec.header) != 0 ||
+ ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF ||
+ first_lsn != rec.lsn)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
+ "data read(0)\n"
+ "type %u, strid %u, len %u, i: %u, 4: %u 5: %u, "
+ "lsn(0x%lu,0x%lx)\n",
+ (uint) rec.type, (uint) rec.short_trid, (uint) rec.record_length,
+ (uint)uint4korr(rec.header), (uint) rec.header[4],
+ (uint) rec.header[5],
+ LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ ok(1, "read record");
+ translog_free_record_header(&rec);
+ lsn= first_lsn;
+ if (translog_init_scanner(first_lsn, 1, &scanner))
+ {
+ fprintf(stderr, "scanner init failed\n");
+ goto err;
+ }
+ for (i= 1;; i++)
+ {
+ len= translog_read_next_record_header(&scanner, &rec);
+ if (len == RECHEADER_READ_ERROR)
+ {
+ fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n",
+ i, errno);
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ if (len == RECHEADER_READ_EOF)
+ {
+ if (i != ITERATIONS)
+ {
+ fprintf(stderr, "EOL met at iteration %u instead of %u\n",
+ i, ITERATIONS);
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ break;
+ }
+
+ if (i % 2)
+ {
+ LSN ref;
+ ref= lsn_korr(rec.header);
+ if (rec.type != LOGREC_FIXED_RECORD_1LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != LSN_STORE_SIZE || ref != lsn)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_1LSN_EXAMPLE "
+ "data read(%d)"
+ "type %u, strid %u, len %u, ref(%lu,0x%lx), lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (uint) rec.record_length,
+ LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ }
+ else
+ {
+ LSN ref1, ref2;
+ ref1= lsn_korr(rec.header);
+ ref2= lsn_korr(rec.header + LSN_STORE_SIZE);
+ if (rec.type != LOGREC_FIXED_RECORD_2LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != 23 ||
+ ref1 != lsn ||
+ ref2 != first_lsn ||
+ ((uchar)rec.header[22]) != 0x55 ||
+ ((uchar)rec.header[21]) != 0xAA ||
+ ((uchar)rec.header[20]) != 0x55 ||
+ ((uchar)rec.header[19]) != 0xAA ||
+ ((uchar)rec.header[18]) != 0x55 ||
+ ((uchar)rec.header[17]) != 0xAA ||
+ ((uchar)rec.header[16]) != 0x55 ||
+ ((uchar)rec.header[15]) != 0xAA ||
+ ((uchar)rec.header[14]) != 0x55)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_2LSN_EXAMPLE "
+ "data read(%d) "
+ "type %u, strid %u, len %u, ref1(%lu,0x%lx), "
+ "ref2(%lu,0x%lx) %x%x%x%x%x%x%x%x%x "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (uint) rec.record_length,
+ LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2),
+ (uint) rec.header[14], (uint) rec.header[15],
+ (uint) rec.header[16], (uint) rec.header[17],
+ (uint) rec.header[18], (uint) rec.header[19],
+ (uint) rec.header[20], (uint) rec.header[21],
+ (uint) rec.header[22],
+ LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ }
+ ok(1, "read record");
+ translog_free_record_header(&rec);
+
+ len= translog_read_next_record_header(&scanner, &rec);
+ if (len == RECHEADER_READ_ERROR)
+ {
+ fprintf(stderr, "1-%d translog_read_next_record_header (var) "
+ "failed (%d)\n", i, errno);
+ goto err;
+ }
+ if (len == RECHEADER_READ_EOF)
+ {
+ fprintf(stderr, "EOL met at the middle of iteration (first var) %u "
+ "instead of beginning of %u\n", i, ITERATIONS);
+ goto err;
+ }
+ if (i % 2)
+ {
+ LSN ref;
+ ref= lsn_korr(rec.header);
+ rec_len= get_len();
+ if (rec.type !=LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != rec_len + LSN_STORE_SIZE ||
+ len != 12 || ref != lsn ||
+ check_content(rec.header + LSN_STORE_SIZE, len - LSN_STORE_SIZE))
+ {
+ fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE "
+ "data read(%d)"
+ "type %u (%d), strid %u (%d), len %lu, %lu + 7 (%d), "
+ "hdr len: %d (%d), "
+ "ref(%lu,0x%lx), lsn(%lu,0x%lx) (%d), content: %d\n",
+ i, (uint) rec.type,
+ rec.type !=LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE,
+ (uint) rec.short_trid,
+ rec.short_trid != (i % 0xFFFF),
+ (ulong) rec.record_length, (ulong) rec_len,
+ rec.record_length != rec_len + LSN_STORE_SIZE,
+ len,
+ len != 12,
+ LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn),
+ (ref != lsn),
+ check_content(rec.header + LSN_STORE_SIZE,
+ len - LSN_STORE_SIZE));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE))
+ {
+ fprintf(stderr,
+ "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE "
+ "in whole rec read lsn(%lu,0x%lx)\n",
+ LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ }
+ else
+ {
+ LSN ref1, ref2;
+ ref1= lsn_korr(rec.header);
+ ref2= lsn_korr(rec.header + LSN_STORE_SIZE);
+ rec_len= get_len();
+ if (rec.type != LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != rec_len + LSN_STORE_SIZE * 2 ||
+ len != 19 ||
+ ref1 != lsn ||
+ ref2 != first_lsn ||
+ check_content(rec.header + LSN_STORE_SIZE * 2,
+ len - LSN_STORE_SIZE * 2))
+ {
+ fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+ " data read(%d) "
+ "type %u, strid %u, len %lu != %lu + 14, hdr len: %d, "
+ "ref1(%lu,0x%lx), ref2(%lu,0x%lx), "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (ulong) rec.record_length, (ulong) rec_len,
+ len,
+ LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2),
+ LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE * 2))
+ {
+ fprintf(stderr,
+ "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+ "in whole rec read lsn(%lu,0x%lx)\n",
+ LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ }
+ ok(1, "read record");
+ translog_free_record_header(&rec);
+
+ len= translog_read_next_record_header(&scanner, &rec);
+ if (len == RECHEADER_READ_ERROR)
+ {
+ fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n",
+ i, errno);
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ if (len == RECHEADER_READ_EOF)
+ {
+ fprintf(stderr, "EOL met at the middle of iteration %u "
+ "instead of beginning of %u\n", i, ITERATIONS);
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ if (rec.type != LOGREC_FIXED_RECORD_0LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != 6 || uint4korr(rec.header) != i ||
+ ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
+ "data read(%d)\n"
+ "type %u, strid %u, len %u, i: %u, 4: %u 5: %u "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (uint) rec.record_length,
+ (uint)uint4korr(rec.header), (uint) rec.header[4],
+ (uint) rec.header[5],
+ LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ ok(1, "read record");
+ translog_free_record_header(&rec);
+
+ lsn= rec.lsn;
+
+ len= translog_read_next_record_header(&scanner, &rec);
+ rec_len= get_len();
+ if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != rec_len ||
+ len != 9 || check_content(rec.header, len))
+ {
+ fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE "
+ "data read(%d) "
+ "type %u, strid %u, len %lu != %lu, hdr len: %d, "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (ulong) rec.record_length, (ulong) rec_len,
+ len, LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ if (read_and_check_content(&rec, long_buffer, 0))
+ {
+ fprintf(stderr,
+ "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+ "in whole rec read lsn(%lu,0x%lx)\n",
+ LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ ok(1, "read record");
+ translog_free_record_header(&rec);
+ }
+ }
+
+ rc= 0;
+err:
+ if (rc)
+ ok(0, "read record");
+ translog_destroy();
+ end_pagecache(&pagecache, 1);
+ ma_control_file_end();
+ if (maria_log_remove())
+ exit(1);
+
+ return (test(exit_status()));
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_multithread-t.c b/storage/maria/unittest/ma_test_loghandler_multithread-t.c
new file mode 100644
index 00000000000..6255c11db89
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_multithread-t.c
@@ -0,0 +1,479 @@
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+
+#define PCACHE_SIZE (1024*1024*10)
+
+/*#define LOG_FLAGS TRANSLOG_SECTOR_PROTECTION | TRANSLOG_PAGE_CRC */
+#define LOG_FLAGS 0
+/*#define LONG_BUFFER_SIZE (1024L*1024L*1024L + 1024L*1024L*512)*/
+#define LONG_BUFFER_SIZE (1024L*1024L*1024L)
+#define MIN_REC_LENGTH 30
+#define SHOW_DIVIDER 10
+#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512)
+#define ITERATIONS 3
+#define WRITERS 3
+static uint number_of_writers= WRITERS;
+
+static pthread_cond_t COND_thread_count;
+static pthread_mutex_t LOCK_thread_count;
+static uint thread_count;
+
+static ulong lens[WRITERS][ITERATIONS];
+static LSN lsns1[WRITERS][ITERATIONS];
+static LSN lsns2[WRITERS][ITERATIONS];
+static uchar *long_buffer;
+
+/*
+ Get pseudo-random length of the field in
+ limits [MIN_REC_LENGTH..LONG_BUFFER_SIZE]
+
+ SYNOPSIS
+ get_len()
+
+ RETURN
+ length - length >= 0 length <= LONG_BUFFER_SIZE
+*/
+
+static uint32 get_len()
+{
+ uint32 rec_len;
+ do
+ {
+ rec_len= random() /
+ (RAND_MAX / (LONG_BUFFER_SIZE - MIN_REC_LENGTH - 1)) + MIN_REC_LENGTH;
+ } while (rec_len >= LONG_BUFFER_SIZE);
+ return rec_len;
+}
+
+
+/*
+ Check that the buffer filled correctly
+
+ SYNOPSIS
+ check_content()
+ ptr Pointer to the buffer
+ length length of the buffer
+
+ RETURN
+ 0 - OK
+ 1 - Error
+*/
+
+static my_bool check_content(uchar *ptr, ulong length)
+{
+ ulong i;
+ for (i= 0; i < length; i++)
+ {
+ if (((uchar)ptr[i]) != (i & 0xFF))
+ {
+ fprintf(stderr, "Byte # %lu is %x instead of %x",
+ i, (uint) ptr[i], (uint) (i & 0xFF));
+ return 1;
+ }
+ }
+ return 0;
+}
+
+
+/*
+ Read whole record content, and check content (put with offset)
+
+ SYNOPSIS
+ read_and_check_content()
+ rec The record header buffer
+ buffer The buffer to read the record in
+ skip Skip this number of bytes ot the record content
+
+ RETURN
+ 0 - OK
+ 1 - Error
+*/
+
+
+static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec,
+ uchar *buffer, uint skip)
+{
+ int res= 0;
+ translog_size_t len;
+
+ if ((len= translog_read_record(rec->lsn, 0, rec->record_length,
+ buffer, NULL)) != rec->record_length)
+ {
+ fprintf(stderr, "Requested %lu byte, read %lu\n",
+ (ulong) rec->record_length, (ulong) len);
+ res= 1;
+ }
+ res|= check_content(buffer + skip, rec->record_length - skip);
+ return(res);
+}
+
+void writer(int num)
+{
+ LSN lsn;
+ TRN trn;
+ uchar long_tr_id[6];
+ uint i;
+
+ trn.short_id= num;
+ trn.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID;
+ for (i= 0; i < ITERATIONS; i++)
+ {
+ uint len= get_len();
+ lens[num][i]= len;
+ LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+
+ int2store(long_tr_id, num);
+ int4store(long_tr_id + 2, i);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &trn, NULL, 6, TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL))
+ {
+ fprintf(stderr, "Can't write LOGREC_FIXED_RECORD_0LSN_EXAMPLE record #%lu "
+ "thread %i\n", (ulong) i, num);
+ translog_destroy();
+ pthread_mutex_lock(&LOCK_thread_count);
+ ok(0, "write records");
+ pthread_mutex_unlock(&LOCK_thread_count);
+ return;
+ }
+ lsns1[num][i]= lsn;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_buffer;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= len;
+ if (translog_write_record(&lsn,
+ LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE,
+ &trn, NULL,
+ len, TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL))
+ {
+ fprintf(stderr, "Can't write variable record #%lu\n", (ulong) i);
+ translog_destroy();
+ pthread_mutex_lock(&LOCK_thread_count);
+ ok(0, "write records");
+ pthread_mutex_unlock(&LOCK_thread_count);
+ return;
+ }
+ lsns2[num][i]= lsn;
+ pthread_mutex_lock(&LOCK_thread_count);
+ ok(1, "write records");
+ pthread_mutex_unlock(&LOCK_thread_count);
+ }
+ return;
+}
+
+
+static void *test_thread_writer(void *arg)
+{
+ int param= *((int*) arg);
+
+ my_thread_init();
+
+ writer(param);
+
+ pthread_mutex_lock(&LOCK_thread_count);
+ thread_count--;
+ ok(1, "writer finished"); /* just to show progress */
+ VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are
+ ready */
+ pthread_mutex_unlock(&LOCK_thread_count);
+ free((uchar*) arg);
+ my_thread_end();
+ return(0);
+}
+
+
+int main(int argc __attribute__((unused)),
+ char **argv __attribute__ ((unused)))
+{
+ uint32 i;
+ uint pagen;
+ PAGECACHE pagecache;
+ LSN first_lsn;
+ TRANSLOG_HEADER_BUFFER rec;
+ struct st_translog_scanner_data scanner;
+ pthread_t tid;
+ pthread_attr_t thr_attr;
+ int *param, error;
+ int rc;
+
+ plan(WRITERS + ITERATIONS * WRITERS * 3);
+
+ bzero(&pagecache, sizeof(pagecache));
+ maria_data_root= ".";
+ long_buffer= malloc(LONG_BUFFER_SIZE + 7 * 2 + 2);
+ if (long_buffer == 0)
+ {
+ fprintf(stderr, "End of memory\n");
+ exit(1);
+ }
+ for (i= 0; i < (LONG_BUFFER_SIZE + 7 * 2 + 2); i++)
+ long_buffer[i]= (i & 0xFF);
+
+ MY_INIT(argv[0]);
+ if (maria_log_remove())
+ exit(1);
+
+
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+ default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+ default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+ if (argc > 1)
+ {
+ DBUG_SET(default_dbug_option);
+ DBUG_SET_INITIAL(default_dbug_option);
+ }
+#endif
+
+
+ if ((error= pthread_cond_init(&COND_thread_count, NULL)))
+ {
+ fprintf(stderr, "COND_thread_count: %d from pthread_cond_init "
+ "(errno: %d)\n", error, errno);
+ exit(1);
+ }
+ if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST)))
+ {
+ fprintf(stderr, "LOCK_thread_count: %d from pthread_cond_init "
+ "(errno: %d)\n", error, errno);
+ exit(1);
+ }
+ if ((error= pthread_attr_init(&thr_attr)))
+ {
+ fprintf(stderr, "Got error: %d from pthread_attr_init "
+ "(errno: %d)\n", error, errno);
+ exit(1);
+ }
+ if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED)))
+ {
+ fprintf(stderr,
+ "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n",
+ error, errno);
+ exit(1);
+ }
+
+#ifdef HAVE_THR_SETCONCURRENCY
+ VOID(thr_setconcurrency(2));
+#endif
+
+ my_thread_global_init();
+
+ if (ma_control_file_create_or_open(TRUE))
+ {
+ fprintf(stderr, "Can't init control file (%d)\n", errno);
+ exit(1);
+ }
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ TRANSLOG_PAGE_SIZE)) == 0)
+ {
+ fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+ exit(1);
+ }
+ if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, LOG_FLAGS))
+ {
+ fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+ translog_destroy();
+ exit(1);
+ }
+ example_loghandler_init();
+ /* Suppressing of automatic record writing */
+ dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+ srandom(122334817L);
+ {
+ LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+ uchar long_tr_id[6]=
+ {
+ 0x11, 0x22, 0x33, 0x44, 0x55, 0x66
+ };
+
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ dummy_transaction_object.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID;
+ if (translog_write_record(&first_lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL))
+ {
+ fprintf(stderr, "Can't write the first record\n");
+ translog_destroy();
+ exit(1);
+ }
+ }
+
+
+ if ((error= pthread_mutex_lock(&LOCK_thread_count)))
+ {
+ fprintf(stderr, "LOCK_thread_count: %d from pthread_mutex_lock "
+ "(errno: %d)\n", error, errno);
+ exit(1);
+ }
+
+ while (number_of_writers != 0)
+ {
+ param= (int*) malloc(sizeof(int));
+ *param= number_of_writers - 1;
+ if ((error= pthread_create(&tid, &thr_attr, test_thread_writer,
+ (void*) param)))
+ {
+ fprintf(stderr, "Got error: %d from pthread_create (errno: %d)\n",
+ error, errno);
+ exit(1);
+ }
+ thread_count++;
+ number_of_writers--;
+ }
+ pthread_mutex_unlock(&LOCK_thread_count);
+
+ pthread_attr_destroy(&thr_attr);
+
+ /* wait finishing */
+ if ((error= pthread_mutex_lock(&LOCK_thread_count)))
+ fprintf(stderr, "LOCK_thread_count: %d from pthread_mutex_lock\n", error);
+ while (thread_count)
+ {
+ if ((error= pthread_cond_wait(&COND_thread_count, &LOCK_thread_count)))
+ fprintf(stderr, "COND_thread_count: %d from pthread_cond_wait\n", error);
+ }
+ if ((error= pthread_mutex_unlock(&LOCK_thread_count)))
+ fprintf(stderr, "LOCK_thread_count: %d from pthread_mutex_unlock\n", error);
+
+ /* Find last LSN and flush up to it (all our log) */
+ {
+ LSN max= 0;
+ for (i= 0; i < WRITERS; i++)
+ {
+ if (cmp_translog_addr(lsns2[i][ITERATIONS - 1], max) > 0)
+ max= lsns2[i][ITERATIONS - 1];
+ }
+ translog_flush(max);
+ }
+
+ rc= 1;
+
+ {
+ uint indeces[WRITERS];
+ uint index, stage;
+ int len;
+ bzero(indeces, sizeof(uint) * WRITERS);
+
+ bzero(indeces, sizeof(indeces));
+
+ if (translog_init_scanner(first_lsn, 1, &scanner))
+ {
+ fprintf(stderr, "scanner init failed\n");
+ goto err;
+ }
+ for (i= 0;; i++)
+ {
+ len= translog_read_next_record_header(&scanner, &rec);
+
+ if (len == RECHEADER_READ_ERROR)
+ {
+ fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n",
+ i, errno);
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ if (len == RECHEADER_READ_EOF)
+ {
+ if (i != WRITERS * ITERATIONS * 2)
+ {
+ fprintf(stderr, "EOL met at iteration %u instead of %u\n",
+ i, ITERATIONS * WRITERS * 2);
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ break;
+ }
+ index= indeces[rec.short_trid] / 2;
+ stage= indeces[rec.short_trid] % 2;
+ if (stage == 0)
+ {
+ if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE ||
+ rec.record_length != 6 ||
+ uint2korr(rec.header) != rec.short_trid ||
+ index != uint4korr(rec.header + 2) ||
+ cmp_translog_addr(lsns1[rec.short_trid][index], rec.lsn) != 0)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
+ "data read(%d)\n"
+ "type %u, strid %u %u, len %u, i: %u %u, "
+ "lsn(%lu,0x%lx) (%lu,0x%lx)\n",
+ i, (uint) rec.type,
+ (uint) rec.short_trid, (uint) uint2korr(rec.header),
+ (uint) rec.record_length,
+ (uint) index, (uint) uint4korr(rec.header + 2),
+ LSN_IN_PARTS(rec.lsn),
+ LSN_IN_PARTS(lsns1[rec.short_trid][index]));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ }
+ else
+ {
+ if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE ||
+ len != 9 ||
+ rec.record_length != lens[rec.short_trid][index] ||
+ cmp_translog_addr(lsns2[rec.short_trid][index], rec.lsn) != 0 ||
+ check_content(rec.header, (uint)len))
+ {
+ fprintf(stderr,
+ "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE "
+ "data read(%d) "
+ "thread: %d, iteration %d, stage %d\n"
+ "type %u (%d), len %d, length %lu %lu (%d) "
+ "lsn(%lu,0x%lx) (%lu,0x%lx)\n",
+ i, (uint) rec.short_trid, index, stage,
+ (uint) rec.type, (rec.type !=
+ LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE),
+ len,
+ (ulong) rec.record_length, lens[rec.short_trid][index],
+ (rec.record_length != lens[rec.short_trid][index]),
+ LSN_IN_PARTS(rec.lsn),
+ LSN_IN_PARTS(lsns2[rec.short_trid][index]));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ if (read_and_check_content(&rec, long_buffer, 0))
+ {
+ fprintf(stderr,
+ "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE "
+ "in whole rec read lsn(%lu,0x%lx)\n",
+ LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ }
+ ok(1, "record read");
+ translog_free_record_header(&rec);
+ indeces[rec.short_trid]++;
+ }
+ }
+
+ rc= 0;
+err:
+ if (rc)
+ ok(0, "record read");
+ translog_destroy();
+ end_pagecache(&pagecache, 1);
+ ma_control_file_end();
+ if (maria_log_remove())
+ exit(1);
+
+ return(exit_status());
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_noflush-t.c b/storage/maria/unittest/ma_test_loghandler_noflush-t.c
new file mode 100644
index 00000000000..2c3afb9a76b
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_noflush-t.c
@@ -0,0 +1,132 @@
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+
+#define PCACHE_SIZE (1024*1024*10)
+#define PCACHE_PAGE TRANSLOG_PAGE_SIZE
+#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512)
+#define LOG_FLAGS 0
+
+static char *first_translog_file= (char*)"maria_log.00000001";
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+ uint pagen;
+ int rc= 1;
+ uchar long_tr_id[6];
+ PAGECACHE pagecache;
+ LSN first_lsn;
+ MY_STAT st;
+ TRANSLOG_HEADER_BUFFER rec;
+ LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+
+ MY_INIT(argv[0]);
+
+ plan(1);
+
+ bzero(&pagecache, sizeof(pagecache));
+ maria_data_root= ".";
+ if (maria_log_remove())
+ exit(1);
+ /* be sure that we have no logs in the directory*/
+ if (my_stat(CONTROL_FILE_BASE_NAME, &st, MYF(0)))
+ my_delete(CONTROL_FILE_BASE_NAME, MYF(0));
+ if (my_stat(first_translog_file, &st, MYF(0)))
+ my_delete(first_translog_file, MYF(0));
+
+ bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+ default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+ default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+ if (argc > 1)
+ {
+ DBUG_SET(default_dbug_option);
+ DBUG_SET_INITIAL(default_dbug_option);
+ }
+#endif
+
+ if (ma_control_file_create_or_open(TRUE))
+ {
+ fprintf(stderr, "Can't init control file (%d)\n", errno);
+ exit(1);
+ }
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ PCACHE_PAGE)) == 0)
+ {
+ fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+ exit(1);
+ }
+ if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, LOG_FLAGS))
+ {
+ fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+ translog_destroy();
+ exit(1);
+ }
+ example_loghandler_init();
+ /* Suppressing of automatic record writing */
+ dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+ int4store(long_tr_id, 0);
+ long_tr_id[5]= 0xff;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ if (translog_write_record(&first_lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL))
+ {
+ fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+ translog_destroy();
+ exit(1);
+ }
+
+ translog_size_t len= translog_read_record_header(first_lsn, &rec);
+ if (len == 0)
+ {
+ fprintf(stderr, "translog_read_record_header failed (%d)\n", errno);
+ goto err;
+ }
+ if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 ||
+ rec.record_length != 6 || uint4korr(rec.header) != 0 ||
+ ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF ||
+ first_lsn != rec.lsn)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
+ "data read(0)\n"
+ "type: %u (%d) strid: %u (%d) len: %u (%d) i: %u (%d), "
+ "4: %u (%d) 5: %u (%d) "
+ "lsn(%lu,0x%lx) (%d)\n",
+ (uint) rec.type, (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE),
+ (uint) rec.short_trid, (rec.short_trid != 0),
+ (uint) rec.record_length, (rec.record_length != 6),
+ (uint) uint4korr(rec.header), (uint4korr(rec.header) != 0),
+ (uint) rec.header[4], (((uchar)rec.header[4]) != 0),
+ (uint) rec.header[5], (((uchar)rec.header[5]) != 0xFF),
+ LSN_IN_PARTS(rec.lsn), (first_lsn != rec.lsn));
+ goto err;
+ }
+
+ ok(1, "read OK");
+ rc= 0;
+
+err:
+ translog_destroy();
+ end_pagecache(&pagecache, 1);
+ ma_control_file_end();
+ if (maria_log_remove())
+ exit(1);
+
+ exit(rc);
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_pagecache-t.c b/storage/maria/unittest/ma_test_loghandler_pagecache-t.c
new file mode 100644
index 00000000000..276640dfd17
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_pagecache-t.c
@@ -0,0 +1,159 @@
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+
+#define PCACHE_SIZE (1024*1024*10)
+#define PCACHE_PAGE TRANSLOG_PAGE_SIZE
+#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512)
+#define LOG_FLAGS 0
+
+static char *first_translog_file= (char*)"maria_log.00000001";
+static char *file1_name= (char*)"page_cache_test_file_1";
+static PAGECACHE_FILE file1;
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+ uint pagen;
+ uchar long_tr_id[6];
+ PAGECACHE pagecache;
+ LSN lsn;
+ MY_STAT st, *stat;
+ LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+
+ MY_INIT(argv[0]);
+
+ plan(1);
+
+ bzero(&pagecache, sizeof(pagecache));
+ maria_data_root= ".";
+ if (maria_log_remove())
+ exit(1);
+ /* be sure that we have no logs in the directory*/
+ if (my_stat(CONTROL_FILE_BASE_NAME, &st, MYF(0)))
+ my_delete(CONTROL_FILE_BASE_NAME, MYF(0));
+ if (my_stat(first_translog_file, &st, MYF(0)))
+ my_delete(first_translog_file, MYF(0));
+
+ bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+ default_dbug_option= "d:t:i:O,\\ma_test_loghandler_pagecache.trace";
+#else
+ default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler_pagecache.trace";
+#endif
+ if (argc > 1)
+ {
+ DBUG_SET(default_dbug_option);
+ DBUG_SET_INITIAL(default_dbug_option);
+ }
+#endif
+
+ if (ma_control_file_create_or_open(TRUE))
+ {
+ fprintf(stderr, "Can't init control file (%d)\n", errno);
+ exit(1);
+ }
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ PCACHE_PAGE)) == 0)
+ {
+ fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+ exit(1);
+ }
+ if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, LOG_FLAGS))
+ {
+ fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+ translog_destroy();
+ exit(1);
+ }
+ example_loghandler_init();
+ /* Suppressing of automatic record writing */
+ dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+ if ((stat= my_stat(first_translog_file, &st, MYF(0))) == 0)
+ {
+ fprintf(stderr, "There is no %s (%d)\n", first_translog_file, errno);
+ exit(1);
+ }
+ if (st.st_size != TRANSLOG_PAGE_SIZE)
+ {
+ fprintf(stderr,
+ "incorrect initial size of %s: %ld instead of %ld\n",
+ first_translog_file, (long)st.st_size, (long)TRANSLOG_PAGE_SIZE);
+ exit(1);
+ }
+ int4store(long_tr_id, 0);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ dummy_transaction_object.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL))
+ {
+ fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+ translog_destroy();
+ exit(1);
+ }
+
+ if ((file1.file= my_open(file1_name,
+ O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
+ {
+ fprintf(stderr, "Got error during file1 creation from open() (errno: %d)\n",
+ errno);
+ exit(1);
+ }
+ if (chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO) != 0)
+ {
+ fprintf(stderr, "Got error during file1 chmod() (errno: %d)\n",
+ errno);
+ exit(1);
+ }
+
+ {
+ uchar page[PCACHE_PAGE];
+
+ bzero(page, PCACHE_PAGE);
+#define PAGE_LSN_OFFSET 0
+ lsn_store(page + PAGE_LSN_OFFSET, lsn);
+ pagecache_write(&pagecache, &file1, 0, 3, (char*)page,
+ PAGECACHE_LSN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0);
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+ }
+ if ((stat= my_stat(first_translog_file, &st, MYF(0))) == 0)
+ {
+ fprintf(stderr, "can't stat %s (%d)\n", first_translog_file, errno);
+ exit(1);
+ }
+ if (st.st_size != TRANSLOG_PAGE_SIZE * 2)
+ {
+ fprintf(stderr,
+ "incorrect initial size of %s: %ld instead of %ld\n",
+ first_translog_file,
+ (long)st.st_size, (long)(TRANSLOG_PAGE_SIZE * 2));
+ ok(0, "log triggered");
+ exit(1);
+ }
+ ok(1, "log triggered");
+
+ translog_destroy();
+ end_pagecache(&pagecache, 1);
+ ma_control_file_end();
+ my_delete(CONTROL_FILE_BASE_NAME, MYF(0));
+ my_delete(first_translog_file, MYF(0));
+ my_delete(file1_name, MYF(0));
+
+ exit(0);
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_purge-t.c b/storage/maria/unittest/ma_test_loghandler_purge-t.c
new file mode 100644
index 00000000000..c638aa85ac6
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_purge-t.c
@@ -0,0 +1,176 @@
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+
+#define PCACHE_SIZE (1024*1024*10)
+#define PCACHE_PAGE TRANSLOG_PAGE_SIZE
+#define LOG_FILE_SIZE (4*1024L*1024L)
+#define LOG_FLAGS 0
+#define LONG_BUFFER_SIZE (LOG_FILE_SIZE + LOG_FILE_SIZE / 2)
+
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+ ulong i;
+ uint pagen;
+ uchar long_tr_id[6];
+ PAGECACHE pagecache;
+ LSN lsn;
+ LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+ uchar *long_buffer= malloc(LONG_BUFFER_SIZE);
+
+ MY_INIT(argv[0]);
+
+ plan(4);
+
+ bzero(&pagecache, sizeof(pagecache));
+ bzero(long_buffer, LONG_BUFFER_SIZE);
+ maria_data_root= ".";
+ if (maria_log_remove())
+ exit(1);
+
+ bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+ default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+ default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+ if (argc > 1)
+ {
+ DBUG_SET(default_dbug_option);
+ DBUG_SET_INITIAL(default_dbug_option);
+ }
+#endif
+
+ if (ma_control_file_create_or_open(TRUE))
+ {
+ fprintf(stderr, "Can't init control file (%d)\n", errno);
+ exit(1);
+ }
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ PCACHE_PAGE)) == 0)
+ {
+ fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+ exit(1);
+ }
+ if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, LOG_FLAGS))
+ {
+ fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+ translog_destroy();
+ exit(1);
+ }
+ example_loghandler_init();
+ /* Suppressing of automatic record writing */
+ dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+ /* write more then 1 file */
+ int4store(long_tr_id, 0);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL))
+ {
+ fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+ translog_destroy();
+ exit(1);
+ }
+
+ translog_purge(lsn);
+ if (!translog_is_file(1))
+ {
+ fprintf(stderr, "First file was removed after first record\n");
+ translog_destroy();
+ exit(1);
+ }
+ ok(1, "First is not removed");
+
+ for(i= 0; i < LOG_FILE_SIZE/6 && LSN_FILE_NO(lsn) == 1; i++)
+ {
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL))
+ {
+ fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+ translog_destroy();
+ exit(1);
+ }
+ }
+
+ translog_purge(lsn);
+ if (translog_is_file(1))
+ {
+ fprintf(stderr, "First file was not removed.\n");
+ translog_destroy();
+ exit(1);
+ }
+
+ ok(1, "First file is removed");
+
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_buffer;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= LONG_BUFFER_SIZE;
+ if (translog_write_record(&lsn,
+ LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, LONG_BUFFER_SIZE,
+ TRANSLOG_INTERNAL_PARTS + 1, parts, NULL))
+ {
+ fprintf(stderr, "Can't write variable record\n");
+ translog_destroy();
+ exit(1);
+ }
+
+ translog_purge(lsn);
+ if (!translog_is_file(2) || !translog_is_file(3))
+ {
+ fprintf(stderr, "Second file (%d) or third file (%d) is not present.\n",
+ translog_is_file(2), translog_is_file(3));
+ translog_destroy();
+ exit(1);
+ }
+
+ ok(1, "Second and third files are not removed");
+
+ int4store(long_tr_id, 0);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL))
+ {
+ fprintf(stderr, "Can't write last record\n");
+ translog_destroy();
+ exit(1);
+ }
+
+ translog_purge(lsn);
+ if (translog_is_file(2))
+ {
+ fprintf(stderr, "Second file is not removed\n");
+ translog_destroy();
+ exit(1);
+ }
+
+ ok(1, "Second file is removed");
+
+ translog_destroy();
+ end_pagecache(&pagecache, 1);
+ ma_control_file_end();
+ if (maria_log_remove())
+ exit(1);
+ exit(0);
+}
diff --git a/storage/maria/unittest/test_file.c b/storage/maria/unittest/test_file.c
new file mode 100644
index 00000000000..758d0bfa81b
--- /dev/null
+++ b/storage/maria/unittest/test_file.c
@@ -0,0 +1,68 @@
+#include <tap.h>
+#include <my_sys.h>
+#include <my_dir.h>
+#include "test_file.h"
+
+
+/*
+ Check that file contance correspond to descriptor
+
+ SYNOPSIS
+ test_file()
+ file File to test
+ file_name Path (and name) of file which is tested
+ size size of file
+ buff_size size of buffer which is enought to check the file
+ desc file descriptor to check with
+
+ RETURN
+ 1 file if OK
+ 0 error
+*/
+
+int test_file(PAGECACHE_FILE file, char *file_name,
+ off_t size, size_t buff_size, struct file_desc *desc)
+{
+ MY_STAT stat_buff, *stat;
+ unsigned char *buffr= malloc(buff_size);
+ off_t pos= 0;
+ size_t byte;
+ int step= 0;
+
+ if ((stat= my_stat(file_name, &stat_buff, MYF(0))) == NULL)
+ {
+ diag("Can't stat() %s (errno: %d)\n", file_name, errno);
+ return 0;
+ }
+ if (stat->st_size != size)
+ {
+ diag("file %s size is %lu (should be %lu)\n",
+ file_name, (ulong) stat->st_size, (ulong) size);
+ return 0;
+ }
+ /* check content */
+ my_seek(file.file, 0, SEEK_SET, MYF(0));
+ while (desc[step].length != 0)
+ {
+ if (my_read(file.file, (char*)buffr, desc[step].length, MYF(0)) !=
+ desc[step].length)
+ {
+ diag("Can't read %u bytes from %s (errno: %d)\n",
+ (uint)desc[step].length, file_name, errno);
+ return 0;
+ }
+ for (byte= 0; byte < desc[step].length; byte++)
+ {
+ if (buffr[byte] != desc[step].content)
+ {
+ diag("content of %s mismatch 0x%x in position %lu instead of 0x%x\n",
+ file_name, (uint) buffr[byte], (ulong) (pos + byte),
+ desc[step].content);
+ return 0;
+ }
+ }
+ pos+= desc[step].length;
+ step++;
+ }
+ return 1;
+}
diff --git a/storage/maria/unittest/test_file.h b/storage/maria/unittest/test_file.h
new file mode 100644
index 00000000000..293c692717e
--- /dev/null
+++ b/storage/maria/unittest/test_file.h
@@ -0,0 +1,14 @@
+#include <m_string.h>
+#include "../ma_pagecache.h"
+
+/*
+ File content descriptor
+*/
+struct file_desc
+{
+ unsigned int length;
+ unsigned char content;
+};
+
+int test_file(PAGECACHE_FILE file, char *file_name,
+ off_t size, size_t buff_size, struct file_desc *desc);
diff --git a/storage/maria/unittest/trnman-t.c b/storage/maria/unittest/trnman-t.c
new file mode 100644
index 00000000000..db137cf088c
--- /dev/null
+++ b/storage/maria/unittest/trnman-t.c
@@ -0,0 +1,194 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include <tap.h>
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <my_atomic.h>
+#include <lf.h>
+#include <m_string.h>
+#include "../trnman.h"
+
+pthread_mutex_t rt_mutex;
+pthread_attr_t attr;
+size_t stacksize= 0;
+#define STACK_SIZE (((int)stacksize-2048)*STACK_DIRECTION)
+
+int rt_num_threads;
+int litmus;
+
+/*
+ create and end (commit or rollback) transactions randomly
+*/
+#define MAX_ITER 100
+pthread_handler_t test_trnman(void *arg)
+{
+ uint x, y, i, n;
+ TRN *trn[MAX_ITER];
+ pthread_mutex_t mutexes[MAX_ITER];
+ pthread_cond_t conds[MAX_ITER];
+ int m= (*(int *)arg);
+
+ for (i= 0; i < MAX_ITER; i++)
+ {
+ pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST);
+ pthread_cond_init(&conds[i], 0);
+ }
+
+ for (x= ((int)(intptr)(&m)); m > 0; )
+ {
+ y= x= (x*LL(3628273133) + LL(1500450271)) % LL(9576890767); /* three prime numbers */
+ m-= n= x % MAX_ITER;
+ for (i= 0; i < n; i++)
+ {
+ trn[i]= trnman_new_trn(&mutexes[i], &conds[i], &m + STACK_SIZE);
+ if (!trn[i])
+ {
+ diag("trnman_new_trn() failed");
+ litmus++;
+ }
+ }
+ for (i= 0; i < n; i++)
+ {
+ y= (y*19 + 7) % 31;
+ trnman_end_trn(trn[i], y & 1);
+ }
+ }
+ for (i= 0; i < MAX_ITER; i++)
+ {
+ pthread_mutex_destroy(&mutexes[i]);
+ pthread_cond_destroy(&conds[i]);
+ }
+ pthread_mutex_lock(&rt_mutex);
+ rt_num_threads--;
+ pthread_mutex_unlock(&rt_mutex);
+
+ return 0;
+}
+#undef MAX_ITER
+
+void run_test(const char *test, pthread_handler handler, int n, int m)
+{
+ pthread_t *threads;
+ ulonglong now= my_getsystime();
+ int i;
+
+ litmus= 0;
+
+ threads= (pthread_t *)my_malloc(sizeof(void *)*n, MYF(0));
+ if (!threads)
+ {
+ diag("Out of memory");
+ abort();
+ }
+
+ diag("Testing %s with %d threads, %d iterations... ", test, n, m);
+ rt_num_threads= n;
+ for (i= 0; i < n ; i++)
+ if (pthread_create(threads+i, &attr, handler, &m))
+ {
+ diag("Could not create thread");
+ abort();
+ }
+ for (i= 0 ; i < n ; i++)
+ pthread_join(threads[i], 0);
+ now= my_getsystime()-now;
+ ok(litmus == 0, "Tested %s in %g secs (%d)", test, ((double)now)/1e7, litmus);
+ my_free((void*)threads, MYF(0));
+}
+
+#define ok_read_from(T1, T2, RES) \
+ i= trnman_can_read_from(trn[T1], trn[T2]->trid); \
+ ok(i == RES, "trn" #T1 " %s read from trn" #T2, i ? "can" : "cannot")
+#define start_transaction(T) \
+ trn[T]= trnman_new_trn(&mutexes[T], &conds[T], &i + STACK_SIZE)
+#define commit(T) trnman_commit_trn(trn[T])
+#define abort(T) trnman_abort_trn(trn[T])
+
+#define Ntrns 4
+void test_trnman_read_from()
+{
+ TRN *trn[Ntrns];
+ pthread_mutex_t mutexes[Ntrns];
+ pthread_cond_t conds[Ntrns];
+ int i;
+
+ for (i= 0; i < Ntrns; i++)
+ {
+ pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST);
+ pthread_cond_init(&conds[i], 0);
+ }
+
+ start_transaction(0); /* start trn1 */
+ start_transaction(1); /* start trn2 */
+ ok_read_from(1, 0, 0);
+ commit(0); /* commit trn1 */
+ start_transaction(2); /* start trn4 */
+ abort(2); /* abort trn4 */
+ start_transaction(3); /* start trn5 */
+ ok_read_from(3, 0, 1);
+ ok_read_from(3, 1, 0);
+ ok_read_from(3, 2, 0);
+ commit(1); /* commit trn2 */
+ ok_read_from(3, 1, 0);
+ commit(3); /* commit trn5 */
+
+ for (i= 0; i < Ntrns; i++)
+ {
+ pthread_mutex_destroy(&mutexes[i]);
+ pthread_cond_destroy(&conds[i]);
+ }
+}
+
+int main()
+{
+ my_init();
+
+ plan(6);
+
+ if (my_atomic_initialize())
+ return exit_status();
+
+ pthread_mutex_init(&rt_mutex, 0);
+ pthread_attr_init(&attr);
+#ifdef HAVE_PTHREAD_ATTR_GETSTACKSIZE
+ pthread_attr_getstacksize(&attr, &stacksize);
+ if (stacksize == 0)
+#endif
+ stacksize= PTHREAD_STACK_MIN;
+
+#define CYCLES 10000
+#define THREADS 10
+
+ trnman_init(0);
+
+ test_trnman_read_from();
+ run_test("trnman", test_trnman, THREADS, CYCLES);
+
+ diag("mallocs: %d", trnman_allocated_transactions);
+ {
+ ulonglong now= my_getsystime();
+ trnman_destroy();
+ now= my_getsystime()-now;
+ diag("trnman_destroy: %g", ((double)now)/1e7);
+ }
+
+ pthread_mutex_destroy(&rt_mutex);
+ my_end(0);
+ return exit_status();
+}
+
diff --git a/storage/myisam/Makefile.am b/storage/myisam/Makefile.am
index f50c312b8e4..4bd0b177daa 100644
--- a/storage/myisam/Makefile.am
+++ b/storage/myisam/Makefile.am
@@ -97,8 +97,8 @@ libmyisam_a_SOURCES = mi_open.c mi_extra.c mi_info.c mi_rkey.c \
mi_delete_table.c mi_rename.c mi_check.c \
mi_keycache.c mi_preload.c \
ft_parser.c ft_stopwords.c ft_static.c \
- ft_update.c ft_boolean_search.c ft_nlq_search.c sort.c \
- ha_myisam.cc \
+ ft_update.c ft_boolean_search.c ft_nlq_search.c \
+ sort.c ha_myisam.cc ft_myisam.c \
rt_index.c rt_key.c rt_mbr.c rt_split.c sp_key.c
CLEANFILES = test?.MY? FT?.MY? isam.log mi_test_all rt_test.MY? sp_test.MY?
diff --git a/storage/myisam/ft_boolean_search.c b/storage/myisam/ft_boolean_search.c
index 15f4e1e1d34..85342c6e0ca 100644
--- a/storage/myisam/ft_boolean_search.c
+++ b/storage/myisam/ft_boolean_search.c
@@ -162,7 +162,7 @@ static int FTB_WORD_cmp(my_off_t *v, FTB_WORD *a, FTB_WORD *b)
static int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b)
{
/* ORDER BY word DESC, ndepth DESC */
- int i= mi_compare_text(cs, (uchar*) (*b)->word+1,(*b)->len-1,
+ int i= ha_compare_text(cs, (uchar*) (*b)->word+1,(*b)->len-1,
(uchar*) (*a)->word+1,(*a)->len-1,0,0);
if (!i)
i=CMP_NUM((*b)->ndepth,(*a)->ndepth);
@@ -196,7 +196,7 @@ static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param,
case FT_TOKEN_WORD:
ftbw= (FTB_WORD *)alloc_root(&ftb_param->ftb->mem_root,
sizeof(FTB_WORD) +
- (info->trunc ? MI_MAX_KEY_BUFF :
+ (info->trunc ? HA_MAX_KEY_BUFF :
word_len * ftb_param->ftb->charset->mbmaxlen +
HA_FT_WLEN +
ftb_param->ftb->info->s->rec_reflength));
@@ -345,7 +345,6 @@ static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search)
uint off, extra=HA_FT_WLEN+info->s->base.rec_reflength;
uchar *lastkey_buf=ftbw->word+ftbw->off;
- LINT_INIT(off);
if (ftbw->flags & FTB_FLAG_TRUNC)
lastkey_buf+=ftbw->len;
@@ -395,7 +394,7 @@ static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search)
if (!r && !ftbw->off)
{
- r= mi_compare_text(ftb->charset,
+ r= ha_compare_text(ftb->charset,
info->lastkey+1,
info->lastkey_length-extra-1,
(uchar*) ftbw->word+1,
@@ -868,7 +867,7 @@ static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param,
for (a= 0, b= ftb->queue.elements, c= (a+b)/2; b-a>1; c= (a+b)/2)
{
ftbw= ftb->list[c];
- if (mi_compare_text(ftb->charset, (uchar*)word, len,
+ if (ha_compare_text(ftb->charset, (uchar*)word, len,
(uchar*)ftbw->word+1, ftbw->len-1,
(my_bool)(ftbw->flags&FTB_FLAG_TRUNC), 0) > 0)
b= c;
@@ -878,7 +877,7 @@ static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param,
for (; c >= 0; c--)
{
ftbw= ftb->list[c];
- if (mi_compare_text(ftb->charset, (uchar*)word, len,
+ if (ha_compare_text(ftb->charset, (uchar*)word, len,
(uchar*)ftbw->word + 1,ftbw->len - 1,
(my_bool)(ftbw->flags & FTB_FLAG_TRUNC), 0))
break;
diff --git a/storage/myisam/ft_eval.c b/storage/myisam/ft_eval.c
index 7eb78861e5e..de01510fdd7 100644
--- a/storage/myisam/ft_eval.c
+++ b/storage/myisam/ft_eval.c
@@ -48,7 +48,7 @@ int main(int argc, char *argv[])
recinfo[0].type=FIELD_SKIP_ENDSPACE;
recinfo[0].length=docid_length;
recinfo[1].type=FIELD_BLOB;
- recinfo[1].length= 4+mi_portable_sizeof_char_ptr;
+ recinfo[1].length= 4+portable_sizeof_char_ptr;
/* Define a key over the first column */
keyinfo[0].seg=keyseg;
diff --git a/storage/myisam/ft_myisam.c b/storage/myisam/ft_myisam.c
new file mode 100644
index 00000000000..bef3fbfd5f5
--- /dev/null
+++ b/storage/myisam/ft_myisam.c
@@ -0,0 +1,36 @@
+/* Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/*
+ This function is for interface functions between fulltext and myisam
+*/
+
+#include "ftdefs.h"
+
+FT_INFO *ft_init_search(uint flags, void *info, uint keynr,
+ uchar *query, uint query_len, CHARSET_INFO *cs,
+ uchar *record)
+{
+ FT_INFO *res;
+ if (flags & FT_BOOL)
+ res= ft_init_boolean_search((MI_INFO *)info, keynr, query, query_len,cs);
+ else
+ res= ft_init_nlq_search((MI_INFO *)info, keynr, query, query_len, flags,
+ record);
+ return res;
+}
diff --git a/storage/myisam/ft_nlq_search.c b/storage/myisam/ft_nlq_search.c
index 282fa6751d8..b3a2e47a382 100644
--- a/storage/myisam/ft_nlq_search.c
+++ b/storage/myisam/ft_nlq_search.c
@@ -103,7 +103,7 @@ static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio)
{
if (keylen &&
- mi_compare_text(aio->charset,info->lastkey+1,
+ ha_compare_text(aio->charset,info->lastkey+1,
info->lastkey_length-extra-1, keybuff+1,keylen-1,0,0))
break;
diff --git a/storage/myisam/ft_parser.c b/storage/myisam/ft_parser.c
index df2423aa50f..042a999fffa 100644
--- a/storage/myisam/ft_parser.c
+++ b/storage/myisam/ft_parser.c
@@ -31,7 +31,7 @@ typedef struct st_my_ft_parser_param
static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2)
{
- return mi_compare_text(cs, (uchar*) w1->pos, w1->len,
+ return ha_compare_text(cs, (uchar*) w1->pos, w1->len,
(uchar*) w2->pos, w2->len, 0, 0);
}
diff --git a/storage/myisam/ft_static.c b/storage/myisam/ft_static.c
index 610c20eede6..d48bedc9e3b 100644
--- a/storage/myisam/ft_static.c
+++ b/storage/myisam/ft_static.c
@@ -54,20 +54,6 @@ const struct _ft_vft _ft_vft_boolean = {
ft_boolean_get_relevance, ft_boolean_reinit_search
};
-
-FT_INFO *ft_init_search(uint flags, void *info, uint keynr,
- uchar *query, uint query_len, CHARSET_INFO *cs,
- uchar *record)
-{
- FT_INFO *res;
- if (flags & FT_BOOL)
- res= ft_init_boolean_search((MI_INFO *)info, keynr, query, query_len,cs);
- else
- res= ft_init_nlq_search((MI_INFO *)info, keynr, query, query_len, flags,
- record);
- return res;
-}
-
const char *ft_stopword_file = 0;
const char *ft_precompiled_stopwords[] = {
diff --git a/storage/myisam/ft_stopwords.c b/storage/myisam/ft_stopwords.c
index 59866d9a351..8aefffbee1d 100644
--- a/storage/myisam/ft_stopwords.c
+++ b/storage/myisam/ft_stopwords.c
@@ -29,7 +29,7 @@ static TREE *stopwords3=NULL;
static int FT_STOPWORD_cmp(void* cmp_arg __attribute__((unused)),
FT_STOPWORD *w1, FT_STOPWORD *w2)
{
- return mi_compare_text(default_charset_info,
+ return ha_compare_text(default_charset_info,
(uchar *)w1->pos,w1->len,
(uchar *)w2->pos,w2->len,0,0);
}
@@ -51,10 +51,11 @@ static int ft_add_stopword(const char *w)
int ft_init_stopwords()
{
+ DBUG_ENTER("ft_init_stopwords");
if (!stopwords3)
{
if (!(stopwords3=(TREE *)my_malloc(sizeof(TREE),MYF(0))))
- return -1;
+ DBUG_RETURN(-1);
init_tree(stopwords3,0,0,sizeof(FT_STOPWORD),(qsort_cmp2)&FT_STOPWORD_cmp,
0,
(ft_stopword_file ? (tree_element_free)&FT_STOPWORD_free : 0),
@@ -70,10 +71,10 @@ int ft_init_stopwords()
int error=-1;
if (!*ft_stopword_file)
- return 0;
+ DBUG_RETURN(0);
if ((fd=my_open(ft_stopword_file, O_RDONLY, MYF(MY_WME))) == -1)
- return -1;
+ DBUG_RETURN(-1);
len=(uint)my_seek(fd, 0L, MY_SEEK_END, MYF(0));
my_seek(fd, 0L, MY_SEEK_SET, MYF(0));
if (!(start=buffer=my_malloc(len+1, MYF(MY_WME))))
@@ -90,7 +91,7 @@ err1:
my_free(buffer, MYF(0));
err0:
my_close(fd, MYF(MY_WME));
- return error;
+ DBUG_RETURN(error);
}
else
{
@@ -100,13 +101,14 @@ err0:
for (;*sws;sws++)
{
if (ft_add_stopword(*sws))
- return -1;
+ DBUG_RETURN(-1);
}
ft_stopword_file="(built-in)"; /* for SHOW VARIABLES */
}
- return 0;
+ DBUG_RETURN(0);
}
+
int is_stopword(char *word, uint len)
{
FT_STOPWORD sw;
@@ -118,6 +120,8 @@ int is_stopword(char *word, uint len)
void ft_free_stopwords()
{
+ DBUG_ENTER("ft_free_stopwords");
+
if (stopwords3)
{
delete_tree(stopwords3); /* purecov: inspected */
@@ -125,4 +129,5 @@ void ft_free_stopwords()
stopwords3=0;
}
ft_stopword_file= 0;
+ DBUG_VOID_RETURN;
}
diff --git a/storage/myisam/ft_test1.c b/storage/myisam/ft_test1.c
index e49c47bb268..b37935a0d7a 100644
--- a/storage/myisam/ft_test1.c
+++ b/storage/myisam/ft_test1.c
@@ -75,12 +75,12 @@ static int run_test(const char *filename)
/* First define 2 columns */
recinfo[0].type=extra_field;
- recinfo[0].length= (extra_field == FIELD_BLOB ? 4 + mi_portable_sizeof_char_ptr :
+ recinfo[0].length= (extra_field == FIELD_BLOB ? 4 + portable_sizeof_char_ptr :
extra_length);
if (extra_field == FIELD_VARCHAR)
recinfo[0].length+= HA_VARCHAR_PACKLENGTH(extra_length);
recinfo[1].type=key_field;
- recinfo[1].length= (key_field == FIELD_BLOB ? 4+mi_portable_sizeof_char_ptr :
+ recinfo[1].length= (key_field == FIELD_BLOB ? 4+portable_sizeof_char_ptr :
key_length);
if (key_field == FIELD_VARCHAR)
recinfo[1].length+= HA_VARCHAR_PACKLENGTH(key_length);
diff --git a/storage/myisam/ft_update.c b/storage/myisam/ft_update.c
index e3e4c62158f..d1548e32870 100644
--- a/storage/myisam/ft_update.c
+++ b/storage/myisam/ft_update.c
@@ -180,7 +180,7 @@ int _mi_ft_cmp(MI_INFO *info, uint keynr, const uchar *rec1, const uchar *rec2)
{
if ((ftsi1.pos != ftsi2.pos) &&
(!ftsi1.pos || !ftsi2.pos ||
- mi_compare_text(cs, (uchar*) ftsi1.pos,ftsi1.len,
+ ha_compare_text(cs, (uchar*) ftsi1.pos,ftsi1.len,
(uchar*) ftsi2.pos,ftsi2.len,0,0)))
DBUG_RETURN(THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT);
}
@@ -209,7 +209,7 @@ int _mi_ft_update(MI_INFO *info, uint keynr, uchar *keybuf,
error=0;
while(old_word->pos && new_word->pos)
{
- cmp= mi_compare_text(cs, (uchar*) old_word->pos,old_word->len,
+ cmp= ha_compare_text(cs, (uchar*) old_word->pos,old_word->len,
(uchar*) new_word->pos,new_word->len,0,0);
cmp2= cmp ? 0 : (fabs(old_word->weight - new_word->weight) > 1.e-5);
diff --git a/storage/myisam/fulltext.h b/storage/myisam/fulltext.h
index 856e93e034d..9aef2d0d002 100644
--- a/storage/myisam/fulltext.h
+++ b/storage/myisam/fulltext.h
@@ -20,18 +20,8 @@
#include "myisamdef.h"
#include "ft_global.h"
-#define HA_FT_WTYPE HA_KEYTYPE_FLOAT
-#define HA_FT_WLEN 4
-#define FT_SEGS 2
-
-#define ft_sintXkorr(A) mi_sint4korr(A)
-#define ft_intXstore(T,A) mi_int4store(T,A)
-
-extern const HA_KEYSEG ft_keysegs[FT_SEGS];
-
int _mi_ft_cmp(MI_INFO *, uint, const uchar *, const uchar *);
int _mi_ft_add(MI_INFO *, uint, uchar *, const uchar *, my_off_t);
int _mi_ft_del(MI_INFO *, uint, uchar *, const uchar *, my_off_t);
uint _mi_ft_convert_to_ft2(MI_INFO *, uint, uchar *);
-
diff --git a/storage/myisam/ha_myisam.cc b/storage/myisam/ha_myisam.cc
index ca4c40547ee..5e58565364c 100644
--- a/storage/myisam/ha_myisam.cc
+++ b/storage/myisam/ha_myisam.cc
@@ -22,6 +22,7 @@
#include "mysql_priv.h"
#include <mysql/plugin.h>
#include <m_ctype.h>
+#include <my_bit.h>
#include <myisampack.h>
#include "ha_myisam.h"
#include <stdarg.h>
@@ -56,7 +57,7 @@ static handler *myisam_create_handler(handlerton *hton,
// collect errors printed by mi_check routines
-static void mi_check_print_msg(MI_CHECK *param, const char* msg_type,
+static void mi_check_print_msg(HA_CHECK *param, const char* msg_type,
const char *fmt, va_list args)
{
THD* thd = (THD*)param->thd;
@@ -251,7 +252,8 @@ int table2myisam(TABLE *table_arg, MI_KEYDEF **keydef_out,
DBUG_PRINT("loop", ("found: 0x%lx recpos: %d minpos: %d length: %d",
(long) found, recpos, minpos, length));
if (recpos != minpos)
- { // Reserved space (Null bits?)
+ {
+ /* reserve space for null bits */
bzero((char*) recinfo_pos, sizeof(*recinfo_pos));
recinfo_pos->type= (int) FIELD_NORMAL;
recinfo_pos++->length= (uint16) (minpos - recpos);
@@ -300,7 +302,7 @@ int table2myisam(TABLE *table_arg, MI_KEYDEF **keydef_out,
Check for underlying table conformance
SYNOPSIS
- check_definition()
+ myisam_check_definition()
t1_keyinfo in First table key definition
t1_recinfo in First table record definition
t1_keys in Number of keys in first table
@@ -442,13 +444,13 @@ int check_definition(MI_KEYDEF *t1_keyinfo, MI_COLUMNDEF *t1_recinfo,
extern "C" {
-volatile int *killed_ptr(MI_CHECK *param)
+volatile int *killed_ptr(HA_CHECK *param)
{
/* In theory Unsafe conversion, but should be ok for now */
return (int*) &(((THD *)(param->thd))->killed);
}
-void mi_check_print_error(MI_CHECK *param, const char *fmt,...)
+void mi_check_print_error(HA_CHECK *param, const char *fmt,...)
{
param->error_printed|=1;
param->out_flag|= O_DATA_LOST;
@@ -458,7 +460,7 @@ void mi_check_print_error(MI_CHECK *param, const char *fmt,...)
va_end(args);
}
-void mi_check_print_info(MI_CHECK *param, const char *fmt,...)
+void mi_check_print_info(HA_CHECK *param, const char *fmt,...)
{
va_list args;
va_start(args, fmt);
@@ -466,7 +468,7 @@ void mi_check_print_info(MI_CHECK *param, const char *fmt,...)
va_end(args);
}
-void mi_check_print_warning(MI_CHECK *param, const char *fmt,...)
+void mi_check_print_warning(HA_CHECK *param, const char *fmt,...)
{
param->warning_printed=1;
param->out_flag|= O_DATA_LOST;
@@ -721,7 +723,7 @@ int ha_myisam::check(THD* thd, HA_CHECK_OPT* check_opt)
{
if (!file) return HA_ADMIN_INTERNAL_ERROR;
int error;
- MI_CHECK param;
+ HA_CHECK param;
MYISAM_SHARE* share = file->s;
const char *old_proc_info=thd->proc_info;
@@ -732,7 +734,7 @@ int ha_myisam::check(THD* thd, HA_CHECK_OPT* check_opt)
param.db_name= table->s->db.str;
param.table_name= table->alias;
param.testflag = check_opt->flags | T_CHECK | T_SILENT;
- param.stats_method= (enum_mi_stats_method)thd->variables.myisam_stats_method;
+ param.stats_method= (enum_handler_stats_method)thd->variables.myisam_stats_method;
if (!(table->db_stat & HA_READ_ONLY))
param.testflag|= T_STATISTICS;
@@ -813,7 +815,7 @@ int ha_myisam::check(THD* thd, HA_CHECK_OPT* check_opt)
int ha_myisam::analyze(THD *thd, HA_CHECK_OPT* check_opt)
{
int error=0;
- MI_CHECK param;
+ HA_CHECK param;
MYISAM_SHARE* share = file->s;
myisamchk_init(&param);
@@ -824,7 +826,7 @@ int ha_myisam::analyze(THD *thd, HA_CHECK_OPT* check_opt)
param.testflag= (T_FAST | T_CHECK | T_SILENT | T_STATISTICS |
T_DONT_CHECK_CHECKSUM);
param.using_global_keycache = 1;
- param.stats_method= (enum_mi_stats_method)thd->variables.myisam_stats_method;
+ param.stats_method= (enum_handler_stats_method)thd->variables.myisam_stats_method;
if (!(share->state.changed & STATE_NOT_ANALYZED))
return HA_ADMIN_ALREADY_DONE;
@@ -873,7 +875,7 @@ int ha_myisam::restore(THD* thd, HA_CHECK_OPT *check_opt)
err:
{
- MI_CHECK param;
+ HA_CHECK param;
myisamchk_init(&param);
param.thd= thd;
param.op_name= "restore";
@@ -936,7 +938,7 @@ int ha_myisam::backup(THD* thd, HA_CHECK_OPT *check_opt)
err:
{
- MI_CHECK param;
+ HA_CHECK param;
myisamchk_init(&param);
param.thd= thd;
param.op_name= "backup";
@@ -952,7 +954,7 @@ int ha_myisam::backup(THD* thd, HA_CHECK_OPT *check_opt)
int ha_myisam::repair(THD* thd, HA_CHECK_OPT *check_opt)
{
int error;
- MI_CHECK param;
+ HA_CHECK param;
ha_rows start_records;
if (!file) return HA_ADMIN_INTERNAL_ERROR;
@@ -1002,7 +1004,7 @@ int ha_myisam::optimize(THD* thd, HA_CHECK_OPT *check_opt)
{
int error;
if (!file) return HA_ADMIN_INTERNAL_ERROR;
- MI_CHECK param;
+ HA_CHECK param;
myisamchk_init(&param);
param.thd = thd;
@@ -1021,7 +1023,7 @@ int ha_myisam::optimize(THD* thd, HA_CHECK_OPT *check_opt)
}
-int ha_myisam::repair(THD *thd, MI_CHECK &param, bool do_optimize)
+int ha_myisam::repair(THD *thd, HA_CHECK &param, bool do_optimize)
{
int error=0;
uint local_testflag=param.testflag;
@@ -1209,7 +1211,7 @@ int ha_myisam::assign_to_keycache(THD* thd, HA_CHECK_OPT *check_opt)
if (error != HA_ADMIN_OK)
{
/* Send error to user */
- MI_CHECK param;
+ HA_CHECK param;
myisamchk_init(&param);
param.thd= thd;
param.op_name= "assign_to_keycache";
@@ -1273,7 +1275,7 @@ int ha_myisam::preload_keys(THD* thd, HA_CHECK_OPT *check_opt)
err:
{
- MI_CHECK param;
+ HA_CHECK param;
myisamchk_init(&param);
param.thd= thd;
param.op_name= "preload_keys";
@@ -1380,7 +1382,7 @@ int ha_myisam::enable_indexes(uint mode)
else if (mode == HA_KEY_SWITCH_NONUNIQ_SAVE)
{
THD *thd=current_thd;
- MI_CHECK param;
+ HA_CHECK param;
const char *save_proc_info=thd->proc_info;
thd->proc_info="Creating index";
myisamchk_init(&param);
@@ -1389,7 +1391,8 @@ int ha_myisam::enable_indexes(uint mode)
T_CREATE_MISSING_KEYS);
param.myf_rw&= ~MY_WAIT_IF_FULL;
param.sort_buffer_length= thd->variables.myisam_sort_buff_size;
- param.stats_method= (enum_mi_stats_method)thd->variables.myisam_stats_method;
+ param.stats_method=
+ (enum_handler_stats_method)thd->variables.myisam_stats_method;
param.tmpdir=&mysql_tmpdir_list;
if ((error= (repair(thd,param,0) != HA_ADMIN_OK)) && param.retry_repair)
{
@@ -1890,7 +1893,7 @@ void ha_myisam::get_auto_increment(ulonglong offset, ulonglong increment,
{
ulonglong nr;
int error;
- uchar key[MI_MAX_KEY_LENGTH];
+ uchar key[HA_MAX_KEY_LENGTH];
if (!table->s->next_number_key_offset)
{ // Autoincrement at key-start
diff --git a/storage/myisam/ha_myisam.h b/storage/myisam/ha_myisam.h
index e8594fc9039..96440b74c9d 100644
--- a/storage/myisam/ha_myisam.h
+++ b/storage/myisam/ha_myisam.h
@@ -21,6 +21,7 @@
/* class for the the myisam handler */
#include <myisam.h>
+#include <myisamchk.h>
#include <ft_global.h>
#define HA_RECOVER_NONE 0 /* No automatic recover */
@@ -39,7 +40,7 @@ class ha_myisam: public handler
ulonglong int_table_flags;
char *data_file_name, *index_file_name;
bool can_enable_indexes;
- int repair(THD *thd, MI_CHECK &param, bool optimize);
+ int repair(THD *thd, HA_CHECK &param, bool optimize);
public:
ha_myisam(handlerton *hton, TABLE_SHARE *table_arg);
@@ -56,8 +57,8 @@ class ha_myisam: public handler
HA_READ_ORDER | HA_KEYREAD_ONLY);
}
uint max_supported_keys() const { return MI_MAX_KEY; }
- uint max_supported_key_length() const { return MI_MAX_KEY_LENGTH; }
- uint max_supported_key_part_length() const { return MI_MAX_KEY_LENGTH; }
+ uint max_supported_key_length() const { return HA_MAX_KEY_LENGTH; }
+ uint max_supported_key_part_length() const { return HA_MAX_KEY_LENGTH; }
uint checksum() const;
int open(const char *name, int mode, uint test_if_locked);
diff --git a/storage/myisam/mi_check.c b/storage/myisam/mi_check.c
index fe6b716877c..b41f06a5fb8 100644
--- a/storage/myisam/mi_check.c
+++ b/storage/myisam/mi_check.c
@@ -59,14 +59,14 @@
/* Functions defined in this file */
-static int check_k_link(MI_CHECK *param, MI_INFO *info,uint nr);
-static int chk_index(MI_CHECK *param, MI_INFO *info,MI_KEYDEF *keyinfo,
+static int check_k_link(HA_CHECK *param, MI_INFO *info,uint nr);
+static int chk_index(HA_CHECK *param, MI_INFO *info,MI_KEYDEF *keyinfo,
my_off_t page, uchar *buff, ha_rows *keys,
ha_checksum *key_checksum, uint level);
static uint isam_key_length(MI_INFO *info,MI_KEYDEF *keyinfo);
static ha_checksum calc_checksum(ha_rows count);
static int writekeys(MI_SORT_PARAM *sort_param);
-static int sort_one_index(MI_CHECK *param, MI_INFO *info,MI_KEYDEF *keyinfo,
+static int sort_one_index(HA_CHECK *param, MI_INFO *info,MI_KEYDEF *keyinfo,
my_off_t pagepos, File new_file);
static int sort_key_read(MI_SORT_PARAM *sort_param,void *key);
static int sort_ft_key_read(MI_SORT_PARAM *sort_param,void *key);
@@ -80,13 +80,13 @@ static int sort_insert_key(MI_SORT_PARAM *sort_param,
reg1 SORT_KEY_BLOCKS *key_block,
uchar *key, my_off_t prev_block);
static int sort_delete_record(MI_SORT_PARAM *sort_param);
-/*static int flush_pending_blocks(MI_CHECK *param);*/
-static SORT_KEY_BLOCKS *alloc_key_blocks(MI_CHECK *param, uint blocks,
+/*static int flush_pending_blocks(HA_CHECK *param);*/
+static SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks,
uint buffer_length);
static ha_checksum mi_byte_checksum(const uchar *buf, uint length);
-static void set_data_file_type(SORT_INFO *sort_info, MYISAM_SHARE *share);
+static void set_data_file_type(MI_SORT_INFO *sort_info, MYISAM_SHARE *share);
-void myisamchk_init(MI_CHECK *param)
+void myisamchk_init(HA_CHECK *param)
{
bzero((uchar*) param,sizeof(*param));
param->opt_follow_links=1;
@@ -108,7 +108,7 @@ void myisamchk_init(MI_CHECK *param)
/* Check the status flags for the table */
-int chk_status(MI_CHECK *param, register MI_INFO *info)
+int chk_status(HA_CHECK *param, register MI_INFO *info)
{
MYISAM_SHARE *share=info->s;
@@ -136,7 +136,7 @@ int chk_status(MI_CHECK *param, register MI_INFO *info)
/* Check delete links */
-int chk_del(MI_CHECK *param, register MI_INFO *info, uint test_flag)
+int chk_del(HA_CHECK *param, register MI_INFO *info, uint test_flag)
{
reg2 ha_rows i;
uint delete_link_length;
@@ -245,7 +245,7 @@ wrong:
/* Check delete links in index file */
-static int check_k_link(MI_CHECK *param, register MI_INFO *info, uint nr)
+static int check_k_link(HA_CHECK *param, register MI_INFO *info, uint nr)
{
my_off_t next_link;
uint block_size=(nr+1)*MI_MIN_KEY_BLOCK_LENGTH;
@@ -323,7 +323,7 @@ static int check_k_link(MI_CHECK *param, register MI_INFO *info, uint nr)
/* Check sizes of files */
-int chk_size(MI_CHECK *param, register MI_INFO *info)
+int chk_size(HA_CHECK *param, register MI_INFO *info)
{
int error=0;
register my_off_t skr,size;
@@ -399,7 +399,7 @@ int chk_size(MI_CHECK *param, register MI_INFO *info)
/* Check keys */
-int chk_key(MI_CHECK *param, register MI_INFO *info)
+int chk_key(HA_CHECK *param, register MI_INFO *info)
{
uint key,found_keys=0,full_text_keys=0,result=0;
ha_rows keys;
@@ -584,7 +584,7 @@ do_stat:
} /* chk_key */
-static int chk_index_down(MI_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo,
+static int chk_index_down(HA_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo,
my_off_t page, uchar *buff, ha_rows *keys,
ha_checksum *key_checksum, uint level)
{
@@ -731,13 +731,13 @@ int mi_collect_stats_nonulls_next(HA_KEYSEG *keyseg, ulonglong *notnull,
/* Check if index is ok */
-static int chk_index(MI_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo,
+static int chk_index(HA_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo,
my_off_t page, uchar *buff, ha_rows *keys,
ha_checksum *key_checksum, uint level)
{
int flag;
uint used_length,comp_flag,nod_flag,key_length=0;
- uchar key[MI_MAX_POSSIBLE_KEY_BUFF],*temp_buff,*keypos,*old_keypos,*endpos;
+ uchar key[HA_MAX_POSSIBLE_KEY_BUFF],*temp_buff,*keypos,*old_keypos,*endpos;
my_off_t next_page,record;
char llbuff[22];
uint diff_pos[2];
@@ -934,7 +934,7 @@ static uint isam_key_length(MI_INFO *info, register MI_KEYDEF *keyinfo)
/* Check that record-link is ok */
-int chk_data_link(MI_CHECK *param, MI_INFO *info,int extend)
+int chk_data_link(HA_CHECK *param, MI_INFO *info,int extend)
{
int error,got_error,flag;
uint key,left_length,b_type,field;
@@ -944,7 +944,7 @@ int chk_data_link(MI_CHECK *param, MI_INFO *info,int extend)
uchar *record,*to;
char llbuff[22],llbuff2[22],llbuff3[22];
ha_checksum intern_record_checksum;
- ha_checksum key_checksum[MI_MAX_POSSIBLE_KEY];
+ ha_checksum key_checksum[HA_MAX_POSSIBLE_KEY];
my_bool static_row_size;
MI_KEYDEF *keyinfo;
MI_BLOCK_INFO block_info;
@@ -992,6 +992,9 @@ int chk_data_link(MI_CHECK *param, MI_INFO *info,int extend)
if (*killed_ptr(param))
goto err2;
switch (info->s->data_file_type) {
+ case BLOCK_RECORD:
+ DBUG_ASSERT(0); /* Impossible */
+ break;
case STATIC_RECORD:
if (my_b_read(&param->read_cache,(uchar*) record,
info->s->base.pack_reclength))
@@ -1379,7 +1382,7 @@ int chk_data_link(MI_CHECK *param, MI_INFO *info,int extend)
/* Recover old table by reading each record and writing all keys */
/* Save new datafile-name in temp_filename */
-int mi_repair(MI_CHECK *param, register MI_INFO *info,
+int mi_repair(HA_CHECK *param, register MI_INFO *info,
char * name, int rep_quick)
{
int error,got_error;
@@ -1389,7 +1392,7 @@ int mi_repair(MI_CHECK *param, register MI_INFO *info,
File new_file;
MYISAM_SHARE *share=info->s;
char llbuff[22],llbuff2[22];
- SORT_INFO sort_info;
+ MI_SORT_INFO sort_info;
MI_SORT_PARAM sort_param;
DBUG_ENTER("mi_repair");
@@ -1772,7 +1775,7 @@ int movepoint(register MI_INFO *info, uchar *record, my_off_t oldpos,
/* Tell system that we want all memory for our cache */
-void lock_memory(MI_CHECK *param __attribute__((unused)))
+void lock_memory(HA_CHECK *param __attribute__((unused)))
{
#ifdef SUN_OS /* Key-cacheing thrases on sun 4.1 */
if (param->opt_lock_memory)
@@ -1788,7 +1791,7 @@ void lock_memory(MI_CHECK *param __attribute__((unused)))
/* Flush all changed blocks to disk */
-int flush_blocks(MI_CHECK *param, KEY_CACHE *key_cache, File file)
+int flush_blocks(HA_CHECK *param, KEY_CACHE *key_cache, File file)
{
if (flush_key_blocks(key_cache, file, FLUSH_RELEASE))
{
@@ -1803,12 +1806,12 @@ int flush_blocks(MI_CHECK *param, KEY_CACHE *key_cache, File file)
/* Sort index for more efficent reads */
-int mi_sort_index(MI_CHECK *param, register MI_INFO *info, char * name)
+int mi_sort_index(HA_CHECK *param, register MI_INFO *info, char * name)
{
reg2 uint key;
reg1 MI_KEYDEF *keyinfo;
File new_file;
- my_off_t index_pos[MI_MAX_POSSIBLE_KEY];
+ my_off_t index_pos[HA_MAX_POSSIBLE_KEY];
uint r_locks,w_locks;
int old_lock;
MYISAM_SHARE *share=info->s;
@@ -1903,12 +1906,12 @@ err2:
/* Sort records recursive using one index */
-static int sort_one_index(MI_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo,
+static int sort_one_index(HA_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo,
my_off_t pagepos, File new_file)
{
uint length,nod_flag,used_length, key_length;
uchar *buff,*keypos,*endpos;
- uchar key[MI_MAX_POSSIBLE_KEY_BUFF];
+ uchar key[HA_MAX_POSSIBLE_KEY_BUFF];
my_off_t new_page_pos,next_page;
char llbuff[22];
DBUG_ENTER("sort_one_index");
@@ -2023,7 +2026,7 @@ int change_to_newfile(const char * filename, const char * old_ext,
/* Locks a whole file */
/* Gives an error-message if file can't be locked */
-int lock_file(MI_CHECK *param, File file, my_off_t start, int lock_type,
+int lock_file(HA_CHECK *param, File file, my_off_t start, int lock_type,
const char *filetype, const char *filename)
{
if (my_lock(file,lock_type,start,F_TO_EOF,
@@ -2040,7 +2043,7 @@ int lock_file(MI_CHECK *param, File file, my_off_t start, int lock_type,
/* Copy a block between two files */
-int filecopy(MI_CHECK *param, File to,File from,my_off_t start,
+int filecopy(HA_CHECK *param, File to,File from,my_off_t start,
my_off_t length, const char *type)
{
char tmp_buff[IO_SIZE],*buff;
@@ -2091,7 +2094,7 @@ err:
<>0 Error
*/
-int mi_repair_by_sort(MI_CHECK *param, register MI_INFO *info,
+int mi_repair_by_sort(HA_CHECK *param, register MI_INFO *info,
const char * name, int rep_quick)
{
int got_error;
@@ -2105,7 +2108,7 @@ int mi_repair_by_sort(MI_CHECK *param, register MI_INFO *info,
HA_KEYSEG *keyseg;
ulong *rec_per_key_part;
char llbuff[22];
- SORT_INFO sort_info;
+ MI_SORT_INFO sort_info;
ulonglong key_map=share->state.key_map;
DBUG_ENTER("mi_repair_by_sort");
@@ -2511,7 +2514,7 @@ err:
<>0 Error
*/
-int mi_repair_parallel(MI_CHECK *param, register MI_INFO *info,
+int mi_repair_parallel(HA_CHECK *param, register MI_INFO *info,
const char * name, int rep_quick)
{
#ifndef THREAD
@@ -2530,7 +2533,7 @@ int mi_repair_parallel(MI_CHECK *param, register MI_INFO *info,
char llbuff[22];
IO_CACHE new_data_cache; /* For non-quick repair. */
IO_CACHE_SHARE io_share;
- SORT_INFO sort_info;
+ MI_SORT_INFO sort_info;
ulonglong key_map=share->state.key_map;
pthread_attr_t thr_attr;
DBUG_ENTER("mi_repair_parallel");
@@ -3009,7 +3012,7 @@ err:
static int sort_key_read(MI_SORT_PARAM *sort_param, void *key)
{
int error;
- SORT_INFO *sort_info=sort_param->sort_info;
+ MI_SORT_INFO *sort_info=sort_param->sort_info;
MI_INFO *info=sort_info->info;
DBUG_ENTER("sort_key_read");
@@ -3036,7 +3039,7 @@ static int sort_key_read(MI_SORT_PARAM *sort_param, void *key)
static int sort_ft_key_read(MI_SORT_PARAM *sort_param, void *key)
{
int error;
- SORT_INFO *sort_info=sort_param->sort_info;
+ MI_SORT_INFO *sort_info=sort_param->sort_info;
MI_INFO *info=sort_info->info;
FT_WORD *wptr=0;
DBUG_ENTER("sort_ft_key_read");
@@ -3123,8 +3126,8 @@ static int sort_get_next_record(MI_SORT_PARAM *sort_param)
my_off_t pos;
uchar *to;
MI_BLOCK_INFO block_info;
- SORT_INFO *sort_info=sort_param->sort_info;
- MI_CHECK *param=sort_info->param;
+ MI_SORT_INFO *sort_info=sort_param->sort_info;
+ HA_CHECK *param=sort_info->param;
MI_INFO *info=sort_info->info;
MYISAM_SHARE *share=info->s;
char llbuff[22],llbuff2[22];
@@ -3134,6 +3137,9 @@ static int sort_get_next_record(MI_SORT_PARAM *sort_param)
DBUG_RETURN(1);
switch (share->data_file_type) {
+ case BLOCK_RECORD:
+ DBUG_ASSERT(0); /* Impossible */
+ break;
case STATIC_RECORD:
for (;;)
{
@@ -3549,8 +3555,8 @@ int sort_write_record(MI_SORT_PARAM *sort_param)
ulong block_length,reclength;
uchar *from;
uchar block_buff[8];
- SORT_INFO *sort_info=sort_param->sort_info;
- MI_CHECK *param=sort_info->param;
+ MI_SORT_INFO *sort_info=sort_param->sort_info;
+ HA_CHECK *param=sort_info->param;
MI_INFO *info=sort_info->info;
MYISAM_SHARE *share=info->s;
DBUG_ENTER("sort_write_record");
@@ -3558,6 +3564,9 @@ int sort_write_record(MI_SORT_PARAM *sort_param)
if (sort_param->fix_datafile)
{
switch (sort_info->new_data_file_type) {
+ case BLOCK_RECORD:
+ DBUG_ASSERT(0); /* Impossible */
+ break;
case STATIC_RECORD:
if (my_b_write(&info->rec_cache,sort_param->record,
share->base.pack_reclength))
@@ -3576,7 +3585,7 @@ int sort_write_record(MI_SORT_PARAM *sort_param)
{
/* must be sure that local buffer is big enough */
reclength=info->s->base.pack_reclength+
- _my_calc_total_blob_length(info,sort_param->record)+
+ _mi_calc_total_blob_length(info,sort_param->record)+
ALIGN_SIZE(MI_MAX_DYN_BLOCK_HEADER)+MI_SPLIT_LENGTH+
MI_DYN_DELETE_BLOCK_HEADER;
if (sort_info->buff_length < reclength)
@@ -3665,24 +3674,25 @@ static int sort_key_write(MI_SORT_PARAM *sort_param, const void *a)
{
uint diff_pos[2];
char llbuff[22],llbuff2[22];
- SORT_INFO *sort_info=sort_param->sort_info;
- MI_CHECK *param= sort_info->param;
+ MI_SORT_INFO *sort_info=sort_param->sort_info;
+ HA_CHECK *param= sort_info->param;
int cmp;
if (sort_info->key_block->inited)
{
- cmp=ha_key_cmp(sort_param->seg,sort_info->key_block->lastkey,
+ cmp=ha_key_cmp(sort_param->seg, (uchar*) sort_info->key_block->lastkey,
(uchar*) a, USE_WHOLE_KEY,SEARCH_FIND | SEARCH_UPDATE,
diff_pos);
if (param->stats_method == MI_STATS_METHOD_NULLS_NOT_EQUAL)
- ha_key_cmp(sort_param->seg,sort_info->key_block->lastkey,
+ ha_key_cmp(sort_param->seg, (uchar*) sort_info->key_block->lastkey,
(uchar*) a, USE_WHOLE_KEY,
SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL, diff_pos);
else if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS)
{
diff_pos[0]= mi_collect_stats_nonulls_next(sort_param->seg,
sort_param->notnull,
- sort_info->key_block->lastkey,
+ (uchar*) sort_info->
+ key_block->lastkey,
(uchar*)a);
}
sort_param->unique[diff_pos[0]-1]++;
@@ -3705,8 +3715,8 @@ static int sort_key_write(MI_SORT_PARAM *sort_param, const void *a)
llstr(sort_info->info->lastpos,llbuff),
llstr(get_record_for_key(sort_info->info,
sort_param->keyinfo,
- sort_info->key_block->
- lastkey),
+ (uchar*) sort_info->
+ key_block->lastkey),
llbuff2));
param->testflag|=T_RETRY_WITHOUT_QUICK;
if (sort_info->param->testflag & T_VERBOSE)
@@ -3727,7 +3737,7 @@ static int sort_key_write(MI_SORT_PARAM *sort_param, const void *a)
int sort_ft_buf_flush(MI_SORT_PARAM *sort_param)
{
- SORT_INFO *sort_info=sort_param->sort_info;
+ MI_SORT_INFO *sort_info=sort_param->sort_info;
SORT_KEY_BLOCKS *key_block=sort_info->key_block;
MYISAM_SHARE *share=sort_info->info->s;
uint val_off, val_len;
@@ -3737,19 +3747,19 @@ int sort_ft_buf_flush(MI_SORT_PARAM *sort_param)
val_len=share->ft2_keyinfo.keylength;
get_key_full_length_rdonly(val_off, ft_buf->lastkey);
- to=ft_buf->lastkey+val_off;
+ to= (uchar*) ft_buf->lastkey+val_off;
if (ft_buf->buf)
{
/* flushing first-level tree */
- error=sort_insert_key(sort_param,key_block,ft_buf->lastkey,
+ error=sort_insert_key(sort_param,key_block, (uchar*) ft_buf->lastkey,
HA_OFFSET_ERROR);
for (from=to+val_len;
- !error && from < ft_buf->buf;
+ !error && from < (uchar*) ft_buf->buf;
from+= val_len)
{
memcpy(to, from, val_len);
- error=sort_insert_key(sort_param,key_block,ft_buf->lastkey,
+ error=sort_insert_key(sort_param,key_block, (uchar*) ft_buf->lastkey,
HA_OFFSET_ERROR);
}
return error;
@@ -3758,8 +3768,8 @@ int sort_ft_buf_flush(MI_SORT_PARAM *sort_param)
error=flush_pending_blocks(sort_param);
/* updating lastkey with second-level tree info */
ft_intXstore(ft_buf->lastkey+val_off, -ft_buf->count);
- _mi_dpointer(sort_info->info, ft_buf->lastkey+val_off+HA_FT_WLEN,
- share->state.key_root[sort_param->key]);
+ _mi_dpointer(sort_info->info, (uchar*) ft_buf->lastkey+val_off+HA_FT_WLEN,
+ share->state.key_root[sort_param->key]);
/* restoring first level tree data in sort_info/sort_param */
sort_info->key_block=sort_info->key_block_end- sort_info->param->sort_key_blocks;
sort_param->keyinfo=share->keyinfo+sort_param->key;
@@ -3767,14 +3777,14 @@ int sort_ft_buf_flush(MI_SORT_PARAM *sort_param)
/* writing lastkey in first-level tree */
return error ? error :
sort_insert_key(sort_param,sort_info->key_block,
- ft_buf->lastkey,HA_OFFSET_ERROR);
+ (uchar*) ft_buf->lastkey,HA_OFFSET_ERROR);
}
static int sort_ft_key_write(MI_SORT_PARAM *sort_param, const void *a)
{
uint a_len, val_off, val_len, error;
uchar *p;
- SORT_INFO *sort_info=sort_param->sort_info;
+ MI_SORT_INFO *sort_info=sort_param->sort_info;
SORT_FT_BUF *ft_buf=sort_info->ft_buf;
SORT_KEY_BLOCKS *key_block=sort_info->key_block;
@@ -3804,9 +3814,9 @@ static int sort_ft_key_write(MI_SORT_PARAM *sort_param, const void *a)
}
get_key_full_length_rdonly(val_off, ft_buf->lastkey);
- if (mi_compare_text(sort_param->seg->charset,
+ if (ha_compare_text(sort_param->seg->charset,
((uchar *)a)+1,a_len-1,
- ft_buf->lastkey+1,val_off-1, 0, 0)==0)
+ (uchar*) ft_buf->lastkey+1,val_off-1, 0, 0)==0)
{
if (!ft_buf->buf) /* store in second-level tree */
{
@@ -3822,16 +3832,16 @@ static int sort_ft_key_write(MI_SORT_PARAM *sort_param, const void *a)
return 0;
/* converting to two-level tree */
- p=ft_buf->lastkey+val_off;
+ p= (uchar*) ft_buf->lastkey+val_off;
while (key_block->inited)
key_block++;
sort_info->key_block=key_block;
sort_param->keyinfo=& sort_info->info->s->ft2_keyinfo;
- ft_buf->count=(ft_buf->buf - p)/val_len;
+ ft_buf->count=((uchar*) ft_buf->buf - p)/val_len;
/* flushing buffer to second-level tree */
- for (error=0; !error && p < ft_buf->buf; p+= val_len)
+ for (error=0; !error && p < (uchar*) ft_buf->buf; p+= val_len)
error=sort_insert_key(sort_param,key_block,p,HA_OFFSET_ERROR);
ft_buf->buf=0;
return error;
@@ -3879,13 +3889,13 @@ static int sort_insert_key(MI_SORT_PARAM *sort_param,
MI_KEY_PARAM s_temp;
MI_INFO *info;
MI_KEYDEF *keyinfo=sort_param->keyinfo;
- SORT_INFO *sort_info= sort_param->sort_info;
- MI_CHECK *param=sort_info->param;
+ MI_SORT_INFO *sort_info= sort_param->sort_info;
+ HA_CHECK *param=sort_info->param;
DBUG_ENTER("sort_insert_key");
- anc_buff=key_block->buff;
+ anc_buff= (uchar*) key_block->buff;
info=sort_info->info;
- lastkey=key_block->lastkey;
+ lastkey= (uchar*) key_block->lastkey;
nod_flag= (key_block == sort_info->key_block ? 0 :
info->s->base.key_reflength);
@@ -3898,7 +3908,7 @@ static int sort_insert_key(MI_SORT_PARAM *sort_param,
DBUG_RETURN(1);
}
a_length=2+nod_flag;
- key_block->end_pos=anc_buff+2;
+ key_block->end_pos= (char*) anc_buff+2;
lastkey=0; /* No previous key in block */
}
else
@@ -3906,18 +3916,18 @@ static int sort_insert_key(MI_SORT_PARAM *sort_param,
/* Save pointer to previous block */
if (nod_flag)
- _mi_kpointer(info,key_block->end_pos,prev_block);
+ _mi_kpointer(info,(uchar*) key_block->end_pos,prev_block);
t_length=(*keyinfo->pack_key)(keyinfo,nod_flag,
(uchar*) 0,lastkey,lastkey,key,
&s_temp);
- (*keyinfo->store_key)(keyinfo, key_block->end_pos+nod_flag,&s_temp);
+ (*keyinfo->store_key)(keyinfo, (uchar*) key_block->end_pos+nod_flag,&s_temp);
a_length+=t_length;
mi_putint(anc_buff,a_length,nod_flag);
key_block->end_pos+=t_length;
if (a_length <= keyinfo->block_length)
{
- VOID(_mi_move_key(keyinfo,key_block->lastkey,key));
+ VOID(_mi_move_key(keyinfo,(uchar*) key_block->lastkey,key));
key_block->last_length=a_length-t_length;
DBUG_RETURN(0);
}
@@ -3942,7 +3952,8 @@ static int sort_insert_key(MI_SORT_PARAM *sort_param,
DBUG_DUMP("buff",(uchar*) anc_buff,mi_getint(anc_buff));
/* Write separator-key to block in next level */
- if (sort_insert_key(sort_param,key_block+1,key_block->lastkey,filepos))
+ if (sort_insert_key(sort_param,key_block+1,(uchar*) key_block->lastkey,
+ filepos))
DBUG_RETURN(1);
/* clear old block and write new key in it */
@@ -3958,8 +3969,8 @@ static int sort_delete_record(MI_SORT_PARAM *sort_param)
uint i;
int old_file,error;
uchar *key;
- SORT_INFO *sort_info=sort_param->sort_info;
- MI_CHECK *param=sort_info->param;
+ MI_SORT_INFO *sort_info=sort_param->sort_info;
+ HA_CHECK *param=sort_info->param;
MI_INFO *info=sort_info->info;
DBUG_ENTER("sort_delete_record");
@@ -4015,7 +4026,7 @@ int flush_pending_blocks(MI_SORT_PARAM *sort_param)
uint nod_flag,length;
my_off_t filepos,key_file_length;
SORT_KEY_BLOCKS *key_block;
- SORT_INFO *sort_info= sort_param->sort_info;
+ MI_SORT_INFO *sort_info= sort_param->sort_info;
myf myf_rw=sort_info->param->myf_rw;
MI_INFO *info=sort_info->info;
MI_KEYDEF *keyinfo=sort_param->keyinfo;
@@ -4028,7 +4039,7 @@ int flush_pending_blocks(MI_SORT_PARAM *sort_param)
key_block->inited=0;
length=mi_getint(key_block->buff);
if (nod_flag)
- _mi_kpointer(info,key_block->end_pos,filepos);
+ _mi_kpointer(info,(uchar*) key_block->end_pos,filepos);
key_file_length=info->state->key_file_length;
bzero((uchar*) key_block->buff+length, keyinfo->block_length-length);
if ((filepos=_mi_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR)
@@ -4038,7 +4049,7 @@ int flush_pending_blocks(MI_SORT_PARAM *sort_param)
if (key_file_length == info->state->key_file_length)
{
if (_mi_write_keypage(info, keyinfo, filepos,
- DFLT_INIT_HITS, key_block->buff))
+ DFLT_INIT_HITS, (uchar*) key_block->buff))
DBUG_RETURN(1);
}
else if (my_pwrite(info->s->kfile,(uchar*) key_block->buff,
@@ -4053,7 +4064,7 @@ int flush_pending_blocks(MI_SORT_PARAM *sort_param)
/* alloc space and pointers for key_blocks */
-static SORT_KEY_BLOCKS *alloc_key_blocks(MI_CHECK *param, uint blocks,
+static SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks,
uint buffer_length)
{
reg1 uint i;
@@ -4090,7 +4101,7 @@ int test_if_almost_full(MI_INFO *info)
/* Recreate table with bigger more alloced record-data */
-int recreate_table(MI_CHECK *param, MI_INFO **org_info, char *filename)
+int recreate_table(HA_CHECK *param, MI_INFO **org_info, char *filename)
{
int error;
MI_INFO info;
@@ -4263,7 +4274,7 @@ end:
/* write suffix to data file if neaded */
-int write_data_suffix(SORT_INFO *sort_info, my_bool fix_datafile)
+int write_data_suffix(MI_SORT_INFO *sort_info, my_bool fix_datafile)
{
MI_INFO *info=sort_info->info;
@@ -4284,7 +4295,7 @@ int write_data_suffix(SORT_INFO *sort_info, my_bool fix_datafile)
/* Update state and myisamchk_time of indexfile */
-int update_state_info(MI_CHECK *param, MI_INFO *info,uint update)
+int update_state_info(HA_CHECK *param, MI_INFO *info,uint update)
{
MYISAM_SHARE *share=info->s;
@@ -4356,7 +4367,7 @@ err:
param->auto_increment is bigger than the biggest key.
*/
-void update_auto_increment_key(MI_CHECK *param, MI_INFO *info,
+void update_auto_increment_key(HA_CHECK *param, MI_INFO *info,
my_bool repair_only)
{
uchar *record;
@@ -4589,7 +4600,7 @@ my_bool mi_test_if_sort_rep(MI_INFO *info, ha_rows rows,
static void
-set_data_file_type(SORT_INFO *sort_info, MYISAM_SHARE *share)
+set_data_file_type(MI_SORT_INFO *sort_info, MYISAM_SHARE *share)
{
if ((sort_info->new_data_file_type=share->data_file_type) ==
COMPRESSED_RECORD && sort_info->param->testflag & T_UNPACK)
diff --git a/storage/myisam/mi_checksum.c b/storage/myisam/mi_checksum.c
index 4e87de373bd..1aa56e571e3 100644
--- a/storage/myisam/mi_checksum.c
+++ b/storage/myisam/mi_checksum.c
@@ -31,9 +31,9 @@ ha_checksum mi_checksum(MI_INFO *info, const uchar *buf)
case FIELD_BLOB:
{
length=_mi_calc_blob_length(rec->length-
- mi_portable_sizeof_char_ptr,
+ portable_sizeof_char_ptr,
buf);
- memcpy((char*) &pos, buf+rec->length- mi_portable_sizeof_char_ptr,
+ memcpy((char*) &pos, buf+rec->length- portable_sizeof_char_ptr,
sizeof(char*));
break;
}
diff --git a/storage/myisam/mi_close.c b/storage/myisam/mi_close.c
index 07105aea88d..747555dbdfb 100644
--- a/storage/myisam/mi_close.c
+++ b/storage/myisam/mi_close.c
@@ -75,6 +75,7 @@ int mi_close(register MI_INFO *info)
not change the crashed state.
We can NOT write the state in other cases as other threads
may be using the file at this point
+ IF using --external-locking.
*/
if (share->mode != O_RDONLY && mi_is_crashed(info))
mi_state_info_write(share->kfile, &share->state, 1);
diff --git a/storage/myisam/mi_create.c b/storage/myisam/mi_create.c
index 0cac5f08b3b..fc5b31e7689 100644
--- a/storage/myisam/mi_create.c
+++ b/storage/myisam/mi_create.c
@@ -17,6 +17,7 @@
#include "ftdefs.h"
#include "sp_defs.h"
+#include <my_bit.h>
#if defined(MSDOS) || defined(__WIN__)
#ifdef __WIN__
@@ -40,11 +41,11 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
File dfile,file;
int errpos,save_errno, create_mode= O_RDWR | O_TRUNC;
myf create_flag;
- uint fields,length,max_key_length,packed,pointer,real_length_diff,
+ uint fields,length,max_key_length,packed,pack_bytes,pointer,real_length_diff,
key_length,info_length,key_segs,options,min_key_length_skip,
base_pos,long_varchar_count,varchar_length,
max_key_block_length,unique_key_parts,fulltext_keys,offset;
- uint aligned_key_start, block_length;
+ uint aligned_key_start, block_length, res;
ulong reclength, real_reclength,min_pack_length;
char filename[FN_REFLEN],linkname[FN_REFLEN], *linkname_ptr;
ulong pack_reclength;
@@ -56,7 +57,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
HA_KEYSEG *keyseg,tmp_keyseg;
MI_COLUMNDEF *rec;
ulong *rec_per_key_part;
- my_off_t key_root[MI_MAX_POSSIBLE_KEY],key_del[MI_MAX_KEY_BLOCK_SIZE];
+ my_off_t key_root[HA_MAX_POSSIBLE_KEY],key_del[MI_MAX_KEY_BLOCK_SIZE];
MI_CREATE_INFO tmp_create_info;
DBUG_ENTER("mi_create");
DBUG_PRINT("enter", ("keys: %u columns: %u uniques: %u flags: %u",
@@ -94,7 +95,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
ci->reloc_rows=ci->max_rows; /* Check if wrong parameter */
if (!(rec_per_key_part=
- (ulong*) my_malloc((keys + uniques)*MI_MAX_KEY_SEG*sizeof(long),
+ (ulong*) my_malloc((keys + uniques)*HA_MAX_KEY_SEG*sizeof(long),
MYF(MY_WME | MY_ZEROFILL))))
DBUG_RETURN(my_errno);
@@ -116,10 +117,10 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
share.base.blobs++;
if (pack_reclength != INT_MAX32)
{
- if (rec->length == 4+mi_portable_sizeof_char_ptr)
+ if (rec->length == 4+portable_sizeof_char_ptr)
pack_reclength= INT_MAX32;
else
- pack_reclength+=(1 << ((rec->length-mi_portable_sizeof_char_ptr)*8)); /* Max blob length */
+ pack_reclength+=(1 << ((rec->length-portable_sizeof_char_ptr)*8)); /* Max blob length */
}
}
else if (type == FIELD_SKIP_PRESPACE ||
@@ -192,11 +193,11 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
if (flags & HA_CREATE_RELIES_ON_SQL_LAYER)
options|= HA_OPTION_RELIES_ON_SQL_LAYER;
- packed=(packed+7)/8;
+ pack_bytes= (packed+7)/8;
if (pack_reclength != INT_MAX32)
pack_reclength+= reclength+packed +
test(test_all_bits(options, HA_OPTION_CHECKSUM | HA_PACK_RECORD));
- min_pack_length+=packed;
+ min_pack_length+= pack_bytes;
if (!ci->data_file_length && ci->max_rows)
{
@@ -273,7 +274,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
keyseg->type != HA_KEYTYPE_VARBINARY2)
{
my_errno=HA_WRONG_CREATE_OPTION;
- goto err;
+ goto err_no_lock;
}
}
keydef->keysegs+=sp_segs;
@@ -282,7 +283,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
min_key_length_skip+=SPLEN*2*SPDIMS;
#else
my_errno= HA_ERR_UNSUPPORTED;
- goto err;
+ goto err_no_lock;
#endif /*HAVE_SPATIAL*/
}
else if (keydef->flag & HA_FULLTEXT)
@@ -298,7 +299,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
keyseg->type != HA_KEYTYPE_VARTEXT2)
{
my_errno=HA_WRONG_CREATE_OPTION;
- goto err;
+ goto err_no_lock;
}
if (!(keyseg->flag & HA_BLOB_PART) &&
(keyseg->type == HA_KEYTYPE_VARTEXT1 ||
@@ -420,10 +421,10 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
}
} /* if HA_FULLTEXT */
key_segs+=keydef->keysegs;
- if (keydef->keysegs > MI_MAX_KEY_SEG)
+ if (keydef->keysegs > HA_MAX_KEY_SEG)
{
my_errno=HA_WRONG_CREATE_OPTION;
- goto err;
+ goto err_no_lock;
}
/*
key_segs may be 0 in the case when we only want to be able to
@@ -435,7 +436,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
share.state.rec_per_key_part[key_segs-1]=1L;
length+=key_length;
/* Get block length for key, if defined by user */
- block_length= (keydef->block_length ?
+ block_length= (keydef->block_length ?
my_round_up_to_next_power(keydef->block_length) :
myisam_block_size);
block_length= max(block_length, MI_MIN_KEY_BLOCK_LENGTH);
@@ -445,10 +446,10 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
pointer,MI_MAX_KEYPTR_SIZE,
block_length);
if (keydef->block_length > MI_MAX_KEY_BLOCK_LENGTH ||
- length >= MI_MAX_KEY_BUFF)
+ length >= HA_MAX_KEY_BUFF)
{
my_errno=HA_WRONG_CREATE_OPTION;
- goto err;
+ goto err_no_lock;
}
set_if_bigger(max_key_block_length,keydef->block_length);
keydef->keylength= (uint16) key_length;
@@ -495,7 +496,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
"indexes and/or unique constraints.",
MYF(0), name + dirname_length(name));
my_errno= HA_WRONG_CREATE_OPTION;
- goto err;
+ goto err_no_lock;
}
bmove(share.state.header.file_version,(uchar*) myisam_file_magic,4);
@@ -550,9 +551,9 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
share.base.pack_reclength=reclength+ test(options & HA_OPTION_CHECKSUM);
share.base.max_pack_length=pack_reclength;
share.base.min_pack_length=min_pack_length;
- share.base.pack_bits=packed;
+ share.base.pack_bits= pack_bytes;
share.base.fields=fields;
- share.base.pack_fields=packed;
+ share.base.pack_fields= packed;
#ifdef USE_RAID
share.base.raid_type=ci->raid_type;
share.base.raid_chunks=ci->raid_chunks;
@@ -826,13 +827,16 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
}
errpos=0;
pthread_mutex_unlock(&THR_LOCK_myisam);
+ res= 0;
if (my_close(file,MYF(0)))
- goto err;
+ res= my_errno;
my_free((char*) rec_per_key_part,MYF(0));
- DBUG_RETURN(0);
+ DBUG_RETURN(res);
err:
pthread_mutex_unlock(&THR_LOCK_myisam);
+err_no_lock:
+
save_errno=my_errno;
switch (errpos) {
case 3:
diff --git a/storage/myisam/mi_dbug.c b/storage/myisam/mi_dbug.c
index 07c314c43e6..0808a7e85dd 100644
--- a/storage/myisam/mi_dbug.c
+++ b/storage/myisam/mi_dbug.c
@@ -45,6 +45,7 @@ void _mi_print_key(FILE *stream, register HA_KEYSEG *keyseg,
fprintf(stream,"NULL");
continue;
}
+ end++;
}
switch (keyseg->type) {
diff --git a/storage/myisam/mi_delete.c b/storage/myisam/mi_delete.c
index 6fe31f30c19..88b31d616de 100644
--- a/storage/myisam/mi_delete.c
+++ b/storage/myisam/mi_delete.c
@@ -159,7 +159,7 @@ static int _mi_ck_real_delete(register MI_INFO *info, MI_KEYDEF *keyinfo,
DBUG_RETURN(my_errno=HA_ERR_CRASHED);
}
if (!(root_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
- MI_MAX_KEY_BUFF*2)))
+ HA_MAX_KEY_BUFF*2)))
{
DBUG_PRINT("error",("Couldn't allocate memory"));
DBUG_RETURN(my_errno=ENOMEM);
@@ -221,7 +221,7 @@ static int d_search(register MI_INFO *info, register MI_KEYDEF *keyinfo,
my_bool last_key;
uchar *leaf_buff,*keypos;
my_off_t leaf_page,next_block;
- uchar lastkey[MI_MAX_KEY_BUFF];
+ uchar lastkey[HA_MAX_KEY_BUFF];
DBUG_ENTER("d_search");
DBUG_DUMP("page",(uchar*) anc_buff,mi_getint(anc_buff));
@@ -306,7 +306,7 @@ static int d_search(register MI_INFO *info, register MI_KEYDEF *keyinfo,
{
leaf_page=_mi_kpos(nod_flag,keypos);
if (!(leaf_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
- MI_MAX_KEY_BUFF*2)))
+ HA_MAX_KEY_BUFF*2)))
{
DBUG_PRINT("error",("Couldn't allocate memory"));
my_errno=ENOMEM;
@@ -365,9 +365,7 @@ static int d_search(register MI_INFO *info, register MI_KEYDEF *keyinfo,
{ /* This happens only with packed keys */
DBUG_PRINT("test",("Enlarging of key when deleting"));
if (!_mi_get_last_key(info,keyinfo,anc_buff,lastkey,keypos,&length))
- {
goto err;
- }
ret_value=_mi_insert(info,keyinfo,key,anc_buff,keypos,lastkey,
(uchar*) 0,(uchar*) 0,(my_off_t) 0,(my_bool) 0);
}
@@ -405,7 +403,7 @@ static int del(register MI_INFO *info, register MI_KEYDEF *keyinfo, uchar *key,
int ret_value,length;
uint a_length,nod_flag,tmp;
my_off_t next_page;
- uchar keybuff[MI_MAX_KEY_BUFF],*endpos,*next_buff,*key_start, *prev_key;
+ uchar keybuff[HA_MAX_KEY_BUFF],*endpos,*next_buff,*key_start, *prev_key;
MYISAM_SHARE *share=info->s;
MI_KEY_PARAM s_temp;
DBUG_ENTER("del");
@@ -422,7 +420,7 @@ static int del(register MI_INFO *info, register MI_KEYDEF *keyinfo, uchar *key,
{
next_page= _mi_kpos(nod_flag,endpos);
if (!(next_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
- MI_MAX_KEY_BUFF*2)))
+ HA_MAX_KEY_BUFF*2)))
DBUG_RETURN(-1);
if (!_mi_fetch_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,next_buff,0))
ret_value= -1;
@@ -509,7 +507,7 @@ static int underflow(register MI_INFO *info, register MI_KEYDEF *keyinfo,
uint length,anc_length,buff_length,leaf_length,p_length,s_length,nod_flag,
key_reflength,key_length;
my_off_t next_page;
- uchar anc_key[MI_MAX_KEY_BUFF],leaf_key[MI_MAX_KEY_BUFF],
+ uchar anc_key[HA_MAX_KEY_BUFF],leaf_key[HA_MAX_KEY_BUFF],
*buff,*endpos,*next_keypos,*anc_pos,*half_pos,*temp_pos,*prev_key,
*after_key;
MI_KEY_PARAM s_temp;
diff --git a/storage/myisam/mi_dynrec.c b/storage/myisam/mi_dynrec.c
index cdd70abe9ad..2a12fd04641 100644
--- a/storage/myisam/mi_dynrec.c
+++ b/storage/myisam/mi_dynrec.c
@@ -252,7 +252,7 @@ int _mi_write_blob_record(MI_INFO *info, const uchar *record)
extra= (ALIGN_SIZE(MI_MAX_DYN_BLOCK_HEADER)+MI_SPLIT_LENGTH+
MI_DYN_DELETE_BLOCK_HEADER+1);
reclength= (info->s->base.pack_reclength +
- _my_calc_total_blob_length(info,record)+ extra);
+ _mi_calc_total_blob_length(info,record)+ extra);
#ifdef NOT_USED /* We now support big rows */
if (reclength > MI_DYN_MAX_ROW_LENGTH)
{
@@ -286,7 +286,7 @@ int _mi_update_blob_record(MI_INFO *info, my_off_t pos, const uchar *record)
extra= (ALIGN_SIZE(MI_MAX_DYN_BLOCK_HEADER)+MI_SPLIT_LENGTH+
MI_DYN_DELETE_BLOCK_HEADER);
reclength= (info->s->base.pack_reclength+
- _my_calc_total_blob_length(info,record)+ extra);
+ _mi_calc_total_blob_length(info,record)+ extra);
#ifdef NOT_USED /* We now support big rows */
if (reclength > MI_DYN_MAX_ROW_LENGTH)
{
@@ -901,7 +901,7 @@ uint _mi_rec_pack(MI_INFO *info, register uchar *to,
else
{
char *temp_pos;
- size_t tmp_length=length-mi_portable_sizeof_char_ptr;
+ size_t tmp_length=length-portable_sizeof_char_ptr;
memcpy((uchar*) to,from,tmp_length);
memcpy_fixed(&temp_pos,from+tmp_length,sizeof(char*));
memcpy(to+tmp_length,temp_pos,(size_t) blob->length);
@@ -1022,11 +1022,11 @@ my_bool _mi_rec_check(MI_INFO *info,const uchar *record, uchar *rec_buff,
if (type == FIELD_BLOB)
{
uint blob_length=
- _mi_calc_blob_length(length-mi_portable_sizeof_char_ptr,record);
+ _mi_calc_blob_length(length-portable_sizeof_char_ptr,record);
if (!blob_length && !(flag & bit))
goto err;
if (blob_length)
- to+=length - mi_portable_sizeof_char_ptr+ blob_length;
+ to+=length - portable_sizeof_char_ptr+ blob_length;
}
else if (type == FIELD_SKIP_ZERO)
{
@@ -1209,7 +1209,7 @@ ulong _mi_rec_unpack(register MI_INFO *info, register uchar *to, uchar *from,
}
else if (type == FIELD_BLOB)
{
- uint size_length=rec_length- mi_portable_sizeof_char_ptr;
+ uint size_length=rec_length- portable_sizeof_char_ptr;
ulong blob_length=_mi_calc_blob_length(size_length,from);
ulong from_left= (ulong) (from_end - from);
if (from_left < size_length ||
@@ -1259,7 +1259,7 @@ err:
/* Calc length of blob. Update info in blobs->length */
-ulong _my_calc_total_blob_length(MI_INFO *info, const uchar *record)
+ulong _mi_calc_total_blob_length(MI_INFO *info, const uchar *record)
{
ulong length;
MI_BLOB *blob,*end;
@@ -1293,7 +1293,7 @@ ulong _mi_calc_blob_length(uint length, const uchar *pos)
}
-void _my_store_blob_length(uchar *pos,uint pack_length,uint length)
+void _mi_store_blob_length(uchar *pos,uint pack_length,uint length)
{
switch (pack_length) {
case 1:
@@ -1506,7 +1506,7 @@ int _mi_cmp_dynamic_record(register MI_INFO *info, register const uchar *record)
if (info->s->base.blobs)
{
if (!(buffer=(uchar*) my_alloca(info->s->base.pack_reclength+
- _my_calc_total_blob_length(info,record))))
+ _mi_calc_total_blob_length(info,record))))
DBUG_RETURN(-1);
}
reclength=_mi_rec_pack(info,buffer,record);
diff --git a/storage/myisam/mi_extra.c b/storage/myisam/mi_extra.c
index 1b4c79d13de..33c9d1210ca 100644
--- a/storage/myisam/mi_extra.c
+++ b/storage/myisam/mi_extra.c
@@ -256,15 +256,16 @@ int mi_extra(MI_INFO *info, enum ha_extra_function function, void *extra_arg)
share->last_version= 0L; /* Impossible version */
pthread_mutex_unlock(&THR_LOCK_myisam);
break;
- case HA_EXTRA_PREPARE_FOR_DELETE:
+ case HA_EXTRA_PREPARE_FOR_RENAME:
+ case HA_EXTRA_PREPARE_FOR_DROP:
pthread_mutex_lock(&THR_LOCK_myisam);
share->last_version= 0L; /* Impossible version */
#ifdef __WIN__REMOVE_OBSOLETE_WORKAROUND
/* Close the isam and data files as Win32 can't drop an open table */
pthread_mutex_lock(&share->intern_lock);
if (flush_key_blocks(share->key_cache, share->kfile,
- (function == HA_EXTRA_FORCE_REOPEN ?
- FLUSH_RELEASE : FLUSH_IGNORE_CHANGED)))
+ (function == HA_EXTRA_PREPARE_FOR_DROP ?
+ FLUSH_IGNORE_CHANGED : FLUSH_RELEASE)))
{
error=my_errno;
share->changed=1;
diff --git a/storage/myisam/mi_key.c b/storage/myisam/mi_key.c
index 3f445ebf44d..94f3f34ec58 100644
--- a/storage/myisam/mi_key.c
+++ b/storage/myisam/mi_key.c
@@ -426,7 +426,7 @@ static int _mi_put_key_in_record(register MI_INFO *info, uint keynr,
/* The above changed info->lastkey2. Inform mi_rnext_same(). */
info->update&= ~HA_STATE_RNEXT_SAME;
- _my_store_blob_length(record+keyseg->start,
+ _mi_store_blob_length(record+keyseg->start,
(uint) keyseg->bit_start,length);
key+=length;
}
diff --git a/storage/myisam/mi_locking.c b/storage/myisam/mi_locking.c
index ec359d13a14..2d89fce2a81 100644
--- a/storage/myisam/mi_locking.c
+++ b/storage/myisam/mi_locking.c
@@ -56,9 +56,15 @@ int mi_lock_database(MI_INFO *info, int lock_type)
case F_UNLCK:
ftparser_call_deinitializer(info);
if (info->lock_type == F_RDLCK)
+ {
count= --share->r_locks;
+ mi_restore_status(info);
+ }
else
+ {
count= --share->w_locks;
+ mi_update_status(info);
+ }
--share->tot_locks;
if (info->lock_type == F_WRLCK && !share->w_locks &&
!share->delay_key_write && flush_key_blocks(share->key_cache,
@@ -84,16 +90,16 @@ int mi_lock_database(MI_INFO *info, int lock_type)
if (share->changed && !share->w_locks)
{
#ifdef HAVE_MMAP
- if ((info->s->mmaped_length != info->s->state.state.data_file_length) &&
- (info->s->nonmmaped_inserts > MAX_NONMAPPED_INSERTS))
- {
- if (info->s->concurrent_insert)
- rw_wrlock(&info->s->mmap_lock);
- mi_remap_file(info, info->s->state.state.data_file_length);
- info->s->nonmmaped_inserts= 0;
- if (info->s->concurrent_insert)
- rw_unlock(&info->s->mmap_lock);
- }
+ if ((info->s->mmaped_length != info->s->state.state.data_file_length) &&
+ (info->s->nonmmaped_inserts > MAX_NONMAPPED_INSERTS))
+ {
+ if (info->s->concurrent_insert)
+ rw_wrlock(&info->s->mmap_lock);
+ mi_remap_file(info, info->s->state.state.data_file_length);
+ info->s->nonmmaped_inserts= 0;
+ if (info->s->concurrent_insert)
+ rw_unlock(&info->s->mmap_lock);
+ }
#endif
share->state.process= share->last_process=share->this_process;
share->state.unique= info->last_unique= info->this_unique;
@@ -300,6 +306,7 @@ void mi_get_status(void* param, int concurrent_insert)
void mi_update_status(void* param)
{
MI_INFO *info=(MI_INFO*) param;
+ DBUG_ENTER("mi_update_status");
/*
Because someone may have closed the table we point at, we only
update the state if its our own state. This isn't a problem as
@@ -336,20 +343,32 @@ void mi_update_status(void* param)
}
info->opt_flag&= ~WRITE_CACHE_USED;
}
+ DBUG_VOID_RETURN;
}
void mi_restore_status(void *param)
{
MI_INFO *info= (MI_INFO*) param;
+ DBUG_ENTER("mi_restore_status");
+ DBUG_PRINT("info",("key_file: %ld data_file: %ld",
+ (long) info->s->state.state.key_file_length,
+ (long) info->s->state.state.data_file_length));
info->state= &info->s->state.state;
info->append_insert_at_end= 0;
+ DBUG_VOID_RETURN;
}
void mi_copy_status(void* to,void *from)
{
- ((MI_INFO*) to)->state= &((MI_INFO*) from)->save_state;
+ MI_INFO *info= (MI_INFO*) to;
+ DBUG_ENTER("mi_copy_status");
+ info->state= &((MI_INFO*) from)->save_state;
+ DBUG_PRINT("info",("key_file: %ld data_file: %ld",
+ (long) info->state->key_file_length,
+ (long) info->state->data_file_length));
+ DBUG_VOID_RETURN;
}
@@ -377,17 +396,18 @@ void mi_copy_status(void* to,void *from)
my_bool mi_check_status(void *param)
{
MI_INFO *info=(MI_INFO*) param;
+ DBUG_ENTER("mi_check_status");
+ DBUG_PRINT("info",("dellink: %ld r_locks: %u w_locks: %u",
+ (long) info->s->state.dellink, (uint) info->s->r_locks,
+ (uint) info->s->w_locks));
/*
The test for w_locks == 1 is here because this thread has already done an
external lock (in other words: w_locks == 1 means no other threads has
a write lock)
*/
- DBUG_PRINT("info",("dellink: %ld r_locks: %u w_locks: %u",
- (long) info->s->state.dellink, (uint) info->s->r_locks,
- (uint) info->s->w_locks));
- return (my_bool) !(info->s->state.dellink == HA_OFFSET_ERROR ||
+ DBUG_RETURN((my_bool) !(info->s->state.dellink == HA_OFFSET_ERROR ||
(myisam_concurrent_insert == 2 && info->s->r_locks &&
- info->s->w_locks == 1));
+ info->s->w_locks == 1)));
}
diff --git a/storage/myisam/mi_log.c b/storage/myisam/mi_log.c
index 8b9ca038fec..982ba8b4367 100644
--- a/storage/myisam/mi_log.c
+++ b/storage/myisam/mi_log.c
@@ -133,7 +133,7 @@ void _myisam_log_record(enum myisam_log_commands command, MI_INFO *info,
if (!info->s->base.blobs)
length=info->s->base.reclength;
else
- length=info->s->base.reclength+ _my_calc_total_blob_length(info,record);
+ length=info->s->base.reclength+ _mi_calc_total_blob_length(info,record);
buff[0]=(uchar) command;
mi_int2store(buff+1,info->dfile);
mi_int4store(buff+3,pid);
diff --git a/storage/myisam/mi_open.c b/storage/myisam/mi_open.c
index b848c822f75..b0cc2e54ca7 100644
--- a/storage/myisam/mi_open.c
+++ b/storage/myisam/mi_open.c
@@ -82,8 +82,8 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
uchar *disk_cache, *disk_pos, *end_pos;
MI_INFO info,*m_info,*old_info;
MYISAM_SHARE share_buff,*share;
- ulong rec_per_key_part[MI_MAX_POSSIBLE_KEY*MI_MAX_KEY_SEG];
- my_off_t key_root[MI_MAX_POSSIBLE_KEY],key_del[MI_MAX_KEY_BLOCK_SIZE];
+ ulong rec_per_key_part[HA_MAX_POSSIBLE_KEY*HA_MAX_KEY_SEG];
+ my_off_t key_root[HA_MAX_POSSIBLE_KEY],key_del[MI_MAX_KEY_BLOCK_SIZE];
ulonglong max_key_file_length, max_data_file_length;
DBUG_ENTER("mi_open");
@@ -105,7 +105,8 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
share_buff.state.key_root=key_root;
share_buff.state.key_del=key_del;
share_buff.key_cache= multi_key_cache_search((uchar*) name_buff,
- strlen(name_buff));
+ strlen(name_buff),
+ dflt_key_cache);
DBUG_EXECUTE_IF("myisam_pretend_crashed_table_on_open",
if (strstr(name, "/t1"))
@@ -210,7 +211,7 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
DBUG_PRINT("warning",("saved_base_info_length: %d base_info_length: %d",
len,MI_BASE_INFO_SIZE));
}
- disk_pos= my_n_base_info_read(disk_cache + base_pos, &share->base);
+ disk_pos= mi_n_base_info_read(disk_cache + base_pos, &share->base);
share->state.state_length=base_pos;
if (!(open_flags & HA_OPEN_FOR_REPAIR) &&
@@ -235,8 +236,8 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
}
key_parts+=fulltext_keys*FT_SEGS;
- if (share->base.max_key_length > MI_MAX_KEY_BUFF || keys > MI_MAX_KEY ||
- key_parts > MI_MAX_KEY * MI_MAX_KEY_SEG)
+ if (share->base.max_key_length > HA_MAX_KEY_BUFF || keys > MI_MAX_KEY ||
+ key_parts > MI_MAX_KEY * HA_MAX_KEY_SEG)
{
DBUG_PRINT("error",("Wrong key info: Max_key_length: %d keys: %d key_parts: %d", share->base.max_key_length, keys, key_parts));
my_errno=HA_ERR_UNSUPPORTED;
@@ -452,7 +453,7 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
if (share->rec[i].type == (int) FIELD_BLOB)
{
share->blobs[j].pack_length=
- share->rec[i].length-mi_portable_sizeof_char_ptr;;
+ share->rec[i].length-portable_sizeof_char_ptr;;
share->blobs[j].offset=offset;
j++;
}
@@ -1017,7 +1018,7 @@ uint mi_base_info_write(File file, MI_BASE_INFO *base)
}
-uchar *my_n_base_info_read(uchar *ptr, MI_BASE_INFO *base)
+uchar *mi_n_base_info_read(uchar *ptr, MI_BASE_INFO *base)
{
base->keystart = mi_sizekorr(ptr); ptr +=8;
base->max_data_file_length = mi_sizekorr(ptr); ptr +=8;
diff --git a/storage/myisam/mi_packrec.c b/storage/myisam/mi_packrec.c
index 305b7e5532c..a1bfb9e2c9b 100644
--- a/storage/myisam/mi_packrec.c
+++ b/storage/myisam/mi_packrec.c
@@ -105,6 +105,7 @@ static void init_bit_buffer(MI_BIT_BUFF *bit_buff,uchar *buffer,uint length);
static uint fill_and_get_bits(MI_BIT_BUFF *bit_buff,uint count);
static void fill_buffer(MI_BIT_BUFF *bit_buff);
static uint max_bit(uint value);
+static uint read_pack_length(uint version, const uchar *buf, ulong *length);
#ifdef HAVE_MMAP
static uchar *_mi_mempack_get_block_info(MI_INFO *myisam, MI_BIT_BUFF *bit_buff,
MI_BLOCK_INFO *info, uchar **rec_buff_p,
@@ -1036,7 +1037,7 @@ static void uf_blob(MI_COLUMNDEF *rec, MI_BIT_BUFF *bit_buff,
else
{
ulong length=get_bits(bit_buff,rec->space_length_bits);
- uint pack_length=(uint) (end-to)-mi_portable_sizeof_char_ptr;
+ uint pack_length=(uint) (end-to)-portable_sizeof_char_ptr;
if (bit_buff->blob_pos+length > bit_buff->blob_end)
{
bit_buff->error=1;
@@ -1044,7 +1045,7 @@ static void uf_blob(MI_COLUMNDEF *rec, MI_BIT_BUFF *bit_buff,
return;
}
decode_bytes(rec,bit_buff,bit_buff->blob_pos,bit_buff->blob_pos+length);
- _my_store_blob_length((uchar*) to,pack_length,length);
+ _mi_store_blob_length((uchar*) to,pack_length,length);
memcpy_fixed((char*) to+pack_length,(char*) &bit_buff->blob_pos,
sizeof(char*));
bit_buff->blob_pos+=length;
@@ -1625,7 +1626,7 @@ uint save_pack_length(uint version, uchar *block_buff, ulong length)
}
-uint read_pack_length(uint version, const uchar *buf, ulong *length)
+static uint read_pack_length(uint version, const uchar *buf, ulong *length)
{
if (buf[0] < 254)
{
diff --git a/storage/myisam/mi_range.c b/storage/myisam/mi_range.c
index 932a4abd1b3..8bd122c828a 100644
--- a/storage/myisam/mi_range.c
+++ b/storage/myisam/mi_range.c
@@ -260,7 +260,7 @@ static uint _mi_keynr(MI_INFO *info, register MI_KEYDEF *keyinfo, uchar *page,
uchar *keypos, uint *ret_max_key)
{
uint nod_flag,keynr,max_key;
- uchar t_buff[MI_MAX_KEY_BUFF],*end;
+ uchar t_buff[HA_MAX_KEY_BUFF],*end;
end= page+mi_getint(page);
nod_flag=mi_test_if_nod(page);
diff --git a/storage/myisam/mi_rkey.c b/storage/myisam/mi_rkey.c
index f1d35810d36..f20b0366683 100644
--- a/storage/myisam/mi_rkey.c
+++ b/storage/myisam/mi_rkey.c
@@ -85,6 +85,8 @@ int mi_rkey(MI_INFO *info, uchar *buf, int inx, const uchar *key,
{
mi_print_error(info->s, HA_ERR_CRASHED);
my_errno=HA_ERR_CRASHED;
+ if (share->concurrent_insert)
+ rw_unlock(&share->key_root_lock[inx]);
goto err;
}
break;
diff --git a/storage/myisam/mi_search.c b/storage/myisam/mi_search.c
index 2195ac178dd..f4cac27a43f 100644
--- a/storage/myisam/mi_search.c
+++ b/storage/myisam/mi_search.c
@@ -60,7 +60,7 @@ int _mi_search(register MI_INFO *info, register MI_KEYDEF *keyinfo,
int error,flag;
uint nod_flag;
uchar *keypos,*maxpos;
- uchar lastkey[MI_MAX_KEY_BUFF],*buff;
+ uchar lastkey[HA_MAX_KEY_BUFF],*buff;
DBUG_ENTER("_mi_search");
DBUG_PRINT("enter",("pos: %lu nextflag: %u lastpos: %lu",
(ulong) pos, nextflag, (ulong) info->lastpos));
@@ -242,7 +242,7 @@ int _mi_seq_search(MI_INFO *info, register MI_KEYDEF *keyinfo, uchar *page,
{
int flag;
uint nod_flag,length,not_used[2];
- uchar t_buff[MI_MAX_KEY_BUFF],*end;
+ uchar t_buff[HA_MAX_KEY_BUFF],*end;
DBUG_ENTER("_mi_seq_search");
LINT_INIT(flag); LINT_INIT(length);
@@ -296,7 +296,7 @@ int _mi_prefix_search(MI_INFO *info, register MI_KEYDEF *keyinfo, uchar *page,
int key_len_skip, seg_len_pack, key_len_left;
uchar *end, *kseg, *vseg;
uchar *sort_order=keyinfo->seg->charset->sort_order;
- uchar tt_buff[MI_MAX_KEY_BUFF+2], *t_buff=tt_buff+2;
+ uchar tt_buff[HA_MAX_KEY_BUFF+2], *t_buff=tt_buff+2;
uchar *saved_from, *saved_to, *saved_vseg;
uint saved_length=0, saved_prefix_len=0;
uint length_pack;
@@ -920,7 +920,7 @@ uint _mi_get_binary_pack_key(register MI_KEYDEF *keyinfo, uint nod_flag,
DBUG_ENTER("_mi_get_binary_pack_key");
page= *page_pos;
- page_end=page+MI_MAX_KEY_BUFF+1;
+ page_end=page+HA_MAX_KEY_BUFF+1;
start_key=key;
/*
@@ -1238,7 +1238,7 @@ int _mi_search_next(register MI_INFO *info, register MI_KEYDEF *keyinfo,
{
int error;
uint nod_flag;
- uchar lastkey[MI_MAX_KEY_BUFF];
+ uchar lastkey[HA_MAX_KEY_BUFF];
DBUG_ENTER("_mi_search_next");
DBUG_PRINT("enter",("nextflag: %u lastpos: %lu int_keypos: %lu",
nextflag, (ulong) info->lastpos,
diff --git a/storage/myisam/mi_test1.c b/storage/myisam/mi_test1.c
index a68bcbed56c..8e491823939 100644
--- a/storage/myisam/mi_test1.c
+++ b/storage/myisam/mi_test1.c
@@ -71,14 +71,16 @@ static int run_test(const char *filename)
/* First define 2 columns */
recinfo[0].type=FIELD_NORMAL; recinfo[0].length=1; /* For NULL bits */
recinfo[1].type=key_field;
- recinfo[1].length= (key_field == FIELD_BLOB ? 4+mi_portable_sizeof_char_ptr :
+ recinfo[1].length= (key_field == FIELD_BLOB ? 4+portable_sizeof_char_ptr :
key_length);
if (key_field == FIELD_VARCHAR)
recinfo[1].length+= HA_VARCHAR_PACKLENGTH(key_length);;
recinfo[2].type=extra_field;
- recinfo[2].length= (extra_field == FIELD_BLOB ? 4 + mi_portable_sizeof_char_ptr : 24);
+ recinfo[2].length= (extra_field == FIELD_BLOB ? 4 + portable_sizeof_char_ptr : 24);
if (extra_field == FIELD_VARCHAR)
recinfo[2].length+= HA_VARCHAR_PACKLENGTH(recinfo[2].length);
+ recinfo[1].null_bit= null_fields ? 2 : 0;
+
if (opt_unique)
{
recinfo[3].type=FIELD_CHECK;
@@ -630,7 +632,7 @@ get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
key_type= HA_KEYTYPE_VARTEXT1;
break;
case 'k':
- if (key_length < 4 || key_length > MI_MAX_KEY_LENGTH)
+ if (key_length < 4 || key_length > HA_MAX_KEY_LENGTH)
{
fprintf(stderr,"Wrong key length\n");
exit(1);
diff --git a/storage/myisam/mi_test2.c b/storage/myisam/mi_test2.c
index 902801b5e6e..fd8adeed1c5 100644
--- a/storage/myisam/mi_test2.c
+++ b/storage/myisam/mi_test2.c
@@ -26,6 +26,7 @@
#endif
#include "myisamdef.h"
#include <m_ctype.h>
+#include <my_bit.h>
#define STANDARD_LENGTH 37
#define MYISAM_KEYS 6
@@ -187,7 +188,7 @@ int main(int argc, char *argv[])
if (use_blob)
{
recinfo[6].type=FIELD_BLOB;
- recinfo[6].length=4+mi_portable_sizeof_char_ptr;
+ recinfo[6].length=4+portable_sizeof_char_ptr;
recinfo[6].null_bit=0;
recinfo[6].null_pos=0;
}
@@ -605,7 +606,7 @@ int main(int argc, char *argv[])
if (mi_rsame(file,read_record2,(int) i)) goto err;
if (bcmp(read_record,read_record2,reclength) != 0)
{
- printf("is_rsame didn't find same record\n");
+ printf("mi_rsame didn't find same record\n");
goto end;
}
}
@@ -779,8 +780,7 @@ int main(int argc, char *argv[])
{
ulong blob_length,pos;
uchar *ptr;
- longget(blob_length,read_record+blob_pos+4);
- ptr=(uchar*) blob_length;
+ memcpy_fixed(&ptr, read_record+blob_pos+4, sizeof(ptr));
longget(blob_length,read_record+blob_pos);
for (pos=0 ; pos < blob_length ; pos++)
{
diff --git a/storage/myisam/mi_unique.c b/storage/myisam/mi_unique.c
index e490fb683e4..02fcd9289dd 100644
--- a/storage/myisam/mi_unique.c
+++ b/storage/myisam/mi_unique.c
@@ -212,7 +212,7 @@ int mi_unique_comp(MI_UNIQUEDEF *def, const uchar *a, const uchar *b,
if (type == HA_KEYTYPE_TEXT || type == HA_KEYTYPE_VARTEXT1 ||
type == HA_KEYTYPE_VARTEXT2)
{
- if (mi_compare_text(keyseg->charset, (uchar *) pos_a, a_length,
+ if (ha_compare_text(keyseg->charset, (uchar *) pos_a, a_length,
(uchar *) pos_b, b_length, 0, 1))
return 1;
}
diff --git a/storage/myisam/mi_update.c b/storage/myisam/mi_update.c
index 956334b7806..dc6a1659931 100644
--- a/storage/myisam/mi_update.c
+++ b/storage/myisam/mi_update.c
@@ -23,7 +23,7 @@ int mi_update(register MI_INFO *info, const uchar *oldrec, uchar *newrec)
int flag,key_changed,save_errno;
reg3 my_off_t pos;
uint i;
- uchar old_key[MI_MAX_KEY_BUFF],*new_key;
+ uchar old_key[HA_MAX_KEY_BUFF],*new_key;
bool auto_key_changed=0;
ulonglong changed;
MYISAM_SHARE *share=info->s;
diff --git a/storage/myisam/mi_write.c b/storage/myisam/mi_write.c
index 70ba7a4588a..05372ad12e8 100644
--- a/storage/myisam/mi_write.c
+++ b/storage/myisam/mi_write.c
@@ -346,7 +346,7 @@ static int w_search(register MI_INFO *info, register MI_KEYDEF *keyinfo,
int error,flag;
uint nod_flag, search_key_length;
uchar *temp_buff,*keypos;
- uchar keybuff[MI_MAX_KEY_BUFF];
+ uchar keybuff[HA_MAX_KEY_BUFF];
my_bool was_last_key;
my_off_t next_page, dupp_key_pos;
DBUG_ENTER("w_search");
@@ -354,7 +354,7 @@ static int w_search(register MI_INFO *info, register MI_KEYDEF *keyinfo,
search_key_length= (comp_flag & SEARCH_FIND) ? key_length : USE_WHOLE_KEY;
if (!(temp_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
- MI_MAX_KEY_BUFF*2)))
+ HA_MAX_KEY_BUFF*2)))
DBUG_RETURN(-1);
if (!_mi_fetch_keypage(info,keyinfo,page,DFLT_INIT_HITS,temp_buff,0))
goto err;
@@ -545,7 +545,7 @@ int _mi_insert(register MI_INFO *info, register MI_KEYDEF *keyinfo,
get_key_length(alen,a);
DBUG_ASSERT(info->ft1_to_ft2==0);
if (alen == blen &&
- mi_compare_text(keyinfo->seg->charset, a, alen, b, blen, 0, 0)==0)
+ ha_compare_text(keyinfo->seg->charset, a, alen, b, blen, 0, 0)==0)
{
/* yup. converting */
info->ft1_to_ft2=(DYNAMIC_ARRAY *)
@@ -707,7 +707,7 @@ static uchar *_mi_find_last_pos(MI_KEYDEF *keyinfo, uchar *page,
{
uint keys,length,last_length,key_ref_length;
uchar *end,*lastpos,*prevpos;
- uchar key_buff[MI_MAX_KEY_BUFF];
+ uchar key_buff[HA_MAX_KEY_BUFF];
DBUG_ENTER("_mi_find_last_pos");
key_ref_length=2;
@@ -764,7 +764,7 @@ static int _mi_balance_page(register MI_INFO *info, MI_KEYDEF *keyinfo,
length,keys;
uchar *pos,*buff,*extra_buff;
my_off_t next_page,new_pos;
- uchar tmp_part_key[MI_MAX_KEY_BUFF];
+ uchar tmp_part_key[HA_MAX_KEY_BUFF];
DBUG_ENTER("_mi_balance_page");
k_length=keyinfo->keylength;
@@ -930,7 +930,7 @@ static int keys_free(uchar *key, TREE_FREE mode, bulk_insert_param *param)
Probably I can use info->lastkey here, but I'm not sure,
and to be safe I'd better use local lastkey.
*/
- uchar lastkey[MI_MAX_KEY_BUFF];
+ uchar lastkey[HA_MAX_KEY_BUFF];
uint keylen;
MI_KEYDEF *keyinfo;
diff --git a/storage/myisam/myisamchk.c b/storage/myisam/myisamchk.c
index 567e1057e5d..80159518cd0 100644
--- a/storage/myisam/myisamchk.c
+++ b/storage/myisam/myisamchk.c
@@ -16,10 +16,10 @@
/* Describe, check and repair of MyISAM tables */
#include "fulltext.h"
-
#include <m_ctype.h>
#include <stdarg.h>
#include <my_getopt.h>
+#include <my_bit.h>
#ifdef HAVE_SYS_VADVICE_H
#include <sys/vadvise.h>
#endif
@@ -67,9 +67,9 @@ static const char *myisam_stats_method_str="nulls_unequal";
static void get_options(int *argc,char * * *argv);
static void print_version(void);
static void usage(void);
-static int myisamchk(MI_CHECK *param, char *filename);
-static void descript(MI_CHECK *param, register MI_INFO *info, char * name);
-static int mi_sort_records(MI_CHECK *param, register MI_INFO *info,
+static int myisamchk(HA_CHECK *param, char *filename);
+static void descript(HA_CHECK *param, register MI_INFO *info, char * name);
+static int mi_sort_records(HA_CHECK *param, register MI_INFO *info,
char * name, uint sort_key,
my_bool write_info, my_bool update_index);
static int sort_record_index(MI_SORT_PARAM *sort_param, MI_INFO *info,
@@ -77,7 +77,7 @@ static int sort_record_index(MI_SORT_PARAM *sort_param, MI_INFO *info,
my_off_t page,uchar *buff,uint sortkey,
File new_file, my_bool update_index);
-MI_CHECK check_param;
+HA_CHECK check_param;
/* Main program */
@@ -695,7 +695,7 @@ get_one_option(int optid,
case OPT_STATS_METHOD:
{
int method;
- enum_mi_stats_method method_conv;
+ enum_handler_stats_method method_conv;
LINT_INIT(method_conv);
myisam_stats_method_str= argument;
if ((method=find_type(argument, &myisam_stats_method_typelib, 2)) <= 0)
@@ -794,7 +794,7 @@ static void get_options(register int *argc,register char ***argv)
/* Check table */
-static int myisamchk(MI_CHECK *param, char * filename)
+static int myisamchk(HA_CHECK *param, char * filename)
{
int error,lock_type,recreate;
int rep_quick= param->testflag & (T_QUICK | T_FORCE_UNIQUENESS);
@@ -1199,7 +1199,7 @@ end2:
/* Write info about table */
-static void descript(MI_CHECK *param, register MI_INFO *info, char * name)
+static void descript(HA_CHECK *param, register MI_INFO *info, char * name)
{
uint key,keyseg_nr,field,start;
reg3 MI_KEYDEF *keyinfo;
@@ -1464,7 +1464,7 @@ static void descript(MI_CHECK *param, register MI_INFO *info, char * name)
/* Sort records according to one key */
-static int mi_sort_records(MI_CHECK *param,
+static int mi_sort_records(HA_CHECK *param,
register MI_INFO *info, char * name,
uint sort_key,
my_bool write_info,
@@ -1478,7 +1478,7 @@ static int mi_sort_records(MI_CHECK *param,
ha_rows old_record_count;
MYISAM_SHARE *share=info->s;
char llbuff[22],llbuff2[22];
- SORT_INFO sort_info;
+ MI_SORT_INFO sort_info;
MI_SORT_PARAM sort_param;
DBUG_ENTER("sort_records");
@@ -1653,10 +1653,10 @@ static int sort_record_index(MI_SORT_PARAM *sort_param,MI_INFO *info,
uint nod_flag,used_length,key_length;
uchar *temp_buff,*keypos,*endpos;
my_off_t next_page,rec_pos;
- uchar lastkey[MI_MAX_KEY_BUFF];
+ uchar lastkey[HA_MAX_KEY_BUFF];
char llbuff[22];
- SORT_INFO *sort_info= sort_param->sort_info;
- MI_CHECK *param=sort_info->param;
+ MI_SORT_INFO *sort_info= sort_param->sort_info;
+ HA_CHECK *param=sort_info->param;
DBUG_ENTER("sort_record_index");
nod_flag=mi_test_if_nod(buff);
@@ -1744,7 +1744,7 @@ err:
static int not_killed= 0;
-volatile int *killed_ptr(MI_CHECK *param __attribute__((unused)))
+volatile int *killed_ptr(HA_CHECK *param __attribute__((unused)))
{
return &not_killed; /* always NULL */
}
@@ -1752,7 +1752,7 @@ volatile int *killed_ptr(MI_CHECK *param __attribute__((unused)))
/* print warnings and errors */
/* VARARGS */
-void mi_check_print_info(MI_CHECK *param __attribute__((unused)),
+void mi_check_print_info(HA_CHECK *param __attribute__((unused)),
const char *fmt,...)
{
va_list args;
@@ -1765,7 +1765,7 @@ void mi_check_print_info(MI_CHECK *param __attribute__((unused)),
/* VARARGS */
-void mi_check_print_warning(MI_CHECK *param, const char *fmt,...)
+void mi_check_print_warning(HA_CHECK *param, const char *fmt,...)
{
va_list args;
DBUG_ENTER("mi_check_print_warning");
@@ -1790,7 +1790,7 @@ void mi_check_print_warning(MI_CHECK *param, const char *fmt,...)
/* VARARGS */
-void mi_check_print_error(MI_CHECK *param, const char *fmt,...)
+void mi_check_print_error(HA_CHECK *param, const char *fmt,...)
{
va_list args;
DBUG_ENTER("mi_check_print_error");
diff --git a/storage/myisam/myisamdef.h b/storage/myisam/myisamdef.h
index 721d6b9f271..b0b6da03015 100644
--- a/storage/myisam/myisamdef.h
+++ b/storage/myisam/myisamdef.h
@@ -15,8 +15,8 @@
/* This file is included by all internal myisam files */
-#include "myisam.h" /* Structs & some defines */
-#include "myisampack.h" /* packing of keys */
+#include "myisam.h" /* Structs & some defines */
+#include "myisampack.h" /* packing of keys */
#include <my_tree.h>
#ifdef THREAD
#include <my_pthread.h>
@@ -26,15 +26,16 @@
#endif
#if defined(my_write) && !defined(MAP_TO_USE_RAID)
-#undef my_write /* undef map from my_nosys; We need test-if-disk full */
+/* undef map from my_nosys; We need test-if-disk full */
+#undef my_write
#endif
typedef struct st_mi_status_info
{
- ha_rows records; /* Rows in table */
- ha_rows del; /* Removed rows */
- my_off_t empty; /* lost space in datafile */
- my_off_t key_empty; /* lost space in indexfile */
+ ha_rows records; /* Rows in table */
+ ha_rows del; /* Removed rows */
+ my_off_t empty; /* lost space in datafile */
+ my_off_t key_empty; /* lost space in indexfile */
my_off_t key_file_length;
my_off_t data_file_length;
ha_checksum checksum;
@@ -42,347 +43,292 @@ typedef struct st_mi_status_info
typedef struct st_mi_state_info
{
- struct { /* Fileheader */
+ struct
+ { /* Fileheader */
uchar file_version[4];
uchar options[2];
uchar header_length[2];
uchar state_info_length[2];
uchar base_info_length[2];
uchar base_pos[2];
- uchar key_parts[2]; /* Key parts */
- uchar unique_key_parts[2]; /* Key parts + unique parts */
- uchar keys; /* number of keys in file */
- uchar uniques; /* number of UNIQUE definitions */
- uchar language; /* Language for indexes */
- uchar max_block_size_index; /* max keyblock size */
+ uchar key_parts[2]; /* Key parts */
+ uchar unique_key_parts[2]; /* Key parts + unique parts */
+ uchar keys; /* number of keys in file */
+ uchar uniques; /* number of UNIQUE definitions */
+ uchar language; /* Language for indexes */
+ uchar max_block_size_index; /* max keyblock size */
uchar fulltext_keys;
uchar not_used; /* To align to 8 */
} header;
MI_STATUS_INFO state;
- ha_rows split; /* number of split blocks */
- my_off_t dellink; /* Link to next removed block */
+ ha_rows split; /* number of split blocks */
+ my_off_t dellink; /* Link to next removed block */
ulonglong auto_increment;
- ulong process; /* process that updated table last */
- ulong unique; /* Unique number for this process */
- ulong update_count; /* Updated for each write lock */
+ ulong process; /* process that updated table last */
+ ulong unique; /* Unique number for this process */
+ ulong update_count; /* Updated for each write lock */
ulong status;
ulong *rec_per_key_part;
- my_off_t *key_root; /* Start of key trees */
- my_off_t *key_del; /* delete links for trees */
- my_off_t rec_per_key_rows; /* Rows when calculating rec_per_key */
-
- ulong sec_index_changed; /* Updated when new sec_index */
- ulong sec_index_used; /* which extra index are in use */
- ulonglong key_map; /* Which keys are in use */
ha_checksum checksum; /* Table checksum */
- ulong version; /* timestamp of create */
- time_t create_time; /* Time when created database */
- time_t recover_time; /* Time for last recover */
- time_t check_time; /* Time for last check */
- uint sortkey; /* sorted by this key (not used) */
+ my_off_t *key_root; /* Start of key trees */
+ my_off_t *key_del; /* delete links for trees */
+ my_off_t rec_per_key_rows; /* Rows when calculating rec_per_key */
+
+ ulong sec_index_changed; /* Updated when new sec_index */
+ ulong sec_index_used; /* which extra index are in use */
+ ulonglong key_map; /* Which keys are in use */
+ ulong version; /* timestamp of create */
+ time_t create_time; /* Time when created database */
+ time_t recover_time; /* Time for last recover */
+ time_t check_time; /* Time for last check */
+ uint sortkey; /* sorted by this key (not used) */
uint open_count;
- uint8 changed; /* Changed since myisamchk */
+ uint8 changed; /* Changed since myisamchk */
/* the following isn't saved on disk */
- uint state_diff_length; /* Should be 0 */
- uint state_length; /* Length of state header in file */
+ uint state_diff_length; /* Should be 0 */
+ uint state_length; /* Length of state header in file */
ulong *key_info;
} MI_STATE_INFO;
-#define MI_STATE_INFO_SIZE (24+14*8+7*4+2*2+8)
-#define MI_STATE_KEY_SIZE 8
+#define MI_STATE_INFO_SIZE (24+14*8+7*4+2*2+8)
+#define MI_STATE_KEY_SIZE 8
#define MI_STATE_KEYBLOCK_SIZE 8
-#define MI_STATE_KEYSEG_SIZE 4
-#define MI_STATE_EXTRA_SIZE ((MI_MAX_KEY+MI_MAX_KEY_BLOCK_SIZE)*MI_STATE_KEY_SIZE + MI_MAX_KEY*MI_MAX_KEY_SEG*MI_STATE_KEYSEG_SIZE)
-#define MI_KEYDEF_SIZE (2+ 5*2)
-#define MI_UNIQUEDEF_SIZE (2+1+1)
-#define HA_KEYSEG_SIZE (6+ 2*2 + 4*2)
-#define MI_COLUMNDEF_SIZE (2*3+1)
-#define MI_BASE_INFO_SIZE (5*8 + 8*4 + 4 + 4*2 + 16)
-#define MI_INDEX_BLOCK_MARGIN 16 /* Safety margin for .MYI tables */
+#define MI_STATE_KEYSEG_SIZE 4
+#define MI_STATE_EXTRA_SIZE ((MI_MAX_KEY+MI_MAX_KEY_BLOCK_SIZE)*MI_STATE_KEY_SIZE + MI_MAX_KEY*HA_MAX_KEY_SEG*MI_STATE_KEYSEG_SIZE)
+#define MI_KEYDEF_SIZE (2+ 5*2)
+#define MI_UNIQUEDEF_SIZE (2+1+1)
+#define HA_KEYSEG_SIZE (6+ 2*2 + 4*2)
+#define MI_COLUMNDEF_SIZE (2*3+1)
+#define MI_BASE_INFO_SIZE (5*8 + 8*4 + 4 + 4*2 + 16)
+#define MI_INDEX_BLOCK_MARGIN 16 /* Safety margin for .MYI tables */
typedef struct st_mi_base_info
{
- my_off_t keystart; /* Start of keys */
+ my_off_t keystart; /* Start of keys */
my_off_t max_data_file_length;
my_off_t max_key_file_length;
my_off_t margin_key_file_length;
- ha_rows records,reloc; /* Create information */
- ulong mean_row_length; /* Create information */
- ulong reclength; /* length of unpacked record */
- ulong pack_reclength; /* Length of full packed rec. */
+ ha_rows records, reloc; /* Create information */
+ ulong mean_row_length; /* Create information */
+ ulong reclength; /* length of unpacked record */
+ ulong pack_reclength; /* Length of full packed rec. */
ulong min_pack_length;
- ulong max_pack_length; /* Max possibly length of packed rec.*/
+ ulong max_pack_length; /* Max possibly length of packed rec.*/
ulong min_block_length;
- ulong fields, /* fields in table */
- pack_fields; /* packed fields in table */
- uint rec_reflength; /* = 2-8 */
- uint key_reflength; /* = 2-8 */
- uint keys; /* same as in state.header */
- uint auto_key; /* Which key-1 is a auto key */
- uint blobs; /* Number of blobs */
- uint pack_bits; /* Length of packed bits */
- uint max_key_block_length; /* Max block length */
- uint max_key_length; /* Max key length */
+ ulong fields, /* fields in table */
+ pack_fields; /* packed fields in table */
+ uint rec_reflength; /* = 2-8 */
+ uint key_reflength; /* = 2-8 */
+ uint keys; /* same as in state.header */
+ uint auto_key; /* Which key-1 is a auto key */
+ uint blobs; /* Number of blobs */
+ uint pack_bits; /* Length of packed bits */
+ uint max_key_block_length; /* Max block length */
+ uint max_key_length; /* Max key length */
/* Extra allocation when using dynamic record format */
uint extra_alloc_bytes;
uint extra_alloc_procent;
/* Info about raid */
- uint raid_type,raid_chunks;
+ uint raid_type, raid_chunks;
ulong raid_chunksize;
/* The following are from the header */
- uint key_parts,all_key_parts;
+ uint key_parts, all_key_parts;
} MI_BASE_INFO;
- /* Structs used intern in database */
+ /* Structs used intern in database */
-typedef struct st_mi_blob /* Info of record */
+typedef struct st_mi_blob /* Info of record */
{
- ulong offset; /* Offset to blob in record */
- uint pack_length; /* Type of packed length */
- ulong length; /* Calc:ed for each record */
+ ulong offset; /* Offset to blob in record */
+ uint pack_length; /* Type of packed length */
+ ulong length; /* Calc:ed for each record */
} MI_BLOB;
-typedef struct st_mi_isam_pack {
+typedef struct st_mi_isam_pack
+{
ulong header_length;
uint ref_length;
uchar version;
} MI_PACK;
-#define MAX_NONMAPPED_INSERTS 1000
+#define MAX_NONMAPPED_INSERTS 1000
-typedef struct st_mi_isam_share { /* Shared between opens */
+typedef struct st_mi_isam_share
+{ /* Shared between opens */
MI_STATE_INFO state;
MI_BASE_INFO base;
- MI_KEYDEF ft2_keyinfo; /* Second-level ft-key definition */
- MI_KEYDEF *keyinfo; /* Key definitions */
- MI_UNIQUEDEF *uniqueinfo; /* unique definitions */
- HA_KEYSEG *keyparts; /* key part info */
- MI_COLUMNDEF *rec; /* Pointer to field information */
- MI_PACK pack; /* Data about packed records */
- MI_BLOB *blobs; /* Pointer to blobs */
- char *unique_file_name; /* realpath() of index file */
- char *data_file_name, /* Resolved path names from symlinks */
- *index_file_name;
- uchar *file_map; /* mem-map of file if possible */
- KEY_CACHE *key_cache; /* ref to the current key cache */
+ MI_KEYDEF ft2_keyinfo; /* Second-level ft-key definition */
+ MI_KEYDEF *keyinfo; /* Key definitions */
+ MI_UNIQUEDEF *uniqueinfo; /* unique definitions */
+ HA_KEYSEG *keyparts; /* key part info */
+ MI_COLUMNDEF *rec; /* Pointer to field information */
+ MI_PACK pack; /* Data about packed records */
+ MI_BLOB *blobs; /* Pointer to blobs */
+ char *unique_file_name; /* realpath() of index file */
+ char *data_file_name, /* Resolved path names from symlinks */
+ *index_file_name;
+ uchar *file_map; /* mem-map of file if possible */
+ KEY_CACHE *key_cache; /* ref to the current key cache */
MI_DECODE_TREE *decode_trees;
uint16 *decode_tables;
- int (*read_record)(struct st_myisam_info*, my_off_t, uchar*);
- int (*write_record)(struct st_myisam_info*, const uchar*);
- int (*update_record)(struct st_myisam_info*, my_off_t, const uchar*);
- int (*delete_record)(struct st_myisam_info*);
- int (*read_rnd)(struct st_myisam_info*, uchar*, my_off_t, my_bool);
- int (*compare_record)(struct st_myisam_info*, const uchar *);
/* Function to use for a row checksum. */
- ha_checksum (*calc_checksum)(struct st_myisam_info*, const uchar *);
- int (*compare_unique)(struct st_myisam_info*, MI_UNIQUEDEF *,
- const uchar *record, my_off_t pos);
- size_t (*file_read)(MI_INFO *, uchar *, size_t, my_off_t, myf);
- size_t (*file_write)(MI_INFO *, const uchar *, size_t, my_off_t, myf);
+ int(*read_record) (struct st_myisam_info *, my_off_t, uchar*);
+ int(*write_record) (struct st_myisam_info *, const uchar*);
+ int(*update_record) (struct st_myisam_info *, my_off_t, const uchar*);
+ int(*delete_record) (struct st_myisam_info *);
+ int(*read_rnd) (struct st_myisam_info *, uchar*, my_off_t, my_bool);
+ int(*compare_record) (struct st_myisam_info *, const uchar*);
+ ha_checksum(*calc_checksum) (struct st_myisam_info *, const uchar*);
+ int(*compare_unique) (struct st_myisam_info *, MI_UNIQUEDEF *,
+ const uchar *record, my_off_t pos);
+ uint(*file_read) (MI_INFO *, uchar *, size_t, my_off_t, myf);
+ uint(*file_write) (MI_INFO *, const uchar *, size_t, my_off_t, myf);
invalidator_by_filename invalidator; /* query cache invalidator */
- ulong this_process; /* processid */
- ulong last_process; /* For table-change-check */
- ulong last_version; /* Version on start */
- ulong options; /* Options used */
- ulong min_pack_length; /* Theese are used by packed data */
+ ulong this_process; /* processid */
+ ulong last_process; /* For table-change-check */
+ ulong last_version; /* Version on start */
+ ulong options; /* Options used */
+ ulong min_pack_length; /* Theese are used by packed data */
ulong max_pack_length;
ulong state_diff_length;
- uint rec_reflength; /* rec_reflength in use now */
- uint unique_name_length;
+ uint rec_reflength; /* rec_reflength in use now */
+ uint unique_name_length;
uint32 ftparsers; /* Number of distinct ftparsers + 1 */
- File kfile; /* Shared keyfile */
- File data_file; /* Shared data file */
- int mode; /* mode of file on open */
- uint reopen; /* How many times reopened */
- uint w_locks,r_locks,tot_locks; /* Number of read/write locks */
- uint blocksize; /* blocksize of keyfile */
+ File kfile; /* Shared keyfile */
+ File data_file; /* Shared data file */
+ int mode; /* mode of file on open */
+ uint reopen; /* How many times reopened */
+ uint w_locks, r_locks, tot_locks; /* Number of read/write locks */
+ uint blocksize; /* blocksize of keyfile */
myf write_flag;
enum data_file_type data_file_type;
/* Below flag is needed to make log tables work with concurrent insert */
my_bool is_log_table;
- my_bool changed, /* If changed since lock */
- global_changed, /* If changed since open */
- not_flushed,
- temporary,delay_key_write,
- concurrent_insert;
+ my_bool changed, /* If changed since lock */
+ global_changed, /* If changed since open */
+ not_flushed, temporary, delay_key_write, concurrent_insert;
#ifdef THREAD
THR_LOCK lock;
- pthread_mutex_t intern_lock; /* Locking for use with _locking */
+ pthread_mutex_t intern_lock; /* Locking for use with _locking */
rw_lock_t *key_root_lock;
#endif
my_off_t mmaped_length;
- uint nonmmaped_inserts; /* counter of writing in non-mmaped
- area */
+ /* counter of writing in non-mmaped area */
+ uint nonmmaped_inserts;
rw_lock_t mmap_lock;
} MYISAM_SHARE;
-typedef uint mi_bit_type;
-
-typedef struct st_mi_bit_buff { /* Used for packing of record */
- mi_bit_type current_byte;
- uint bits;
- uchar *pos,*end,*blob_pos,*blob_end;
- uint error;
-} MI_BIT_BUFF;
-
-struct st_myisam_info {
- MYISAM_SHARE *s; /* Shared between open:s */
- MI_STATUS_INFO *state,save_state;
- MI_BLOB *blobs; /* Pointer to blobs */
- MI_BIT_BUFF bit_buff;
+struct st_myisam_info
+{
+ MYISAM_SHARE *s; /* Shared between open:s */
+ MI_STATUS_INFO *state, save_state;
+ MI_BLOB *blobs; /* Pointer to blobs */
+ MI_BIT_BUFF bit_buff;
/* accumulate indexfile changes between write's */
- TREE *bulk_insert;
+ TREE *bulk_insert;
DYNAMIC_ARRAY *ft1_to_ft2; /* used only in ft1->ft2 conversion */
MEM_ROOT ft_memroot; /* used by the parser */
- MYSQL_FTPARSER_PARAM *ftparser_param; /* share info between init/deinit */
- char *filename; /* parameter to open filename */
- uchar *buff, /* Temp area for key */
- *lastkey,*lastkey2; /* Last used search key */
- uchar *first_mbr_key; /* Searhed spatial key */
- uchar *rec_buff; /* Tempbuff for recordpack */
- uchar *int_keypos, /* Save position for next/previous */
- *int_maxpos; /* -""- */
- uint int_nod_flag; /* -""- */
- uint32 int_keytree_version; /* -""- */
- int (*read_record)(struct st_myisam_info*, my_off_t, uchar*);
+ MYSQL_FTPARSER_PARAM *ftparser_param; /* share info between init/deinit */
+ char *filename; /* parameter to open filename */
+ uchar *buff, /* Temp area for key */
+ *lastkey, *lastkey2; /* Last used search key */
+ uchar *first_mbr_key; /* Searhed spatial key */
+ uchar *rec_buff; /* Tempbuff for recordpack */
+ uchar *int_keypos, /* Save position for next/previous */
+ *int_maxpos; /* -""- */
+ uint int_nod_flag; /* -""- */
+ uint32 int_keytree_version; /* -""- */
+ int(*read_record) (struct st_myisam_info *, my_off_t, uchar *);
invalidator_by_filename invalidator; /* query cache invalidator */
- ulong this_unique; /* uniq filenumber or thread */
- ulong last_unique; /* last unique number */
- ulong this_loop; /* counter for this open */
- ulong last_loop; /* last used counter */
- my_off_t lastpos, /* Last record position */
- nextpos; /* Position to next record */
+ ulong this_unique; /* uniq filenumber or thread */
+ ulong last_unique; /* last unique number */
+ ulong this_loop; /* counter for this open */
+ ulong last_loop; /* last used counter */
+ my_off_t lastpos, /* Last record position */
+ nextpos; /* Position to next record */
my_off_t save_lastpos;
- my_off_t pos; /* Intern variable */
- my_off_t last_keypage; /* Last key page read */
- my_off_t last_search_keypage; /* Last keypage when searching */
+ my_off_t pos; /* Intern variable */
+ my_off_t last_keypage; /* Last key page read */
+ my_off_t last_search_keypage; /* Last keypage when searching */
my_off_t dupp_key_pos;
ha_checksum checksum; /* Temp storage for row checksum */
- /* QQ: the folloing two xxx_length fields should be removed,
- as they are not compatible with parallel repair */
- ulong packed_length,blob_length; /* Length of found, packed record */
- int dfile; /* The datafile */
- uint opt_flag; /* Optim. for space/speed */
- uint update; /* If file changed since open */
- int lastinx; /* Last used index */
- uint lastkey_length; /* Length of key in lastkey */
- uint last_rkey_length; /* Last length in mi_rkey() */
+ /*
+ QQ: the folloing two xxx_length fields should be removed,
+ as they are not compatible with parallel repair
+ */
+ ulong packed_length, blob_length; /* Length of found, packed record */
+ int dfile; /* The datafile */
+ uint opt_flag; /* Optim. for space/speed */
+ uint update; /* If file changed since open */
+ int lastinx; /* Last used index */
+ uint lastkey_length; /* Length of key in lastkey */
+ uint last_rkey_length; /* Last length in mi_rkey() */
enum ha_rkey_function last_key_func; /* CONTAIN, OVERLAP, etc */
- uint save_lastkey_length;
- uint pack_key_length; /* For MYISAMMRG */
+ uint save_lastkey_length;
+ uint pack_key_length; /* For MYISAMMRG */
uint16 last_used_keyseg; /* For MyISAMMRG */
- int errkey; /* Got last error on this key */
- int lock_type; /* How database was locked */
- int tmp_lock_type; /* When locked by readinfo */
- uint data_changed; /* Somebody has changed data */
- uint save_update; /* When using KEY_READ */
- int save_lastinx;
- LIST open_list;
- IO_CACHE rec_cache; /* When cacheing records */
- uint preload_buff_size; /* When preloading indexes */
- myf lock_wait; /* is 0 or MY_DONT_WAIT */
- my_bool was_locked; /* Was locked in panic */
- my_bool append_insert_at_end; /* Set if concurrent insert */
+ int errkey; /* Got last error on this key */
+ int lock_type; /* How database was locked */
+ int tmp_lock_type; /* When locked by readinfo */
+ uint data_changed; /* Somebody has changed data */
+ uint save_update; /* When using KEY_READ */
+ int save_lastinx;
+ LIST open_list;
+ IO_CACHE rec_cache; /* When cacheing records */
+ uint preload_buff_size; /* When preloading indexes */
+ myf lock_wait; /* is 0 or MY_DONT_WAIT */
+ my_bool was_locked; /* Was locked in panic */
+ my_bool append_insert_at_end; /* Set if concurrent insert */
my_bool quick_mode;
- my_bool page_changed; /* If info->buff can't be used for rnext */
- my_bool buff_used; /* If info->buff has to be reread for rnext */
- my_bool once_flags; /* For MYISAMMRG */
+ /* If info->buff can't be used for rnext */
+ my_bool page_changed;
+ /* If info->buff has to be reread for rnext */
+ my_bool buff_used;
+ my_bool once_flags; /* For MYISAMMRG */
#ifdef __WIN__
my_bool owned_by_merge; /* This MyISAM table is part of a merge union */
#endif
#ifdef THREAD
THR_LOCK_DATA lock;
#endif
- uchar *rtree_recursion_state; /* For RTREE */
- int rtree_recursion_depth;
+ uchar *rtree_recursion_state; /* For RTREE */
+ int rtree_recursion_depth;
};
-typedef struct st_buffpek {
- my_off_t file_pos; /* Where we are in the sort file */
- uchar *base,*key; /* Key pointers */
- ha_rows count; /* Number of rows in table */
- ulong mem_count; /* numbers of keys in memory */
- ulong max_keys; /* Max keys in buffert */
-} BUFFPEK;
-
-typedef struct st_mi_sort_param
-{
- pthread_t thr;
- IO_CACHE read_cache, tempfile, tempfile_for_exceptions;
- DYNAMIC_ARRAY buffpek;
- MI_BIT_BUFF bit_buff; /* For parallel repair of packrec. */
-
- /*
- The next two are used to collect statistics, see update_key_parts for
- description.
- */
- ulonglong unique[MI_MAX_KEY_SEG+1];
- ulonglong notnull[MI_MAX_KEY_SEG+1];
-
- my_off_t pos,max_pos,filepos,start_recpos;
- uint key, key_length,real_key_length,sortbuff_size;
- uint maxbuffers, keys, find_length, sort_keys_length;
- my_bool fix_datafile, master;
- my_bool calc_checksum; /* calculate table checksum */
- MI_KEYDEF *keyinfo;
- HA_KEYSEG *seg;
- SORT_INFO *sort_info;
- uchar **sort_keys;
- uchar *rec_buff;
- void *wordlist, *wordptr;
- MEM_ROOT wordroot;
- uchar *record;
- MY_TMPDIR *tmpdir;
- int (*key_cmp)(struct st_mi_sort_param *, const void *, const void *);
- int (*key_read)(struct st_mi_sort_param *,void *);
- int (*key_write)(struct st_mi_sort_param *, const void *);
- void (*lock_in_memory)(MI_CHECK *);
- NEAR int (*write_keys)(struct st_mi_sort_param *, register uchar **,
- uint , struct st_buffpek *, IO_CACHE *);
- NEAR uint (*read_to_buffer)(IO_CACHE *,struct st_buffpek *, uint);
- NEAR int (*write_key)(struct st_mi_sort_param *, IO_CACHE *,uchar *,
- uint, uint);
-} MI_SORT_PARAM;
-
- /* Some defines used by isam-funktions */
-
-#define USE_WHOLE_KEY MI_MAX_KEY_BUFF*2 /* Use whole key in _mi_search() */
-#define F_EXTRA_LCK -1
-
- /* bits in opt_flag */
-#define MEMMAP_USED 32
+#define USE_WHOLE_KEY HA_MAX_KEY_BUFF*2 /* Use whole key in _mi_search() */
+#define F_EXTRA_LCK -1
+/* bits in opt_flag */
+#define MEMMAP_USED 32
#define REMEMBER_OLD_POS 64
-#define WRITEINFO_UPDATE_KEYFILE 1
-#define WRITEINFO_NO_UNLOCK 2
+#define WRITEINFO_UPDATE_KEYFILE 1
+#define WRITEINFO_NO_UNLOCK 2
- /* once_flags */
+/* once_flags */
#define USE_PACKED_KEYS 1
#define RRND_PRESERVE_LASTINX 2
- /* bits in state.changed */
-
-#define STATE_CHANGED 1
-#define STATE_CRASHED 2
+/* bits in state.changed */
+#define STATE_CHANGED 1
+#define STATE_CRASHED 2
#define STATE_CRASHED_ON_REPAIR 4
-#define STATE_NOT_ANALYZED 8
+#define STATE_NOT_ANALYZED 8
#define STATE_NOT_OPTIMIZED_KEYS 16
-#define STATE_NOT_SORTED_PAGES 32
-
- /* options to mi_read_cache */
+#define STATE_NOT_SORTED_PAGES 32
-#define READING_NEXT 1
-#define READING_HEADER 2
+/* options to mi_read_cache */
+#define READING_NEXT 1
+#define READING_HEADER 2
-#define mi_getint(x) ((uint) mi_uint2korr(x) & 32767)
+#define mi_getint(x) ((uint) mi_uint2korr(x) & 32767)
#define mi_putint(x,y,nod) { uint16 boh=(nod ? (uint16) 32768 : 0) + (uint16) (y);\
- mi_int2store(x,boh); }
+ mi_int2store(x,boh); }
#define mi_test_if_nod(x) (x[0] & 128 ? info->s->base.key_reflength : 0)
#define mi_mark_crashed(x) do{(x)->s->state.changed|= STATE_CRASHED; \
DBUG_PRINT("error", ("Marked table crashed")); \
@@ -400,13 +346,6 @@ typedef struct st_mi_sort_param
/* Functions to store length of space packed keys, VARCHAR or BLOB keys */
-#define store_key_length_inc(key,length) \
-{ if ((length) < 255) \
- { *(key)++=(length); } \
- else \
- { *(key)=255; mi_int2store((key)+1,(length)); (key)+=3; } \
-}
-
#define store_key_length(key,length) \
{ if ((length) < 255) \
{ *(key)=(length); } \
@@ -430,39 +369,39 @@ typedef struct st_mi_sort_param
#define get_pack_length(length) ((length) >= 255 ? 3 : 1)
-#define MI_MIN_BLOCK_LENGTH 20 /* Because of delete-link */
-#define MI_EXTEND_BLOCK_LENGTH 20 /* Don't use to small record-blocks */
-#define MI_SPLIT_LENGTH ((MI_EXTEND_BLOCK_LENGTH+4)*2)
-#define MI_MAX_DYN_BLOCK_HEADER 20 /* Max prefix of record-block */
+#define MI_MIN_BLOCK_LENGTH 20 /* Because of delete-link */
+#define MI_EXTEND_BLOCK_LENGTH 20 /* Don't use to small record-blocks */
+#define MI_SPLIT_LENGTH ((MI_EXTEND_BLOCK_LENGTH+4)*2)
+#define MI_MAX_DYN_BLOCK_HEADER 20 /* Max prefix of record-block */
#define MI_BLOCK_INFO_HEADER_LENGTH 20
-#define MI_DYN_DELETE_BLOCK_HEADER 20 /* length of delete-block-header */
-#define MI_DYN_MAX_BLOCK_LENGTH ((1L << 24)-4L)
-#define MI_DYN_MAX_ROW_LENGTH (MI_DYN_MAX_BLOCK_LENGTH - MI_SPLIT_LENGTH)
-#define MI_DYN_ALIGN_SIZE 4 /* Align blocks on this */
-#define MI_MAX_DYN_HEADER_BYTE 13 /* max header byte for dynamic rows */
-#define MI_MAX_BLOCK_LENGTH ((((ulong) 1 << 24)-1) & (~ (ulong) (MI_DYN_ALIGN_SIZE-1)))
+#define MI_DYN_DELETE_BLOCK_HEADER 20 /* length of delete-block-header */
+#define MI_DYN_MAX_BLOCK_LENGTH ((1L << 24)-4L)
+#define MI_DYN_MAX_ROW_LENGTH (MI_DYN_MAX_BLOCK_LENGTH - MI_SPLIT_LENGTH)
+#define MI_DYN_ALIGN_SIZE 4 /* Align blocks on this */
+#define MI_MAX_DYN_HEADER_BYTE 13 /* max header byte for dynamic rows */
+#define MI_MAX_BLOCK_LENGTH ((((ulong) 1 << 24)-1) & (~ (ulong) (MI_DYN_ALIGN_SIZE-1)))
#define MI_REC_BUFF_OFFSET ALIGN_SIZE(MI_DYN_DELETE_BLOCK_HEADER+sizeof(uint32))
-#define MEMMAP_EXTRA_MARGIN 7 /* Write this as a suffix for file */
+#define MEMMAP_EXTRA_MARGIN 7 /* Write this as a suffix for file */
-#define PACK_TYPE_SELECTED 1 /* Bits in field->pack_type */
-#define PACK_TYPE_SPACE_FIELDS 2
-#define PACK_TYPE_ZERO_FILL 4
-#define MI_FOUND_WRONG_KEY 32738 /* Impossible value from ha_key_cmp */
+#define PACK_TYPE_SELECTED 1 /* Bits in field->pack_type */
+#define PACK_TYPE_SPACE_FIELDS 2
+#define PACK_TYPE_ZERO_FILL 4
+#define MI_FOUND_WRONG_KEY 32738 /* Impossible value from ha_key_cmp */
-#define MI_MAX_KEY_BLOCK_SIZE (MI_MAX_KEY_BLOCK_LENGTH/MI_MIN_KEY_BLOCK_LENGTH)
+#define MI_MAX_KEY_BLOCK_SIZE (MI_MAX_KEY_BLOCK_LENGTH/MI_MIN_KEY_BLOCK_LENGTH)
#define MI_BLOCK_SIZE(key_length,data_pointer,key_pointer,block_size) (((((key_length)+(data_pointer)+(key_pointer))*4+(key_pointer)+2)/(block_size)+1)*(block_size))
-#define MI_MAX_KEYPTR_SIZE 5 /* For calculating block lengths */
-#define MI_MIN_KEYBLOCK_LENGTH 50 /* When to split delete blocks */
+#define MI_MAX_KEYPTR_SIZE 5 /* For calculating block lengths */
+#define MI_MIN_KEYBLOCK_LENGTH 50 /* When to split delete blocks */
-#define MI_MIN_SIZE_BULK_INSERT_TREE 16384 /* this is per key */
+#define MI_MIN_SIZE_BULK_INSERT_TREE 16384 /* this is per key */
#define MI_MIN_ROWS_TO_USE_BULK_INSERT 100
#define MI_MIN_ROWS_TO_DISABLE_INDEXES 100
#define MI_MIN_ROWS_TO_USE_WRITE_CACHE 10
/* The UNIQUE check is done with a hashed long key */
-#define MI_UNIQUE_HASH_TYPE HA_KEYTYPE_ULONG_INT
+#define MI_UNIQUE_HASH_TYPE HA_KEYTYPE_ULONG_INT
#define mi_unique_store(A,B) mi_int4store((A),(B))
#ifdef THREAD
@@ -474,175 +413,182 @@ extern pthread_mutex_t THR_LOCK_myisam;
#define rw_unlock(A) {}
#endif
- /* Some extern variables */
+/* Some extern variables */
extern LIST *myisam_open_list;
-extern uchar NEAR myisam_file_magic[],NEAR myisam_pack_file_magic[];
-extern uint NEAR myisam_read_vec[],NEAR myisam_readnext_vec[];
+extern uchar NEAR myisam_file_magic[], NEAR myisam_pack_file_magic[];
+extern uint NEAR myisam_read_vec[], NEAR myisam_readnext_vec[];
extern uint myisam_quick_table_bits;
extern File myisam_log_file;
extern ulong myisam_pid;
- /* This is used by _mi_calc_xxx_key_length och _mi_store_key */
+/* This is used by _mi_calc_xxx_key_length och _mi_store_key */
typedef struct st_mi_s_param
{
- uint ref_length,key_length,
- n_ref_length,
- n_length,
- totlength,
- part_of_prev_key,prev_length,pack_marker;
- uchar *key, *prev_key,*next_key_pos;
- bool store_not_null;
+ uint ref_length, key_length,
+ n_ref_length,
+ n_length, totlength, part_of_prev_key, prev_length, pack_marker;
+ uchar *key, *prev_key, *next_key_pos;
+ bool store_not_null;
} MI_KEY_PARAM;
- /* Prototypes for intern functions */
+/* Prototypes for intern functions */
-extern int _mi_read_dynamic_record(MI_INFO *info,my_off_t filepos,uchar *buf);
-extern int _mi_write_dynamic_record(MI_INFO*, const uchar*);
-extern int _mi_update_dynamic_record(MI_INFO*, my_off_t, const uchar*);
+extern int _mi_read_dynamic_record(MI_INFO *info, my_off_t filepos, uchar *buf);
+extern int _mi_write_dynamic_record(MI_INFO *, const uchar *);
+extern int _mi_update_dynamic_record(MI_INFO *, my_off_t, const uchar *);
extern int _mi_delete_dynamic_record(MI_INFO *info);
-extern int _mi_cmp_dynamic_record(MI_INFO *info,const uchar *record);
-extern int _mi_read_rnd_dynamic_record(MI_INFO *, uchar *,my_off_t, my_bool);
-extern int _mi_write_blob_record(MI_INFO*, const uchar*);
-extern int _mi_update_blob_record(MI_INFO*, my_off_t, const uchar*);
-extern int _mi_read_static_record(MI_INFO *info, my_off_t filepos,uchar *buf);
-extern int _mi_write_static_record(MI_INFO*, const uchar*);
-extern int _mi_update_static_record(MI_INFO*, my_off_t, const uchar*);
+extern int _mi_cmp_dynamic_record(MI_INFO *info, const uchar *record);
+extern int _mi_read_rnd_dynamic_record(MI_INFO *, uchar *, my_off_t, my_bool);
+extern int _mi_write_blob_record(MI_INFO *, const uchar *);
+extern int _mi_update_blob_record(MI_INFO *, my_off_t, const uchar *);
+extern int _mi_read_static_record(MI_INFO *info, my_off_t filepos, uchar *buf);
+extern int _mi_write_static_record(MI_INFO *, const uchar *);
+extern int _mi_update_static_record(MI_INFO *, my_off_t, const uchar *);
extern int _mi_delete_static_record(MI_INFO *info);
-extern int _mi_cmp_static_record(MI_INFO *info,const uchar *record);
-extern int _mi_read_rnd_static_record(MI_INFO*, uchar *,my_off_t, my_bool);
-extern int _mi_ck_write(MI_INFO *info,uint keynr,uchar *key,uint length);
+extern int _mi_cmp_static_record(MI_INFO *info, const uchar *record);
+extern int _mi_read_rnd_static_record(MI_INFO *, uchar *, my_off_t, my_bool);
+extern int _mi_ck_write(MI_INFO *info, uint keynr, uchar *key, uint length);
extern int _mi_ck_real_write_btree(MI_INFO *info, MI_KEYDEF *keyinfo,
uchar *key, uint key_length,
my_off_t *root, uint comp_flag);
-extern int _mi_enlarge_root(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *key, my_off_t *root);
-extern int _mi_insert(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *key,
- uchar *anc_buff,uchar *key_pos,uchar *key_buff,
- uchar *father_buff, uchar *father_keypos,
- my_off_t father_page, my_bool insert_last);
-extern int _mi_split_page(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *key,
- uchar *buff,uchar *key_buff, my_bool insert_last);
-extern uchar *_mi_find_half_pos(uint nod_flag,MI_KEYDEF *keyinfo,uchar *page,
- uchar *key,uint *return_key_length,
- uchar **after_key);
-extern int _mi_calc_static_key_length(MI_KEYDEF *keyinfo,uint nod_flag,
- uchar *key_pos, uchar *org_key,
- uchar *key_buff,
- uchar *key, MI_KEY_PARAM *s_temp);
-extern int _mi_calc_var_key_length(MI_KEYDEF *keyinfo,uint nod_flag,
- uchar *key_pos, uchar *org_key,
- uchar *key_buff,
- uchar *key, MI_KEY_PARAM *s_temp);
-extern int _mi_calc_var_pack_key_length(MI_KEYDEF *keyinfo,uint nod_flag,
- uchar *key_pos, uchar *org_key,
- uchar *prev_key,
- uchar *key, MI_KEY_PARAM *s_temp);
-extern int _mi_calc_bin_pack_key_length(MI_KEYDEF *keyinfo,uint nod_flag,
- uchar *key_pos,uchar *org_key,
- uchar *prev_key,
- uchar *key, MI_KEY_PARAM *s_temp);
-void _mi_store_static_key(MI_KEYDEF *keyinfo, uchar *key_pos,
- MI_KEY_PARAM *s_temp);
-void _mi_store_var_pack_key(MI_KEYDEF *keyinfo, uchar *key_pos,
- MI_KEY_PARAM *s_temp);
+extern int _mi_enlarge_root(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key,
+ my_off_t *root);
+extern int _mi_insert(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key,
+ uchar *anc_buff, uchar *key_pos, uchar *key_buff,
+ uchar *father_buff, uchar *father_keypos,
+ my_off_t father_page, my_bool insert_last);
+extern int _mi_split_page(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key,
+ uchar *buff, uchar *key_buff, my_bool insert_last);
+extern uchar *_mi_find_half_pos(uint nod_flag, MI_KEYDEF *keyinfo,
+ uchar *page, uchar *key,
+ uint *return_key_length, uchar ** after_key);
+extern int _mi_calc_static_key_length(MI_KEYDEF *keyinfo, uint nod_flag,
+ uchar *key_pos, uchar *org_key,
+ uchar *key_buff, uchar *key,
+ MI_KEY_PARAM *s_temp);
+extern int _mi_calc_var_key_length(MI_KEYDEF *keyinfo, uint nod_flag,
+ uchar *key_pos, uchar *org_key,
+ uchar *key_buff, uchar *key,
+ MI_KEY_PARAM *s_temp);
+extern int _mi_calc_var_pack_key_length(MI_KEYDEF *keyinfo, uint nod_flag,
+ uchar *key_pos, uchar *org_key,
+ uchar *prev_key, uchar *key,
+ MI_KEY_PARAM *s_temp);
+extern int _mi_calc_bin_pack_key_length(MI_KEYDEF *keyinfo, uint nod_flag,
+ uchar *key_pos, uchar *org_key,
+ uchar *prev_key, uchar *key,
+ MI_KEY_PARAM *s_temp);
+void _mi_store_static_key(MI_KEYDEF *keyinfo, uchar *key_pos,
+ MI_KEY_PARAM *s_temp);
+void _mi_store_var_pack_key(MI_KEYDEF *keyinfo, uchar *key_pos,
+ MI_KEY_PARAM *s_temp);
#ifdef NOT_USED
-void _mi_store_pack_key(MI_KEYDEF *keyinfo, uchar *key_pos,
- MI_KEY_PARAM *s_temp);
+void _mi_store_pack_key(MI_KEYDEF *keyinfo, uchar *key_pos,
+ MI_KEY_PARAM *s_temp);
#endif
-void _mi_store_bin_pack_key(MI_KEYDEF *keyinfo, uchar *key_pos,
- MI_KEY_PARAM *s_temp);
+void _mi_store_bin_pack_key(MI_KEYDEF *keyinfo, uchar *key_pos,
+ MI_KEY_PARAM *s_temp);
-extern int _mi_ck_delete(MI_INFO *info,uint keynr,uchar *key,uint key_length);
-extern int _mi_readinfo(MI_INFO *info,int lock_flag,int check_keybuffer);
-extern int _mi_writeinfo(MI_INFO *info,uint options);
+extern int _mi_ck_delete(MI_INFO *info, uint keynr, uchar *key,
+ uint key_length);
+extern int _mi_readinfo(MI_INFO *info, int lock_flag, int check_keybuffer);
+extern int _mi_writeinfo(MI_INFO *info, uint options);
extern int _mi_test_if_changed(MI_INFO *info);
extern int _mi_mark_file_changed(MI_INFO *info);
extern int _mi_decrement_open_count(MI_INFO *info);
-extern int _mi_check_index(MI_INFO *info,int inx);
-extern int _mi_search(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *key,uint key_len,
- uint nextflag,my_off_t pos);
-extern int _mi_bin_search(struct st_myisam_info *info,MI_KEYDEF *keyinfo,
- uchar *page,uchar *key,uint key_len,uint comp_flag,
- uchar * *ret_pos,uchar *buff, my_bool *was_last_key);
-extern int _mi_seq_search(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *page,
- uchar *key,uint key_len,uint comp_flag,
- uchar **ret_pos,uchar *buff, my_bool *was_last_key);
-extern int _mi_prefix_search(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *page,
- uchar *key,uint key_len,uint comp_flag,
- uchar **ret_pos,uchar *buff, my_bool *was_last_key);
-extern my_off_t _mi_kpos(uint nod_flag,uchar *after_key);
-extern void _mi_kpointer(MI_INFO *info,uchar *buff,my_off_t pos);
-extern my_off_t _mi_dpos(MI_INFO *info, uint nod_flag,uchar *after_key);
+extern int _mi_check_index(MI_INFO *info, int inx);
+extern int _mi_search(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key,
+ uint key_len, uint nextflag, my_off_t pos);
+extern int _mi_bin_search(struct st_myisam_info *info, MI_KEYDEF *keyinfo,
+ uchar *page, uchar *key, uint key_len,
+ uint comp_flag, uchar **ret_pos, uchar *buff,
+ my_bool *was_last_key);
+extern int _mi_seq_search(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *page,
+ uchar *key, uint key_len, uint comp_flag,
+ uchar ** ret_pos, uchar *buff,
+ my_bool *was_last_key);
+extern int _mi_prefix_search(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *page,
+ uchar *key, uint key_len, uint comp_flag,
+ uchar ** ret_pos, uchar *buff,
+ my_bool *was_last_key);
+extern my_off_t _mi_kpos(uint nod_flag, uchar *after_key);
+extern void _mi_kpointer(MI_INFO *info, uchar *buff, my_off_t pos);
+extern my_off_t _mi_dpos(MI_INFO *info, uint nod_flag, uchar *after_key);
extern my_off_t _mi_rec_pos(MYISAM_SHARE *info, uchar *ptr);
-extern void _mi_dpointer(MI_INFO *info, uchar *buff,my_off_t pos);
-extern int ha_key_cmp(HA_KEYSEG *keyseg, uchar *a,uchar *b,
- uint key_length,uint nextflag,uint *diff_length);
-extern uint _mi_get_static_key(MI_KEYDEF *keyinfo,uint nod_flag,uchar * *page,
- uchar *key);
-extern uint _mi_get_pack_key(MI_KEYDEF *keyinfo,uint nod_flag,uchar * *page,
- uchar *key);
+extern void _mi_dpointer(MI_INFO *info, uchar *buff, my_off_t pos);
+extern int ha_key_cmp(HA_KEYSEG *keyseg, uchar *a, uchar *b,
+ uint key_length, uint nextflag, uint *diff_length);
+extern uint _mi_get_static_key(MI_KEYDEF *keyinfo, uint nod_flag,
+ uchar **page, uchar *key);
+extern uint _mi_get_pack_key(MI_KEYDEF *keyinfo, uint nod_flag, uchar **page,
+ uchar *key);
extern uint _mi_get_binary_pack_key(MI_KEYDEF *keyinfo, uint nod_flag,
- uchar **page_pos, uchar *key);
-extern uchar *_mi_get_last_key(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *keypos,
- uchar *lastkey,uchar *endpos,
- uint *return_key_length);
+ uchar ** page_pos, uchar *key);
+extern uchar *_mi_get_last_key(MI_INFO *info, MI_KEYDEF *keyinfo,
+ uchar *keypos, uchar *lastkey, uchar *endpos,
+ uint *return_key_length);
extern uchar *_mi_get_key(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *page,
- uchar *key, uchar *keypos, uint *return_key_length);
-extern uint _mi_keylength(MI_KEYDEF *keyinfo,uchar *key);
+ uchar *key, uchar *keypos,
+ uint *return_key_length);
+extern uint _mi_keylength(MI_KEYDEF *keyinfo, uchar *key);
extern uint _mi_keylength_part(MI_KEYDEF *keyinfo, register uchar *key,
- HA_KEYSEG *end);
-extern uchar *_mi_move_key(MI_KEYDEF *keyinfo,uchar *to,uchar *from);
-extern int _mi_search_next(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *key,
- uint key_length,uint nextflag,my_off_t pos);
-extern int _mi_search_first(MI_INFO *info,MI_KEYDEF *keyinfo,my_off_t pos);
-extern int _mi_search_last(MI_INFO *info,MI_KEYDEF *keyinfo,my_off_t pos);
-extern uchar *_mi_fetch_keypage(MI_INFO *info,MI_KEYDEF *keyinfo,my_off_t page,
- int level,uchar *buff,int return_buffer);
-extern int _mi_write_keypage(MI_INFO *info,MI_KEYDEF *keyinfo,my_off_t page,
- int level, uchar *buff);
-extern int _mi_dispose(MI_INFO *info,MI_KEYDEF *keyinfo,my_off_t pos,
- int level);
-extern my_off_t _mi_new(MI_INFO *info,MI_KEYDEF *keyinfo,int level);
-extern uint _mi_make_key(MI_INFO *info,uint keynr,uchar *key,
- const uchar *record,my_off_t filepos);
-extern uint _mi_pack_key(register MI_INFO *info, uint keynr, uchar *key,
+ HA_KEYSEG *end);
+extern uchar *_mi_move_key(MI_KEYDEF *keyinfo, uchar *to, uchar *from);
+extern int _mi_search_next(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key,
+ uint key_length, uint nextflag, my_off_t pos);
+extern int _mi_search_first(MI_INFO *info, MI_KEYDEF *keyinfo, my_off_t pos);
+extern int _mi_search_last(MI_INFO *info, MI_KEYDEF *keyinfo, my_off_t pos);
+extern uchar *_mi_fetch_keypage(MI_INFO *info, MI_KEYDEF *keyinfo,
+ my_off_t page, int level, uchar *buff,
+ int return_buffer);
+extern int _mi_write_keypage(MI_INFO *info, MI_KEYDEF *keyinfo, my_off_t page,
+ int level, uchar *buff);
+extern int _mi_dispose(MI_INFO *info, MI_KEYDEF *keyinfo, my_off_t pos,
+ int level);
+extern my_off_t _mi_new(MI_INFO *info, MI_KEYDEF *keyinfo, int level);
+extern uint _mi_make_key(MI_INFO *info, uint keynr, uchar *key,
+ const uchar *record, my_off_t filepos);
+extern uint _mi_pack_key(MI_INFO *info, uint keynr, uchar *key,
uchar *old, key_part_map keypart_map,
- HA_KEYSEG **last_used_keyseg);
-extern int _mi_read_key_record(MI_INFO *info,my_off_t filepos,uchar *buf);
-extern int _mi_read_cache(IO_CACHE *info,uchar *buff,my_off_t pos,
- uint length,int re_read_if_possibly);
-extern ulonglong retrieve_auto_increment(MI_INFO *info,const uchar *record);
+ HA_KEYSEG ** last_used_keyseg);
+extern int _mi_read_key_record(MI_INFO *info, my_off_t filepos, uchar *buf);
+extern int _mi_read_cache(IO_CACHE *info, uchar *buff, my_off_t pos,
+ uint length, int re_read_if_possibly);
+extern ulonglong retrieve_auto_increment(MI_INFO *info, const uchar *record);
-extern uchar *mi_alloc_rec_buff(MI_INFO *,ulong, uchar**);
+extern uchar *mi_alloc_rec_buff(MI_INFO *, ulong, uchar **);
#define mi_get_rec_buff_ptr(info,buf) \
((((info)->s->options & HA_OPTION_PACK_RECORD) && (buf)) ? \
(buf) - MI_REC_BUFF_OFFSET : (buf))
#define mi_get_rec_buff_len(info,buf) \
(*((uint32 *)(mi_get_rec_buff_ptr(info,buf))))
-extern ulong _mi_rec_unpack(MI_INFO *info,uchar *to,uchar *from,
- ulong reclength);
+extern ulong _mi_rec_unpack(MI_INFO *info, uchar *to, uchar *from,
+ ulong reclength);
extern my_bool _mi_rec_check(MI_INFO *info,const uchar *record, uchar *packpos,
ulong packed_length, my_bool with_checkum);
-extern int _mi_write_part_record(MI_INFO *info,my_off_t filepos,ulong length,
- my_off_t next_filepos,uchar **record,
- ulong *reclength,int *flag);
-extern void _mi_print_key(FILE *stream,HA_KEYSEG *keyseg,const uchar *key,
- uint length);
-extern my_bool _mi_read_pack_info(MI_INFO *info,pbool fix_keys);
-extern int _mi_read_pack_record(MI_INFO *info,my_off_t filepos,uchar *buf);
-extern int _mi_read_rnd_pack_record(MI_INFO*, uchar *,my_off_t, my_bool);
+extern int _mi_write_part_record(MI_INFO *info, my_off_t filepos, ulong length,
+ my_off_t next_filepos, uchar ** record,
+ ulong *reclength, int *flag);
+extern void _mi_print_key(FILE *stream, HA_KEYSEG *keyseg, const uchar *key,
+ uint length);
+extern my_bool _mi_read_pack_info(MI_INFO *info, pbool fix_keys);
+extern int _mi_read_pack_record(MI_INFO *info, my_off_t filepos, uchar *buf);
+extern int _mi_read_rnd_pack_record(MI_INFO *, uchar *, my_off_t, my_bool);
extern int _mi_pack_rec_unpack(MI_INFO *info, MI_BIT_BUFF *bit_buff,
uchar *to, uchar *from, ulong reclength);
-extern ulonglong mi_safe_mul(ulonglong a,ulonglong b);
+extern ulonglong mi_safe_mul(ulonglong a, ulonglong b);
extern int _mi_ft_update(MI_INFO *info, uint keynr, uchar *keybuf,
- const uchar *oldrec, const uchar *newrec, my_off_t pos);
+ const uchar *oldrec, const uchar *newrec,
+ my_off_t pos);
struct st_sort_info;
-typedef struct st_mi_block_info { /* Parameter to _mi_get_block_info */
+typedef struct st_mi_block_info /* Parameter to _mi_get_block_info */
+{
uchar header[MI_BLOCK_INFO_HEADER_LENGTH];
ulong rec_len;
ulong data_len;
@@ -655,35 +601,37 @@ typedef struct st_mi_block_info { /* Parameter to _mi_get_block_info */
uint offset;
} MI_BLOCK_INFO;
- /* bits in return from _mi_get_block_info */
-
-#define BLOCK_FIRST 1
-#define BLOCK_LAST 2
-#define BLOCK_DELETED 4
-#define BLOCK_ERROR 8 /* Wrong data */
-#define BLOCK_SYNC_ERROR 16 /* Right data at wrong place */
-#define BLOCK_FATAL_ERROR 32 /* hardware-error */
-
-#define NEED_MEM ((uint) 10*4*(IO_SIZE+32)+32) /* Nead for recursion */
-#define MAXERR 20
-#define BUFFERS_WHEN_SORTING 16 /* Alloc for sort-key-tree */
-#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE
-#define INDEX_TMP_EXT ".TMM"
-#define DATA_TMP_EXT ".TMD"
-
-#define UPDATE_TIME 1
-#define UPDATE_STAT 2
-#define UPDATE_SORT 4
-#define UPDATE_AUTO_INC 8
-#define UPDATE_OPEN_COUNT 16
-
-#define USE_BUFFER_INIT (((1024L*512L-MALLOC_OVERHEAD)/IO_SIZE)*IO_SIZE)
-#define READ_BUFFER_INIT (1024L*256L-MALLOC_OVERHEAD)
-#define SORT_BUFFER_INIT (2048L*1024L-MALLOC_OVERHEAD)
-#define MIN_SORT_BUFFER (4096-MALLOC_OVERHEAD)
-
-enum myisam_log_commands {
- MI_LOG_OPEN,MI_LOG_WRITE,MI_LOG_UPDATE,MI_LOG_DELETE,MI_LOG_CLOSE,MI_LOG_EXTRA,MI_LOG_LOCK,MI_LOG_DELETE_ALL
+ /* bits in return from _mi_get_block_info */
+
+#define BLOCK_FIRST 1
+#define BLOCK_LAST 2
+#define BLOCK_DELETED 4
+#define BLOCK_ERROR 8 /* Wrong data */
+#define BLOCK_SYNC_ERROR 16 /* Right data at wrong place */
+#define BLOCK_FATAL_ERROR 32 /* hardware-error */
+
+#define NEED_MEM ((uint) 10*4*(IO_SIZE+32)+32) /* Nead for recursion */
+#define MAXERR 20
+#define BUFFERS_WHEN_SORTING 16 /* Alloc for sort-key-tree */
+#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE
+#define INDEX_TMP_EXT ".TMM"
+#define DATA_TMP_EXT ".TMD"
+
+#define UPDATE_TIME 1
+#define UPDATE_STAT 2
+#define UPDATE_SORT 4
+#define UPDATE_AUTO_INC 8
+#define UPDATE_OPEN_COUNT 16
+
+#define USE_BUFFER_INIT (((1024L*512L-MALLOC_OVERHEAD)/IO_SIZE)*IO_SIZE)
+#define READ_BUFFER_INIT (1024L*256L-MALLOC_OVERHEAD)
+#define SORT_BUFFER_INIT (2048L*1024L-MALLOC_OVERHEAD)
+#define MIN_SORT_BUFFER (4096-MALLOC_OVERHEAD)
+
+enum myisam_log_commands
+{
+ MI_LOG_OPEN, MI_LOG_WRITE, MI_LOG_UPDATE, MI_LOG_DELETE, MI_LOG_CLOSE,
+ MI_LOG_EXTRA, MI_LOG_LOCK, MI_LOG_DELETE_ALL
};
#define myisam_log(a,b,c,d) if (myisam_log_file >= 0) _myisam_log(a,b,c,d)
@@ -693,44 +641,42 @@ enum myisam_log_commands {
#define fast_mi_writeinfo(INFO) if (!(INFO)->s->tot_locks) (void) _mi_writeinfo((INFO),0)
#define fast_mi_readinfo(INFO) ((INFO)->lock_type == F_UNLCK) && _mi_readinfo((INFO),F_RDLCK,1)
-#ifdef __cplusplus
+#ifdef __cplusplus
extern "C" {
#endif
-
-extern uint _mi_get_block_info(MI_BLOCK_INFO *,File, my_off_t);
-extern uint _mi_rec_pack(MI_INFO *info,uchar *to,const uchar *from);
+ extern uint _mi_get_block_info(MI_BLOCK_INFO *, File, my_off_t);
+extern uint _mi_rec_pack(MI_INFO *info, uchar *to, const uchar *from);
extern uint _mi_pack_get_block_info(MI_INFO *myisam, MI_BIT_BUFF *bit_buff,
MI_BLOCK_INFO *info, uchar **rec_buff_p,
File file, my_off_t filepos);
-extern void _my_store_blob_length(uchar *pos,uint pack_length,uint length);
-extern void _myisam_log(enum myisam_log_commands command,MI_INFO *info,
- const uchar *buffert,uint length);
+extern void _mi_store_blob_length(uchar *pos, uint pack_length, uint length);
+extern void _myisam_log(enum myisam_log_commands command, MI_INFO *info,
+ const uchar *buffert, uint length);
extern void _myisam_log_command(enum myisam_log_commands command,
- MI_INFO *info, const uchar *buffert,
- uint length, int result);
-extern void _myisam_log_record(enum myisam_log_commands command,MI_INFO *info,
- const uchar *record,my_off_t filepos,
- int result);
+ MI_INFO *info, const uchar *buffert,
+ uint length, int result);
+extern void _myisam_log_record(enum myisam_log_commands command, MI_INFO *info,
+ const uchar *record, my_off_t filepos,
+ int result);
extern void mi_report_error(int errcode, const char *file_name);
extern my_bool _mi_memmap_file(MI_INFO *info);
extern void _mi_unmap_file(MI_INFO *info);
extern uint save_pack_length(uint version, uchar *block_buff, ulong length);
-extern uint read_pack_length(uint version, const uchar *buf, ulong *length);
extern uint calc_pack_length(uint version, ulong length);
extern size_t mi_mmap_pread(MI_INFO *info, uchar *Buffer,
- size_t Count, my_off_t offset, myf MyFlags);
+ uint Count, my_off_t offset, myf MyFlags);
extern size_t mi_mmap_pwrite(MI_INFO *info, const uchar *Buffer,
- size_t Count, my_off_t offset, myf MyFlags);
+ uint Count, my_off_t offset, myf MyFlags);
extern size_t mi_nommap_pread(MI_INFO *info, uchar *Buffer,
- size_t Count, my_off_t offset, myf MyFlags);
+ uint Count, my_off_t offset, myf MyFlags);
extern size_t mi_nommap_pwrite(MI_INFO *info, const uchar *Buffer,
- size_t Count, my_off_t offset, myf MyFlags);
+ uint Count, my_off_t offset, myf MyFlags);
uint mi_state_info_write(File file, MI_STATE_INFO *state, uint pWrite);
uchar *mi_state_info_read(uchar *ptr, MI_STATE_INFO *state);
uint mi_state_info_read_dsk(File file, MI_STATE_INFO *state, my_bool pRead);
uint mi_base_info_write(File file, MI_BASE_INFO *base);
-uchar *my_n_base_info_read(uchar *ptr, MI_BASE_INFO *base);
+uchar *mi_n_base_info_read(uchar *ptr, MI_BASE_INFO *base);
int mi_keyseg_write(File file, const HA_KEYSEG *keyseg);
uchar *mi_keyseg_read(uchar *ptr, HA_KEYSEG *keyseg);
uint mi_keydef_write(File file, MI_KEYDEF *keydef);
@@ -742,23 +688,23 @@ uchar *mi_recinfo_read(uchar *ptr, MI_COLUMNDEF *recinfo);
extern int mi_disable_indexes(MI_INFO *info);
extern int mi_enable_indexes(MI_INFO *info);
extern int mi_indexes_are_disabled(MI_INFO *info);
-ulong _my_calc_total_blob_length(MI_INFO *info, const uchar *record);
+ulong _mi_calc_total_blob_length(MI_INFO *info, const uchar *record);
ha_checksum mi_checksum(MI_INFO *info, const uchar *buf);
ha_checksum mi_static_checksum(MI_INFO *info, const uchar *buf);
my_bool mi_check_unique(MI_INFO *info, MI_UNIQUEDEF *def, uchar *record,
- ha_checksum unique_hash, my_off_t pos);
+ ha_checksum unique_hash, my_off_t pos);
ha_checksum mi_unique_hash(MI_UNIQUEDEF *def, const uchar *buf);
int _mi_cmp_static_unique(MI_INFO *info, MI_UNIQUEDEF *def,
- const uchar *record, my_off_t pos);
+ const uchar *record, my_off_t pos);
int _mi_cmp_dynamic_unique(MI_INFO *info, MI_UNIQUEDEF *def,
- const uchar *record, my_off_t pos);
+ const uchar *record, my_off_t pos);
int mi_unique_comp(MI_UNIQUEDEF *def, const uchar *a, const uchar *b,
- my_bool null_are_equal);
-void mi_get_status(void* param, int concurrent_insert);
-void mi_update_status(void* param);
-void mi_restore_status(void* param);
-void mi_copy_status(void* to,void *from);
-my_bool mi_check_status(void* param);
+ my_bool null_are_equal);
+void mi_get_status(void *param, int concurrent_insert);
+void mi_update_status(void *param);
+void mi_restore_status(void *param);
+void mi_copy_status(void *to, void *from);
+my_bool mi_check_status(void *param);
void mi_disable_non_unique_index(MI_INFO *info, ha_rows rows);
extern MI_INFO *test_if_reopen(char *filename);
@@ -770,22 +716,14 @@ my_bool mi_dynmap_file(MI_INFO *info, my_off_t size);
void mi_remap_file(MI_INFO *info, my_off_t size);
/* Functions needed by mi_check */
-volatile int *killed_ptr(MI_CHECK *param);
-void mi_check_print_error _VARARGS((MI_CHECK *param, const char *fmt,...));
-void mi_check_print_warning _VARARGS((MI_CHECK *param, const char *fmt,...));
-void mi_check_print_info _VARARGS((MI_CHECK *param, const char *fmt,...));
-int flush_pending_blocks(MI_SORT_PARAM *param);
-int sort_ft_buf_flush(MI_SORT_PARAM *sort_param);
-int thr_write_keys(MI_SORT_PARAM *sort_param);
+volatile int *killed_ptr(HA_CHECK *param);
+void mi_check_print_error _VARARGS((HA_CHECK *param, const char *fmt, ...));
+void mi_check_print_warning _VARARGS((HA_CHECK *param, const char *fmt, ...));
+void mi_check_print_info _VARARGS((HA_CHECK *param, const char *fmt, ...));
#ifdef THREAD
pthread_handler_t thr_find_all_keys(void *arg);
#endif
-int flush_blocks(MI_CHECK *param, KEY_CACHE *key_cache, File file);
-
-int sort_write_record(MI_SORT_PARAM *sort_param);
-int _create_index_by_sort(MI_SORT_PARAM *info,my_bool no_messages, ulong);
-
+int flush_blocks(HA_CHECK *param, KEY_CACHE *key_cache, File file);
#ifdef __cplusplus
}
#endif
-
diff --git a/storage/myisam/myisamlog.c b/storage/myisam/myisamlog.c
index 6566a7a7a02..04c0d9543d7 100644
--- a/storage/myisam/myisamlog.c
+++ b/storage/myisam/myisamlog.c
@@ -808,7 +808,7 @@ static int find_record_with_key(struct file_info *file_info, uchar *record)
{
uint key;
MI_INFO *info=file_info->isam;
- uchar tmp_key[MI_MAX_KEY_BUFF];
+ uchar tmp_key[HA_MAX_KEY_BUFF];
for (key=0 ; key < info->s->base.keys ; key++)
{
diff --git a/storage/myisam/myisampack.c b/storage/myisam/myisampack.c
index 37428ddd279..841fb45c184 100644
--- a/storage/myisam/myisampack.c
+++ b/storage/myisam/myisampack.c
@@ -305,7 +305,7 @@ static void usage(void)
puts("and you are welcome to modify and redistribute it under the GPL license\n");
puts("Pack a MyISAM-table to take much less space.");
- puts("Keys are not updated, you must run myisamchk -rq on the datafile");
+ puts("Keys are not updated, you must run myisamchk -rq on the index (.MYI) file");
puts("afterwards to update the keys.");
puts("You should give the .MYI file as the filename argument.");
@@ -1008,7 +1008,7 @@ static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
/* Calculate pos, end_pos, and max_length for variable length fields. */
if (count->field_type == FIELD_BLOB)
{
- uint field_length=count->field_length -mi_portable_sizeof_char_ptr;
+ uint field_length=count->field_length -portable_sizeof_char_ptr;
ulong blob_length= _mi_calc_blob_length(field_length, start_pos);
memcpy_fixed((char*) &pos, start_pos+field_length,sizeof(char*));
end_pos=pos+blob_length;
@@ -2650,7 +2650,7 @@ static int compress_isam_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
case FIELD_BLOB:
{
ulong blob_length=_mi_calc_blob_length(field_length-
- mi_portable_sizeof_char_ptr,
+ portable_sizeof_char_ptr,
start_pos);
/* Empty blobs are encoded with a single 1 bit. */
if (!blob_length)
@@ -2667,7 +2667,7 @@ static int compress_isam_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
blob_length, count->length_bits));
write_bits(blob_length,count->length_bits);
- memcpy_fixed(&blob,end_pos-mi_portable_sizeof_char_ptr,
+ memcpy_fixed(&blob,end_pos-portable_sizeof_char_ptr,
sizeof(char*));
blob_end=blob+blob_length;
/* Encode the blob bytes. */
diff --git a/storage/myisam/rt_index.c b/storage/myisam/rt_index.c
index 63ed60586d6..35a70b8c2bf 100644
--- a/storage/myisam/rt_index.c
+++ b/storage/myisam/rt_index.c
@@ -542,7 +542,7 @@ static int rtree_insert_req(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key,
DBUG_ENTER("rtree_insert_req");
if (!(page_buf = (uchar*)my_alloca((uint)keyinfo->block_length +
- MI_MAX_KEY_BUFF)))
+ HA_MAX_KEY_BUFF)))
{
my_errno = HA_ERR_OUT_OF_MEM;
DBUG_RETURN(-1); /* purecov: inspected */
@@ -658,7 +658,7 @@ static int rtree_insert_level(MI_INFO *info, uint keynr, uchar *key,
DBUG_PRINT("rtree", ("root was split, grow a new root"));
if (!(new_root_buf = (uchar*)my_alloca((uint)keyinfo->block_length +
- MI_MAX_KEY_BUFF)))
+ HA_MAX_KEY_BUFF)))
{
my_errno = HA_ERR_OUT_OF_MEM;
DBUG_RETURN(-1); /* purecov: inspected */
diff --git a/storage/myisam/sort.c b/storage/myisam/sort.c
index 2146a8d16cb..f11014570e6 100644
--- a/storage/myisam/sort.c
+++ b/storage/myisam/sort.c
@@ -15,7 +15,7 @@
/*
Creates a index for a database by reading keys, sorting them and outputing
- them in sorted order through SORT_INFO functions.
+ them in sorted order through MI_SORT_INFO functions.
*/
#include "fulltext.h"
@@ -487,8 +487,8 @@ ok:
int thr_write_keys(MI_SORT_PARAM *sort_param)
{
- SORT_INFO *sort_info=sort_param->sort_info;
- MI_CHECK *param=sort_info->param;
+ MI_SORT_INFO *sort_info=sort_param->sort_info;
+ HA_CHECK *param=sort_info->param;
ulong length, keys;
ulong *rec_per_key_part=param->rec_per_key_part;
int got_error=sort_info->got_error;
@@ -918,7 +918,7 @@ merge_buffers(MI_SORT_PARAM *info, uint keys, IO_CACHE *from_file,
for (buffpek= Fb ; buffpek <= Tb ; buffpek++)
{
count+= buffpek->count;
- buffpek->base= strpos;
+ buffpek->base= (uchar*) strpos;
buffpek->max_keys=maxcount;
strpos+= (uint) (error=(int) info->read_to_buffer(from_file,buffpek,
sort_length));
@@ -956,7 +956,7 @@ merge_buffers(MI_SORT_PARAM *info, uint keys, IO_CACHE *from_file,
{
if (!(error=(int) info->read_to_buffer(from_file,buffpek,sort_length)))
{
- uchar *base=buffpek->base;
+ uchar *base= buffpek->base;
uint max_keys=buffpek->max_keys;
VOID(queue_remove(&queue,0));
@@ -988,7 +988,7 @@ merge_buffers(MI_SORT_PARAM *info, uint keys, IO_CACHE *from_file,
}
}
buffpek=(BUFFPEK*) queue_top(&queue);
- buffpek->base=(uchar *) sort_keys;
+ buffpek->base= (uchar*) sort_keys;
buffpek->max_keys=keys;
do
{
@@ -1003,7 +1003,7 @@ merge_buffers(MI_SORT_PARAM *info, uint keys, IO_CACHE *from_file,
else
{
register uchar *end;
- strpos= buffpek->key;
+ strpos= (uchar*) buffpek->key;
for (end=strpos+buffpek->mem_count*sort_length;
strpos != end ;
strpos+=sort_length)
diff --git a/storage/myisam/sp_test.c b/storage/myisam/sp_test.c
index dee32ba423e..f572c7ab19b 100644
--- a/storage/myisam/sp_test.c
+++ b/storage/myisam/sp_test.c
@@ -79,7 +79,7 @@ int run_test(const char *filename)
/* Define spatial column */
recinfo[1].type=FIELD_BLOB;
- recinfo[1].length=4 + mi_portable_sizeof_char_ptr;
+ recinfo[1].length=4 + portable_sizeof_char_ptr;
diff --git a/storage/myisammrg/ha_myisammrg.cc b/storage/myisammrg/ha_myisammrg.cc
index 8a914e8a2de..b8119466d11 100644
--- a/storage/myisammrg/ha_myisammrg.cc
+++ b/storage/myisammrg/ha_myisammrg.cc
@@ -48,9 +48,11 @@ static const char *ha_myisammrg_exts[] = {
};
extern int table2myisam(TABLE *table_arg, MI_KEYDEF **keydef_out,
MI_COLUMNDEF **recinfo_out, uint *records_out);
-extern int check_definition(MI_KEYDEF *t1_keyinfo, MI_COLUMNDEF *t1_recinfo,
+extern int check_definition(MI_KEYDEF *t1_keyinfo,
+ MI_COLUMNDEF *t1_recinfo,
uint t1_keys, uint t1_recs,
- MI_KEYDEF *t2_keyinfo, MI_COLUMNDEF *t2_recinfo,
+ MI_KEYDEF *t2_keyinfo,
+ MI_COLUMNDEF *t2_recinfo,
uint t2_keys, uint t2_recs, bool strict);
static void split_file_name(const char *file_name,
LEX_STRING *db, LEX_STRING *name);
@@ -390,7 +392,8 @@ int ha_myisammrg::extra(enum ha_extra_function operation)
/* As this is just a mapping, we don't have to force the underlying
tables to be closed */
if (operation == HA_EXTRA_FORCE_REOPEN ||
- operation == HA_EXTRA_PREPARE_FOR_DELETE)
+ operation == HA_EXTRA_PREPARE_FOR_DROP ||
+ operation == HA_EXTRA_PREPARE_FOR_RENAME)
return 0;
return myrg_extra(file,operation,0);
}
diff --git a/storage/myisammrg/ha_myisammrg.h b/storage/myisammrg/ha_myisammrg.h
index 91aabe277f7..1207ca96851 100644
--- a/storage/myisammrg/ha_myisammrg.h
+++ b/storage/myisammrg/ha_myisammrg.h
@@ -47,8 +47,8 @@ class ha_myisammrg: public handler
HA_READ_ORDER | HA_KEYREAD_ONLY);
}
uint max_supported_keys() const { return MI_MAX_KEY; }
- uint max_supported_key_length() const { return MI_MAX_KEY_LENGTH; }
- uint max_supported_key_part_length() const { return MI_MAX_KEY_LENGTH; }
+ uint max_supported_key_length() const { return HA_MAX_KEY_LENGTH; }
+ uint max_supported_key_part_length() const { return HA_MAX_KEY_LENGTH; }
double scan_time()
{ return ulonglong2double(stats.data_file_length) / IO_SIZE + file->tables; }