summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorunknown <guilhem@gbichot4.local>2007-09-15 14:45:26 +0200
committerunknown <guilhem@gbichot4.local>2007-09-15 14:45:26 +0200
commit9c2ff270fa725954d91f6f3d13b0aeb9b3960f47 (patch)
treeaffec05f6a80f5455080062cb4cf70f37abb9b4c
parent8829fa646403e87c6b1f26edeca5f91c021eade0 (diff)
downloadmariadb-git-9c2ff270fa725954d91f6f3d13b0aeb9b3960f47.tar.gz
WL#3072 Maria Recovery
* recovery from ha_maria now skips replaying DDLs (too dangerous) * maria_read_log still replays DDLs, print warning about issues * fixes to replaying of REDO_RENAME * don't replay DDLs on corrupted tables (safer) * print a one-line message when really doing a recovery (applies to ha_maria, not maria_read_log) i.e. some REDOs or UNDOs are read. storage/maria/ma_checkpoint.c: fix for assertion failure storage/maria/ma_recovery.c: * Recovery from ha_maria now skips replaying DDLs (as the initial plan said) as this is unsafe in case of crashes during the DDL; applying the records may do harm (destroy important files) so we prefer to leave the "mess" of files untouched. A proper recovery of DDLs requires very careful thinking, probably testing separately the existence of the data and index file instead of using maria_open() which tests the existence of both, and maybe storing create_rename_lsn in the data file too. * maria_read_log still replays DDLs, we print a warning about dangers (due to ALTER TABLE not logging insertions into the tmp table; we will maybe need an option to have logging of those insertions). * fixes to replaying of REDO_RENAME (test create_rename_lsn of 'new_name' table if it exists; if that table exists and is more recent than the record, remove the 'old_name' table). * don't replay DDLs on corrupted tables (play safe) * fail also in non-debug builds if table is open when it should not be (when creating it for example, it should not be already open). * when the trace file is not stdout (i.e. when this is ha_maria), if really doing a recovery (reading REDOs or UNDOs), print a one-line message to stderr to inform about start and end of recovery (useful to know what mysqld is doing, especially if it takes long or crashes). storage/maria/ma_recovery.h: parameter to replay DDLs or not storage/maria/maria_read_log.c: replay DDLs in maria_read_log, to be able to recreate tables from scratch.
-rw-r--r--storage/maria/ma_checkpoint.c8
-rw-r--r--storage/maria/ma_recovery.c240
-rw-r--r--storage/maria/ma_recovery.h2
-rw-r--r--storage/maria/maria_read_log.c2
4 files changed, 227 insertions, 25 deletions
diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c
index 8c3f2c0a2e2..aa291fe6c97 100644
--- a/storage/maria/ma_checkpoint.c
+++ b/storage/maria/ma_checkpoint.c
@@ -123,7 +123,13 @@ int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait)
int result= 0;
DBUG_ENTER("ma_checkpoint_execute");
- DBUG_ASSERT(checkpoint_inited);
+ if (!checkpoint_inited)
+ {
+ /*
+ If ha_maria failed to start, maria_panic_hton is called, we come here.
+ */
+ DBUG_RETURN(0);
+ }
DBUG_ASSERT(level > CHECKPOINT_NONE);
/* look for already running checkpoints */
diff --git a/storage/maria/ma_recovery.c b/storage/maria/ma_recovery.c
index 0f831ae63b1..2f951b0b776 100644
--- a/storage/maria/ma_recovery.c
+++ b/storage/maria/ma_recovery.c
@@ -49,6 +49,7 @@ static LSN current_group_end_lsn,
checkpoint_start= LSN_IMPOSSIBLE;
static TrID max_long_trid= 0; /**< max long trid seen by REDO phase */
static FILE *tracef; /**< trace file for debugging */
+static my_bool skip_DDLs; /**< if REDO phase should skip DDL records */
#define prototype_redo_exec_hook(R) \
static int exec_REDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec)
@@ -117,7 +118,23 @@ static void enlarge_buffer(const TRANSLOG_HEADER_BUFFER *rec)
MYF(MY_WME | MY_ALLOW_ZERO_PTR));
}
}
-
+static my_bool recovery_message_printed;
+static inline void print_recovery_message()
+{
+ /*
+ If we're really doing a recovery (reading REDOs or UNDOs), we print a
+ one-line message when we start it and when we end it. It goes to stderr,
+ not tracef, so that it is visible in the error log (soon we should maybe
+ use sql_print_error). We don't print if if tracef is stdout as stdout will
+ be seen by the user and thus convey sufficient info already.
+ */
+ if (!recovery_message_printed && (tracef != stdout))
+ {
+ recovery_message_printed= TRUE;
+ /** @todo RECOVERY BUG all prints to stderr should go to error log */
+ fprintf(stderr, "Maria engine: starting recovery\n");
+ }
+}
#define ALERT_USER() DBUG_ASSERT(0)
@@ -147,7 +164,7 @@ int maria_recover()
{
fprintf(trace_file, "TRACE of the last MARIA recovery from mysqld\n");
DBUG_ASSERT(maria_pagecache->inited);
- res= maria_apply_log(LSN_IMPOSSIBLE, TRUE, trace_file, TRUE);
+ res= maria_apply_log(LSN_IMPOSSIBLE, TRUE, trace_file, TRUE, TRUE);
if (!res)
fprintf(trace_file, "SUCCESS\n");
fclose(trace_file);
@@ -164,6 +181,8 @@ int maria_recover()
LSN_IMPOSSIBLE means "use last checkpoint"
@param apply if log records should be applied or not
@param trace_file trace file where progress/debug messages will go
+ @param skip_DDLs Should DDL records (CREATE/RENAME/DROP/REPAIR)
+ be skipped by the REDO phase or not
@todo This trace_file thing is primitive; soon we will make it similar to
ma_check_print_warning() etc, and a successful recovery does not need to
@@ -175,7 +194,7 @@ int maria_recover()
*/
int maria_apply_log(LSN from_lsn, my_bool apply, FILE *trace_file,
- my_bool should_run_undo_phase)
+ my_bool should_run_undo_phase, my_bool skip_DDLs_arg)
{
int error= 0;
uint unfinished_trans;
@@ -192,7 +211,33 @@ int maria_apply_log(LSN from_lsn, my_bool apply, FILE *trace_file,
if (!all_active_trans || !all_tables)
goto err;
+ recovery_message_printed= FALSE;
tracef= trace_file;
+ if (!(skip_DDLs= skip_DDLs_arg))
+ {
+ /*
+ Example of what can go wrong when replaying DDLs:
+ CREATE TABLE t (logged); INSERT INTO t VALUES(1) (logged);
+ ALTER TABLE t ... which does
+ CREATE a temporary table #sql... (logged)
+ INSERT data from t into #sql... (not logged)
+ RENAME #sql TO t (logged)
+ Removing tables by hand and replaying the log will leave in the
+ end an empty table "t": missing records. If after the RENAME an INSERT
+ into t was done, that row had number 1 in its page, executing the
+ REDO_INSERT_ROW_HEAD on the recreated empty t will fail (assertion
+ failure in _ma_apply_redo_insert_row_head_or_tail(): new data page is
+ created whereas rownr is not 0).
+ Another issue is that replaying of DDLs is not correct enough to work if
+ there was a crash during a DDL (see comment in execution of
+ REDO_RENAME_TABLE ).
+ */
+ fprintf(tracef, "WARNING: MySQL server currently disables log records"
+ " about insertion of data by ALTER TABLE"
+ " (copy_data_between_tables()), applying of log records may"
+ " well not work. Additionally, applying of DDL records will"
+ " cause damage if there are tables left by a crash of a DDL.\n");
+ }
if (from_lsn == LSN_IMPOSSIBLE)
{
@@ -245,6 +290,8 @@ int maria_apply_log(LSN from_lsn, my_bool apply, FILE *trace_file,
goto err;
/* If inside ha_maria, a checkpoint will soon be taken and save our work */
+ if (recovery_message_printed && (tracef != stdout))
+ fprintf(stderr, "Maria engine: finished recovery\n");
goto end;
err:
error= 1;
@@ -365,6 +412,11 @@ prototype_redo_exec_hook(REDO_CREATE_TABLE)
uint flags;
int error= 1, create_mode= O_RDWR | O_TRUNC;
MARIA_HA *info= NULL;
+ if (skip_DDLs)
+ {
+ fprintf(tracef, "we skip DDLs\n");
+ return 0;
+ }
enlarge_buffer(rec);
if (log_record_buffer.str == NULL ||
translog_read_record(rec->lsn, 0, rec->record_length,
@@ -382,7 +434,12 @@ prototype_redo_exec_hook(REDO_CREATE_TABLE)
{
MARIA_SHARE *share= info->s;
/* check that we're not already using it */
- DBUG_ASSERT(share->reopen == 1);
+ if (share->reopen != 1)
+ {
+ fprintf(tracef, ", is already open (reopen=%u)\n", share->reopen);
+ ALERT_USER();
+ goto end;
+ }
DBUG_ASSERT(share->now_transactional == share->base.born_transactional);
if (!share->base.born_transactional)
{
@@ -391,7 +448,7 @@ prototype_redo_exec_hook(REDO_CREATE_TABLE)
one was renamed to its name, thus create_rename_lsn is 0 and should
not be trusted.
*/
- fprintf(tracef, ", is not transactional\n");
+ fprintf(tracef, ", is not transactional, ignoring creation\n");
ALERT_USER();
error= 0;
goto end;
@@ -406,13 +463,16 @@ prototype_redo_exec_hook(REDO_CREATE_TABLE)
}
if (maria_is_crashed(info))
{
- fprintf(tracef, ", is crashed, overwriting it");
+ fprintf(tracef, ", is crashed, can't recreate it");
ALERT_USER();
+ goto end;
}
maria_close(info);
info= NULL;
}
- /* if does not exist, is older, or its header is corrupted, overwrite it */
+ else /* one or two files absent, or header corrupted... */
+ fprintf(tracef, "can't be opened, probably does not exist");
+ /* if does not exist, or is older, overwrite it */
/** @todo symlinks */
ptr= name + strlen(name) + 1;
if ((flags= ptr[0] ? HA_DONT_TOUCH_DATA : 0))
@@ -490,6 +550,11 @@ prototype_redo_exec_hook(REDO_RENAME_TABLE)
char *old_name, *new_name;
int error= 1;
MARIA_HA *info= NULL;
+ if (skip_DDLs)
+ {
+ fprintf(tracef, "we skip DDLs\n");
+ return 0;
+ }
enlarge_buffer(rec);
if (log_record_buffer.str == NULL ||
translog_read_record(rec->lsn, 0, rec->record_length,
@@ -501,7 +566,36 @@ prototype_redo_exec_hook(REDO_RENAME_TABLE)
}
old_name= log_record_buffer.str;
new_name= old_name + strlen(old_name) + 1;
- fprintf(tracef, "Table '%s' to rename to '%s'", old_name, new_name);
+ fprintf(tracef, "Table '%s' to rename to '%s'; old-name table ", old_name,
+ new_name);
+ /*
+ Here is why we skip CREATE/DROP/RENAME when doing a recovery from
+ ha_maria (whereas we do when called from maria_read_log). Consider:
+ CREATE TABLE t;
+ RENAME TABLE t to u;
+ DROP TABLE u;
+ RENAME TABLE v to u; # crash between index rename and data rename.
+ And do a Recovery (not removing tables beforehand).
+ Recovery replays CREATE, then RENAME: the maria_open("t") works,
+ maria_open("u") does not (no data file) so table "u" is considered
+ inexistent and so maria_rename() is done which overwrites u's index file,
+ which is lost. Ok, the data file (v.MAD) is still available, but only a
+ REPAIR USE_FRM can rebuild the index, which is unsafe and downtime.
+ So it is preferrable to not execute RENAME, and leave the "mess" of files,
+ rather than possibly destroy a file. DBA will manually rename files.
+ A safe recovery method would probably require checking the existence of
+ the index file and of the data file separately (not via maria_open()), and
+ maybe also to store a create_rename_lsn in the data file too
+ For now, all we risk is to leave the mess (half-renamed files) left by the
+ crash. We however sync files and directories at each file rename. The SQL
+ layer is anyway not crash-safe for DDLs (except the repartioning-related
+ ones).
+ We replay DDLs in maria_read_log to be able to recreate tables from
+ scratch. It means that "maria_read_log -a" should not be used on a
+ database which just crashed during a DDL. And also ALTER TABLE does not
+ log insertions of records into the temporary table, so replaying may
+ fail (see comment and warning in maria_apply_log()).
+ */
info= maria_open(old_name, O_RDONLY, HA_OPEN_FOR_REPAIR);
if (info)
{
@@ -512,7 +606,7 @@ prototype_redo_exec_hook(REDO_RENAME_TABLE)
*/
if (!share->base.born_transactional)
{
- fprintf(tracef, ", is not transactional\n");
+ fprintf(tracef, ", is not transactional, ignoring renaming\n");
ALERT_USER();
error= 0;
goto end;
@@ -540,7 +634,76 @@ prototype_redo_exec_hook(REDO_RENAME_TABLE)
maria_close(info))
goto end;
info= NULL;
+ fprintf(tracef, ", is ok for renaming; new-name table ");
+ }
+ else /* one or two files absent, or header corrupted... */
+ {
+ fprintf(tracef, ", can't be opened, probably does not exist");
+ error= 0;
+ goto end;
}
+ /*
+ We must also check the create_rename_lsn of the 'new_name' table if it
+ exists: otherwise we may, with our rename which overwrites, destroy
+ another table. For example:
+ CREATE TABLE t;
+ RENAME t to u;
+ DROP TABLE u;
+ RENAME v to u; # v is an old table, its creation/insertions not in log
+ And start executing the log (without removing tables beforehand): creates
+ t, renames it to u (if not testing create_rename_lsn) thus overwriting
+ old-named v, drops u, and we are stuck, we have lost data.
+ */
+ info= maria_open(new_name, O_RDONLY, HA_OPEN_FOR_REPAIR);
+ if (info)
+ {
+ MARIA_SHARE *share= info->s;
+ /* We should not have open instances on this table. */
+ if (share->reopen != 1)
+ {
+ fprintf(tracef, ", is already open (reopen=%u)\n", share->reopen);
+ ALERT_USER();
+ goto end;
+ }
+ if (!share->base.born_transactional)
+ {
+ fprintf(tracef, ", is not transactional, ignoring renaming\n");
+ ALERT_USER();
+ goto drop;
+ }
+ if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0)
+ {
+ fprintf(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than"
+ " record, ignoring renaming",
+ LSN_IN_PARTS(share->state.create_rename_lsn));
+ /*
+ We have to drop the old_name table. Consider:
+ CREATE TABLE t;
+ CREATE TABLE v;
+ RENAME TABLE t to u;
+ DROP TABLE u;
+ RENAME TABLE v to u;
+ and apply the log without removing tables beforehand. t will be
+ created, v too; in REDO_RENAME u will be more recent, but we still
+ have to drop t otherwise it stays.
+ */
+ goto drop;
+ }
+ if (maria_is_crashed(info))
+ {
+ fprintf(tracef, ", is crashed, can't rename it");
+ ALERT_USER();
+ goto end;
+ }
+ if (maria_close(info))
+ goto end;
+ info= NULL;
+ /* abnormal situation */
+ fprintf(tracef, ", exists but is older than record, can't rename it");
+ goto end;
+ }
+ else /* one or two files absent, or header corrupted... */
+ fprintf(tracef, ", can't be opened, probably does not exist");
fprintf(tracef, ", renaming '%s'", old_name);
if (maria_rename(old_name, new_name))
{
@@ -559,6 +722,16 @@ prototype_redo_exec_hook(REDO_RENAME_TABLE)
goto end;
info= NULL;
error= 0;
+ goto end;
+drop:
+ fprintf(tracef, ", only dropping '%s'", old_name);
+ if (maria_delete_table(old_name))
+ {
+ fprintf(tracef, "Failed to drop table\n");
+ goto end;
+ }
+ error= 0;
+ goto end;
end:
fprintf(tracef, "\n");
if (info != NULL)
@@ -573,8 +746,17 @@ end:
prototype_redo_exec_hook(REDO_REPAIR_TABLE)
{
int error= 1;
- MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
- if (info == NULL)
+ MARIA_HA *info;
+ if (skip_DDLs)
+ {
+ /*
+ REPAIR is not exactly a DDL, but it manipulates files without logging
+ insertions into them.
+ */
+ fprintf(tracef, "we skip DDLs\n");
+ return 0;
+ }
+ if ((info= get_MARIA_HA_from_REDO_record(rec)) == NULL)
return 0;
/*
Otherwise, the mapping is newer than the table, and our record is newer
@@ -610,6 +792,11 @@ prototype_redo_exec_hook(REDO_DROP_TABLE)
char *name;
int error= 1;
MARIA_HA *info= NULL;
+ if (skip_DDLs)
+ {
+ fprintf(tracef, "we skip DDLs\n");
+ return 0;
+ }
enlarge_buffer(rec);
if (log_record_buffer.str == NULL ||
translog_read_record(rec->lsn, 0, rec->record_length,
@@ -631,7 +818,7 @@ prototype_redo_exec_hook(REDO_DROP_TABLE)
*/
if (!share->base.born_transactional)
{
- fprintf(tracef, ", is not transactional\n");
+ fprintf(tracef, ", is not transactional, ignoring removal\n");
ALERT_USER();
error= 0;
goto end;
@@ -646,8 +833,9 @@ prototype_redo_exec_hook(REDO_DROP_TABLE)
}
if (maria_is_crashed(info))
{
- fprintf(tracef, ", is crashed, dropping it");
+ fprintf(tracef, ", is crashed, can't drop it");
ALERT_USER();
+ goto end;
}
/*
This maria_extra() call serves to signal that old open instances of
@@ -658,14 +846,16 @@ prototype_redo_exec_hook(REDO_DROP_TABLE)
maria_close(info))
goto end;
info= NULL;
+ /* if it is older, or its header is corrupted, drop it */
+ fprintf(tracef, ", dropping '%s'", name);
+ if (maria_delete_table(name))
+ {
+ fprintf(tracef, "Failed to drop table\n");
+ goto end;
+ }
}
- /* if does not exist, is older, or its header is corrupted, drop it */
- fprintf(tracef, ", dropping '%s'", name);
- if (maria_delete_table(name))
- {
- fprintf(tracef, "Failed to drop table\n");
- goto end;
- }
+ else /* one or two files absent, or header corrupted... */
+ fprintf(tracef,", can't be opened, probably does not exist");
error= 0;
end:
fprintf(tracef, "\n");
@@ -753,7 +943,12 @@ static int new_table(uint16 sid, const char *name,
}
MARIA_SHARE *share= info->s;
/* check that we're not already using it */
- DBUG_ASSERT(share->reopen == 1);
+ if (share->reopen != 1)
+ {
+ fprintf(tracef, ", is already open (reopen=%u)\n", share->reopen);
+ ALERT_USER();
+ goto end;
+ }
DBUG_ASSERT(share->now_transactional == share->base.born_transactional);
if (!share->base.born_transactional)
{
@@ -1294,7 +1489,6 @@ static int run_redo_phase(LSN lsn, my_bool apply)
uint16 sid= rec.short_trid;
const LOG_DESC *log_desc= &log_record_type_descriptor[rec.type];
display_record_position(log_desc, &rec, i);
-
/*
A complete group is a set of log records with an "end mark" record
(e.g. a set of REDOs for an operation, terminated by an UNDO for this
@@ -1572,6 +1766,7 @@ static MARIA_HA *get_MARIA_HA_from_REDO_record(const
MARIA_HA *info;
char llbuf[22];
+ print_recovery_message();
sid= fileid_korr(rec->header);
page= page_korr(rec->header + FILEID_STORE_SIZE);
/**
@@ -1643,6 +1838,7 @@ static MARIA_HA *get_MARIA_HA_from_UNDO_record(const
uint16 sid;
MARIA_HA *info;
+ print_recovery_message();
sid= fileid_korr(rec->header + LSN_STORE_SIZE);
fprintf(tracef, " For table of short id %u", sid);
info= all_tables[sid].info;
diff --git a/storage/maria/ma_recovery.h b/storage/maria/ma_recovery.h
index 9a5a2b3099e..6a4b359be4d 100644
--- a/storage/maria/ma_recovery.h
+++ b/storage/maria/ma_recovery.h
@@ -26,5 +26,5 @@
C_MODE_START
int maria_recover();
int maria_apply_log(LSN lsn, my_bool apply, FILE *trace_file,
- my_bool execute_undo_phase);
+ my_bool execute_undo_phase, my_bool skip_DDLs);
C_MODE_END
diff --git a/storage/maria/maria_read_log.c b/storage/maria/maria_read_log.c
index dc537695739..a7a6370b1c4 100644
--- a/storage/maria/maria_read_log.c
+++ b/storage/maria/maria_read_log.c
@@ -101,7 +101,7 @@ int main(int argc, char **argv)
fprintf(stdout, "TRACE of the last maria_read_log\n");
if (maria_apply_log(lsn, opt_display_and_apply, stdout,
- opt_display_and_apply))
+ opt_display_and_apply, FALSE))
goto err;
fprintf(stdout, "%s: SUCCESS\n", my_progname);