diff options
author | Alfranio Correia <alfranio.correia@sun.com> | 2009-09-29 15:40:52 +0100 |
---|---|---|
committer | Alfranio Correia <alfranio.correia@sun.com> | 2009-09-29 15:40:52 +0100 |
commit | 3ab71376ceb2d5da81d3b6fb092630d0b0929d76 (patch) | |
tree | 5227fe1804bac85dca21ad7baf732306e7d30679 /sql | |
parent | 0110bd04d24503d84df93d31b444586c4137c98c (diff) | |
download | mariadb-git-3ab71376ceb2d5da81d3b6fb092630d0b0929d76.tar.gz |
BUG#40337 Fsyncing master and relay log to disk after every event is too slow
NOTE: Backporting the patch to next-mr.
The fix proposed in BUG#35542 and BUG#31665 introduces a performance issue
when fsyncing the master.info, relay.info and relay-log.bin* after every #th event.
Although that solution was proposed to reduce the probability of corrupted
files after a slave crash, the performance penalty it introduces has
made the approach impractical for highly intensive workloads.
In a nutshell, the option --sync-relay-log proposed in BUG#35542 and BUG#31665
simultaneously fsyncs master.info, relay-log.info and relay-log.bin*, and
this is the main source of the performance issues.
This patch introduces new options that give more control to the user on
what should be fsynced and how often:
1) (--sync-master-info, integer) which syncs the master.info after every #th event;
2) (--sync-relay-log, integer) which syncs the relay-log.bin* after every #th
event;
3) (--sync-relay-log-info, integer) which syncs the relay.info after every #th
transaction.
To provide both performance and increased reliability, we recommend the following
setup:
1) --sync-master-info = 0 eventually the operating system will fsync it;
2) --sync-relay-log = 0 eventually the operating system will fsync it;
3) --sync-relay-log-info = 1 fsyncs it after every transaction;
Note that the setup above does not reduce the probability of
corrupted master.info and relay-log.bin*. To overcome this issue, the patch also
introduces a recovery mechanism that, right after restart, throws away the relay-log.bin*
files retrieved from the master and updates the master.info based on the relay.info:
4) (--relay-log-recovery, boolean) which enables a recovery mechanism that
throws away relay-log.bin* after a crash.
However, it can only recover an incorrect binlog file and position in master.info.
If other information (host, port, password, etc.) is corrupted or incorrect,
this recovery mechanism will fail to work.
Diffstat (limited to 'sql')
-rw-r--r-- | sql/mysql_priv.h | 4 | ||||
-rw-r--r-- | sql/mysqld.cc | 26 | ||||
-rw-r--r-- | sql/rpl_mi.cc | 17 | ||||
-rw-r--r-- | sql/rpl_mi.h | 9 | ||||
-rw-r--r-- | sql/rpl_rli.cc | 8 | ||||
-rw-r--r-- | sql/rpl_rli.h | 15 | ||||
-rw-r--r-- | sql/set_var.cc | 8 | ||||
-rw-r--r-- | sql/set_var.h | 4 | ||||
-rw-r--r-- | sql/slave.cc | 102 | ||||
-rw-r--r-- | sql/sql_binlog.cc | 2 | ||||
-rw-r--r-- | sql/sql_repl.cc | 12 |
11 files changed, 175 insertions, 32 deletions
diff --git a/sql/mysql_priv.h b/sql/mysql_priv.h index 669942cc691..435513832d0 100644 --- a/sql/mysql_priv.h +++ b/sql/mysql_priv.h @@ -1869,10 +1869,12 @@ extern ulong MYSQL_PLUGIN_IMPORT specialflag; #ifdef MYSQL_SERVER extern ulong current_pid; extern ulong expire_logs_days; -extern uint sync_binlog_period, sync_relaylog_period; +extern uint sync_binlog_period, sync_relaylog_period, + sync_relayloginfo_period, sync_masterinfo_period; extern ulong opt_tc_log_size, tc_log_max_pages_used, tc_log_page_size; extern ulong tc_log_page_waits; extern my_bool relay_log_purge, opt_innodb_safe_binlog, opt_innodb; +extern my_bool relay_log_recovery; extern uint test_flags,select_errors,ha_open_options; extern uint protocol_version, mysqld_port, dropping_tables; extern uint delay_key_write_options; diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 8febc0bb7e5..b8d09fd4e5a 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -477,6 +477,7 @@ extern const char *opt_ndb_distribution; extern enum ndb_distribution opt_ndb_distribution_id; #endif my_bool opt_readonly, use_temp_pool, relay_log_purge; +my_bool relay_log_recovery; my_bool opt_sync_frm, opt_allow_suspicious_udfs; my_bool opt_secure_auth= 0; char* opt_secure_file_priv= 0; @@ -553,7 +554,8 @@ ulong max_prepared_stmt_count; ulong prepared_stmt_count=0; ulong thread_id=1L,current_pid; ulong slow_launch_threads = 0; -uint sync_binlog_period= 0, sync_relaylog_period= 0; +uint sync_binlog_period= 0, sync_relaylog_period= 0, + sync_relayloginfo_period= 0, sync_masterinfo_period= 0; ulong expire_logs_days = 0; ulong rpl_recovery_rank=0; const char *log_output_str= "FILE"; @@ -5605,6 +5607,7 @@ enum options_mysqld OPT_QUERY_CACHE_TYPE, OPT_QUERY_CACHE_WLOCK_INVALIDATE, OPT_RECORD_BUFFER, OPT_RECORD_RND_BUFFER, OPT_DIV_PRECINCREMENT, OPT_RELAY_LOG_SPACE_LIMIT, OPT_RELAY_LOG_PURGE, + OPT_RELAY_LOG_RECOVERY, OPT_SLAVE_NET_TIMEOUT, OPT_SLAVE_COMPRESSED_PROTOCOL, OPT_SLOW_LAUNCH_TIME, OPT_SLAVE_TRANS_RETRIES, OPT_READONLY, 
OPT_DEBUGGING, OPT_SORT_BUFFER, OPT_TABLE_OPEN_CACHE, OPT_TABLE_DEF_CACHE, @@ -5669,7 +5672,9 @@ enum options_mysqld OPT_GENERAL_LOG_FILE, OPT_SLOW_QUERY_LOG_FILE, OPT_IGNORE_BUILTIN_INNODB, - OPT_SYNC_RELAY_LOG + OPT_SYNC_RELAY_LOG, + OPT_SYNC_RELAY_LOG_INFO, + OPT_SYNC_MASTER_INFO }; @@ -6889,6 +6894,13 @@ The minimum value for this variable is 4096.", (uchar**) &relay_log_purge, (uchar**) &relay_log_purge, 0, GET_BOOL, NO_ARG, 1, 0, 1, 0, 1, 0}, + {"relay_log_recovery", OPT_RELAY_LOG_RECOVERY, + "Enables automatic relay log recovery right after the database startup, " + "which means that the IO Thread starts re-fetching from the master " + "right after the last transaction processed.", + (uchar**) &relay_log_recovery, + (uchar**) &relay_log_recovery, 0, GET_BOOL, NO_ARG, + 0, 0, 1, 0, 1, 0}, {"relay_log_space_limit", OPT_RELAY_LOG_SPACE_LIMIT, "Maximum space to use for all relay logs.", (uchar**) &relay_log_space_limit, @@ -6930,6 +6942,16 @@ The minimum value for this variable is 4096.", "Use 0 (default) to disable synchronous flushing.", (uchar**) &sync_relaylog_period, (uchar**) &sync_relaylog_period, 0, GET_UINT, REQUIRED_ARG, 0, 0, (longlong) UINT_MAX, 0, 1, 0}, + {"sync-relay-log-info", OPT_SYNC_RELAY_LOG_INFO, + "Synchronously flush relay log info to disk after #th transaction. " + "Use 0 (default) to disable synchronous flushing.", + (uchar**) &sync_relayloginfo_period, (uchar**) &sync_relayloginfo_period, 0, GET_UINT, + REQUIRED_ARG, 0, 0, (longlong) UINT_MAX, 0, 1, 0}, + {"sync-master-info", OPT_SYNC_MASTER_INFO, + "Synchronously flush master info to disk after every #th event. " + "Use 0 (default) to disable synchronous flushing.", + (uchar**) &sync_masterinfo_period, (uchar**) &sync_masterinfo_period, 0, GET_UINT, + REQUIRED_ARG, 0, 0, (longlong) UINT_MAX, 0, 1, 0}, {"sync-frm", OPT_SYNC_FRM, "Sync .frm to disk on create. 
Enabled by default.", (uchar**) &opt_sync_frm, (uchar**) &opt_sync_frm, 0, GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0}, diff --git a/sql/rpl_mi.cc b/sql/rpl_mi.cc index 1bca44ac613..cec2eabdd20 100644 --- a/sql/rpl_mi.cc +++ b/sql/rpl_mi.cc @@ -27,11 +27,11 @@ int init_intvar_from_file(int* var, IO_CACHE* f, int default_val); int init_strvar_from_file(char *var, int max_size, IO_CACHE *f, const char *default_val); -Master_info::Master_info() +Master_info::Master_info(bool is_slave_recovery) :Slave_reporting_capability("I/O"), ssl(0), ssl_verify_server_cert(0), fd(-1), io_thd(0), inited(0), - abort_slave(0),slave_running(0), - slave_run_id(0) + rli(is_slave_recovery), abort_slave(0), slave_running(0), + slave_run_id(0), sync_counter(0) { host[0] = 0; user[0] = 0; password[0] = 0; ssl_ca[0]= 0; ssl_capath[0]= 0; ssl_cert[0]= 0; @@ -364,11 +364,6 @@ int flush_master_info(Master_info* mi, bool flush_relay_log_cache) IO_CACHE *log_file= mi->rli.relay_log.get_log_file(); if (flush_io_cache(log_file)) DBUG_RETURN(2); - - /* Sync to disk if --sync-relay-log is set */ - if (sync_relaylog_period && - my_sync(log_file->file, MY_WME)) - DBUG_RETURN(2); } /* @@ -398,8 +393,12 @@ int flush_master_info(Master_info* mi, bool flush_relay_log_cache) (int)(mi->ssl), mi->ssl_ca, mi->ssl_capath, mi->ssl_cert, mi->ssl_cipher, mi->ssl_key, mi->ssl_verify_server_cert); err= flush_io_cache(file); - if (sync_relaylog_period && !err) + if (sync_masterinfo_period && !err && + ++(mi->sync_counter) >= sync_masterinfo_period) + { err= my_sync(mi->fd, MYF(MY_WME)); + mi->sync_counter= 0; + } DBUG_RETURN(-err); } diff --git a/sql/rpl_mi.h b/sql/rpl_mi.h index 93fb0a98198..c59dffefb7c 100644 --- a/sql/rpl_mi.h +++ b/sql/rpl_mi.h @@ -58,7 +58,7 @@ class Master_info : public Slave_reporting_capability { public: - Master_info(); + Master_info(bool is_slave_recovery); ~Master_info(); /* the variables below are needed because we can change masters on the fly */ @@ -100,6 +100,13 @@ class Master_info : public 
Slave_reporting_capability */ long clock_diff_with_master; + + /* + Keeps track of the number of events before fsyncing. + The option --sync-master-info determines how many + events should happen before fsyncing. + */ + uint sync_counter; }; void init_master_info_with_options(Master_info* mi); diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc index 37c0815fb8b..3a12164a1cf 100644 --- a/sql/rpl_rli.cc +++ b/sql/rpl_rli.cc @@ -28,11 +28,11 @@ int init_intvar_from_file(int* var, IO_CACHE* f, int default_val); int init_strvar_from_file(char *var, int max_size, IO_CACHE *f, const char *default_val); - -Relay_log_info::Relay_log_info() +Relay_log_info::Relay_log_info(bool is_slave_recovery) :Slave_reporting_capability("SQL"), no_storage(FALSE), replicate_same_server_id(::replicate_same_server_id), info_fd(-1), cur_log_fd(-1), relay_log(&sync_relaylog_period), + sync_counter(0), is_relay_log_recovery(is_slave_recovery), save_temporary_tables(0), #if HAVE_purify is_fake(FALSE), @@ -259,7 +259,8 @@ Failed to open the existing relay log info file '%s' (errno %d)", rli->group_relay_log_pos= rli->event_relay_log_pos= relay_log_pos; rli->group_master_log_pos= master_log_pos; - if (init_relay_log_pos(rli, + if (!rli->is_relay_log_recovery && + init_relay_log_pos(rli, rli->group_relay_log_name, rli->group_relay_log_pos, 0 /* no data lock*/, @@ -274,6 +275,7 @@ Failed to open the existing relay log info file '%s' (errno %d)", } #ifndef DBUG_OFF + if (!rli->is_relay_log_recovery) { char llbuf1[22], llbuf2[22]; DBUG_PRINT("info", ("my_b_tell(rli->cur_log)=%s rli->event_relay_log_pos=%s", diff --git a/sql/rpl_rli.h b/sql/rpl_rli.h index 171778d9675..a5410dd0c79 100644 --- a/sql/rpl_rli.h +++ b/sql/rpl_rli.h @@ -96,6 +96,19 @@ public: LOG_INFO linfo; IO_CACHE cache_buf,*cur_log; + /* + Keeps track of the number of transactions that commits + before fsyncing. The option --sync-relay-log-info determines + how many transactions should commit before fsyncing. 
+ */ + uint sync_counter; + + /* + Identifies when the recovery process is going on. + See sql/slave.cc:init_recovery for further details. + */ + bool is_relay_log_recovery; + /* The following variables are safe to read any time */ /* IO_CACHE of the info file - set only during init or end */ @@ -267,7 +280,7 @@ public: char slave_patternload_file[FN_REFLEN]; size_t slave_patternload_file_size; - Relay_log_info(); + Relay_log_info(bool is_slave_recovery); ~Relay_log_info(); /* diff --git a/sql/set_var.cc b/sql/set_var.cc index f2b5201cf8b..dcc3954ff1e 100644 --- a/sql/set_var.cc +++ b/sql/set_var.cc @@ -1534,19 +1534,19 @@ static bool get_unsigned(THD *thd, set_var *var, ulonglong user_max, } -bool sys_var_int_ptr::check(THD *thd, set_var *var) +bool sys_var_uint_ptr::check(THD *thd, set_var *var) { - var->save_result.ulong_value= (ulong) var->value->val_int(); + var->save_result.ulong_value= (ulong) var->value->val_uint(); return 0; } -bool sys_var_int_ptr::update(THD *thd, set_var *var) +bool sys_var_uint_ptr::update(THD *thd, set_var *var) { *value= (uint) var->save_result.ulong_value; return 0; } -void sys_var_int_ptr::set_default(THD *thd, enum_var_type type) +void sys_var_uint_ptr::set_default(THD *thd, enum_var_type type) { *value= (uint) option_limits->def_value; } diff --git a/sql/set_var.h b/sql/set_var.h index 02c87abed88..0202a15836d 100644 --- a/sql/set_var.h +++ b/sql/set_var.h @@ -178,10 +178,10 @@ public: /** Unsigned int system variable class */ -class sys_var_int_ptr :public sys_var +class sys_var_uint_ptr :public sys_var { public: - sys_var_int_ptr(sys_var_chain *chain, const char *name_arg, + sys_var_uint_ptr(sys_var_chain *chain, const char *name_arg, uint *value_ptr_arg, sys_after_update_func after_update_arg= NULL) :sys_var(name_arg, after_update_arg), diff --git a/sql/slave.cc b/sql/slave.cc index fac9ee214c5..5edb47df8b5 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -129,6 +129,7 @@ static bool wait_for_relay_log_space(Relay_log_info* 
rli); static inline bool io_slave_killed(THD* thd,Master_info* mi); static inline bool sql_slave_killed(THD* thd,Relay_log_info* rli); static int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type); +static int init_recovery(Master_info* mi); static void print_slave_skip_errors(void); static int safe_connect(THD* thd, MYSQL* mysql, Master_info* mi); static int safe_reconnect(THD* thd, MYSQL* mysql, Master_info* mi, @@ -220,6 +221,7 @@ void unlock_slave_threads(Master_info* mi) int init_slave() { DBUG_ENTER("init_slave"); + int error= 0; /* This is called when mysqld starts. Before client connections are @@ -231,7 +233,7 @@ int init_slave() TODO: re-write this to interate through the list of files for multi-master */ - active_mi= new Master_info; + active_mi= new Master_info(relay_log_recovery); /* If --slave-skip-errors=... was not used, the string value for the @@ -250,6 +252,7 @@ int init_slave() if (!active_mi) { sql_print_error("Failed to allocate memory for the master info structure"); + error= 1; goto err; } @@ -257,6 +260,13 @@ int init_slave() !master_host, (SLAVE_IO | SLAVE_SQL))) { sql_print_error("Failed to initialize the master info structure"); + error= 1; + goto err; + } + + if (active_mi->rli.is_relay_log_recovery && init_recovery(active_mi)) + { + error= 1; goto err; } @@ -275,18 +285,89 @@ int init_slave() SLAVE_IO | SLAVE_SQL)) { sql_print_error("Failed to create slave threads"); + error= 1; goto err; } } - pthread_mutex_unlock(&LOCK_active_mi); - DBUG_RETURN(0); err: + active_mi->rli.is_relay_log_recovery= FALSE; pthread_mutex_unlock(&LOCK_active_mi); - DBUG_RETURN(1); + DBUG_RETURN(error); } - +/* + Updates the master info based on the information stored in the + relay info and ignores relay logs previously retrieved by the IO + thread, which thus starts fetching again based on to the + group_master_log_pos and group_master_log_name. Eventually, the old + relay logs will be purged by the normal purge mechanism. 
+ + In the feature, we should improve this routine in order to avoid throwing + away logs that are safely stored in the disk. Note also that this recovery + routine relies on the correctness of the relay-log.info and only tolerates + coordinate problems in master.info. + + In this function, there is no need for a mutex as the caller + (i.e. init_slave) already has one acquired. + + Specifically, the following structures are updated: + + 1 - mi->master_log_pos <-- rli->group_master_log_pos + 2 - mi->master_log_name <-- rli->group_master_log_name + 3 - It moves the relay log to the new relay log file, by + rli->group_relay_log_pos <-- BIN_LOG_HEADER_SIZE; + rli->event_relay_log_pos <-- BIN_LOG_HEADER_SIZE; + rli->group_relay_log_name <-- rli->relay_log.get_log_fname(); + rli->event_relay_log_name <-- rli->relay_log.get_log_fname(); + + If there is an error, it returns (1), otherwise returns (0). + */ +static int init_recovery(Master_info* mi) +{ + const char *errmsg= 0; + DBUG_ENTER("init_recovery"); + + Relay_log_info *rli= &mi->rli; + if (rli->group_master_log_name[0]) + { + mi->master_log_pos= max(BIN_LOG_HEADER_SIZE, + rli->group_master_log_pos); + strmake(mi->master_log_name, rli->group_master_log_name, + sizeof(mi->master_log_name)-1); + + sql_print_warning("Recovery from master pos %ld and file %s.", + (ulong) mi->master_log_pos, mi->master_log_name); + + strmake(rli->group_relay_log_name, rli->relay_log.get_log_fname(), + sizeof(rli->group_relay_log_name)-1); + strmake(rli->event_relay_log_name, rli->relay_log.get_log_fname(), + sizeof(mi->rli.event_relay_log_name)-1); + + rli->group_relay_log_pos= rli->event_relay_log_pos= BIN_LOG_HEADER_SIZE; + + if (init_relay_log_pos(rli, + rli->group_relay_log_name, + rli->group_relay_log_pos, + 0 /*no data lock*/, + &errmsg, 0)) + DBUG_RETURN(1); + + if (flush_master_info(mi, 0)) + { + sql_print_error("Failed to flush master info file"); + DBUG_RETURN(1); + } + if (flush_relay_log_info(rli)) + { + 
sql_print_error("Failed to flush relay info file"); + DBUG_RETURN(1); + } + } + + DBUG_RETURN(0); +} + /** Convert slave skip errors bitmap into a printable string. */ @@ -3959,7 +4040,14 @@ bool flush_relay_log_info(Relay_log_info* rli) error=1; if (flush_io_cache(file)) error=1; - + if (sync_relayloginfo_period && + !error && + ++(rli->sync_counter) >= sync_relayloginfo_period) + { + if (my_sync(rli->info_fd, MYF(MY_WME))) + error=1; + rli->sync_counter= 0; + } /* Flushing the relay log is done by the slave I/O thread */ DBUG_RETURN(error); } @@ -4366,6 +4454,8 @@ void rotate_relay_log(Master_info* mi) DBUG_ENTER("rotate_relay_log"); Relay_log_info* rli= &mi->rli; + DBUG_EXECUTE_IF("crash_before_rotate_relaylog", abort();); + /* We don't lock rli->run_lock. This would lead to deadlocks. */ pthread_mutex_lock(&mi->run_lock); diff --git a/sql/sql_binlog.cc b/sql/sql_binlog.cc index 96e99b57e3c..531242f64d1 100644 --- a/sql/sql_binlog.cc +++ b/sql/sql_binlog.cc @@ -58,7 +58,7 @@ void mysql_client_binlog_statement(THD* thd) my_bool have_fd_event= TRUE; if (!thd->rli_fake) { - thd->rli_fake= new Relay_log_info; + thd->rli_fake= new Relay_log_info(FALSE); #ifdef HAVE_purify thd->rli_fake->is_fake= TRUE; #endif diff --git a/sql/sql_repl.cc b/sql/sql_repl.cc index 425d76c8b72..6295dbb0e79 100644 --- a/sql/sql_repl.cc +++ b/sql/sql_repl.cc @@ -1769,6 +1769,16 @@ static sys_var_const sys_relay_log_info_file(&vars, "relay_log_info_file", (uchar*) &relay_log_info_file); static sys_var_bool_ptr sys_relay_log_purge(&vars, "relay_log_purge", &relay_log_purge); +static sys_var_bool_ptr sys_relay_log_recovery(&vars, "relay_log_recovery", + &relay_log_recovery); +static sys_var_uint_ptr sys_sync_binlog_period(&vars, "sync_binlog", + &sync_binlog_period); +static sys_var_uint_ptr sys_sync_relaylog_period(&vars, "sync_relay_log", + &sync_relaylog_period); +static sys_var_uint_ptr sys_sync_relayloginfo_period(&vars, "sync_relay_log_info", + &sync_relayloginfo_period); +static 
sys_var_uint_ptr sys_sync_masterinfo_period(&vars, "sync_master_info", + &sync_masterinfo_period); static sys_var_const sys_relay_log_space_limit(&vars, "relay_log_space_limit", OPT_GLOBAL, SHOW_LONGLONG, @@ -1784,8 +1794,6 @@ static sys_var_const sys_slave_skip_errors(&vars, "slave_skip_errors", (uchar*) slave_skip_error_names); static sys_var_long_ptr sys_slave_trans_retries(&vars, "slave_transaction_retries", &slave_trans_retries); -static sys_var_int_ptr sys_sync_binlog_period(&vars, "sync_binlog", &sync_binlog_period); -static sys_var_int_ptr sys_sync_relaylog_period(&vars, "sync_relay_log", &sync_relaylog_period); static sys_var_slave_skip_counter sys_slave_skip_counter(&vars, "sql_slave_skip_counter"); |