summaryrefslogtreecommitdiff
path: root/storage/innobase/srv/srv0srv.c
diff options
context:
space:
mode:
Diffstat (limited to 'storage/innobase/srv/srv0srv.c')
-rw-r--r--storage/innobase/srv/srv0srv.c2598
1 files changed, 2598 insertions, 0 deletions
diff --git a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c
new file mode 100644
index 00000000000..41b0594d705
--- /dev/null
+++ b/storage/innobase/srv/srv0srv.c
@@ -0,0 +1,2598 @@
+/******************************************************
+The database server main program
+
+NOTE: SQL Server 7 uses something which the documentation
+calls user mode scheduled threads (UMS threads). One such
+thread is usually allocated per processor. Win32
+documentation does not know any UMS threads, which suggests
+that the concept is internal to SQL Server 7. It may mean that
+SQL Server 7 does all the scheduling of threads itself, even
+in i/o waits. We should maybe modify InnoDB to use the same
+technique, because thread switches within NT may be too slow.
+
+SQL Server 7 also mentions fibers, which are cooperatively
+scheduled threads. They can boost performance by 5 %,
+according to the Delaney and Soukup's book.
+
+Windows 2000 will have something called thread pooling
+(see msdn website), which we could possibly use.
+
+Another possibility could be to use some very fast user space
+thread library. This might confuse NT though.
+
+(c) 1995 Innobase Oy
+
+Created 10/8/1995 Heikki Tuuri
+*******************************************************/
+/* Dummy comment */
+#include "srv0srv.h"
+
+#include "ut0mem.h"
+#include "os0proc.h"
+#include "mem0mem.h"
+#include "mem0pool.h"
+#include "sync0sync.h"
+#include "thr0loc.h"
+#include "que0que.h"
+#include "srv0que.h"
+#include "log0recv.h"
+#include "pars0pars.h"
+#include "usr0sess.h"
+#include "lock0lock.h"
+#include "trx0purge.h"
+#include "ibuf0ibuf.h"
+#include "buf0flu.h"
+#include "btr0sea.h"
+#include "dict0load.h"
+#include "dict0boot.h"
+#include "srv0start.h"
+#include "row0mysql.h"
+
+/* This is set to TRUE if the MySQL user has set it in MySQL; currently
+affects only FOREIGN KEY definition parsing */
+ibool srv_lower_case_table_names = FALSE;
+
+/* The following counter is incremented whenever there is some user activity
+in the server */
+ulint srv_activity_count = 0;
+
+/* The following is the maximum allowed duration of a lock wait. */
+ulint srv_fatal_semaphore_wait_threshold = 600;
+
+/* How much data manipulation language (DML) statements need to be delayed,
+in microseconds, in order to reduce the lagging of the purge thread. */
+ulint srv_dml_needed_delay = 0;
+
+ibool srv_lock_timeout_and_monitor_active = FALSE;
+ibool srv_error_monitor_active = FALSE;
+
+const char* srv_main_thread_op_info = "";
+
+/* Server parameters which are read from the initfile */
+
+/* The following three are dir paths which are catenated before file
+names, where the file name itself may also contain a path */
+
+char* srv_data_home = NULL;
+#ifdef UNIV_LOG_ARCHIVE
+char* srv_arch_dir = NULL;
+#endif /* UNIV_LOG_ARCHIVE */
+
+ibool srv_file_per_table = FALSE; /* store to its own file each table
+ created by an user; data dictionary
+ tables are in the system tablespace
+ 0 */
+ibool srv_locks_unsafe_for_binlog = FALSE; /* Place locks to records only
+ i.e. do not use next-key locking
+ except on duplicate key checking and
+ foreign key checking */
+ulint srv_n_data_files = 0;
+char** srv_data_file_names = NULL;
+ulint* srv_data_file_sizes = NULL; /* size in database pages */
+
+ibool srv_auto_extend_last_data_file = FALSE; /* if TRUE, then we
+ auto-extend the last data
+ file */
+ulint srv_last_file_size_max = 0; /* if != 0, this tells
+ the max size auto-extending
+ may increase the last data
+ file size */
+ulong srv_auto_extend_increment = 8; /* If the last data file is
+ auto-extended, we add this
+ many pages to it at a time */
+ulint* srv_data_file_is_raw_partition = NULL;
+
+/* If the following is TRUE we do not allow inserts etc. This protects
+the user from forgetting the 'newraw' keyword to my.cnf */
+
+ibool srv_created_new_raw = FALSE;
+
+char** srv_log_group_home_dirs = NULL;
+
+ulint srv_n_log_groups = ULINT_MAX;
+ulint srv_n_log_files = ULINT_MAX;
+ulint srv_log_file_size = ULINT_MAX; /* size in database pages */
+ulint srv_log_buffer_size = ULINT_MAX; /* size in database pages */
+ulint srv_flush_log_at_trx_commit = 1;
+
+byte srv_latin1_ordering[256] /* The sort order table of the latin1
+ character set. The following table is
+ the MySQL order as of Feb 10th, 2002 */
+= {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
+, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F
+, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27
+, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F
+, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37
+, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F
+, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47
+, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F
+, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57
+, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F
+, 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47
+, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F
+, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57
+, 0x58, 0x59, 0x5A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F
+, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F
+, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97
+, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F
+, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7
+, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF
+, 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7
+, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF
+, 0x41, 0x41, 0x41, 0x41, 0x5C, 0x5B, 0x5C, 0x43
+, 0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49
+, 0x44, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x5D, 0xD7
+, 0xD8, 0x55, 0x55, 0x55, 0x59, 0x59, 0xDE, 0xDF
+, 0x41, 0x41, 0x41, 0x41, 0x5C, 0x5B, 0x5C, 0x43
+, 0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49
+, 0x44, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x5D, 0xF7
+, 0xD8, 0x55, 0x55, 0x55, 0x59, 0x59, 0xDE, 0xFF
+};
+
+ulint srv_pool_size = ULINT_MAX; /* size in pages; MySQL inits
+ this to size in kilobytes but
+ we normalize this to pages in
+ srv_boot() */
+ulint srv_awe_window_size = 0; /* size in pages; MySQL inits
+ this to bytes, but we
+ normalize it to pages in
+ srv_boot() */
+ulint srv_mem_pool_size = ULINT_MAX; /* size in bytes */
+ulint srv_lock_table_size = ULINT_MAX;
+
+ulint srv_n_file_io_threads = ULINT_MAX;
+
+#ifdef UNIV_LOG_ARCHIVE
+ibool srv_log_archive_on = FALSE;
+ibool srv_archive_recovery = 0;
+dulint srv_archive_recovery_limit_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+ulint srv_lock_wait_timeout = 1024 * 1024 * 1024;
+
+char* srv_file_flush_method_str = NULL;
+ulint srv_unix_file_flush_method = SRV_UNIX_FDATASYNC;
+ulint srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+
+ulint srv_max_n_open_files = 300;
+
+/* The InnoDB main thread tries to keep the ratio of modified pages
+in the buffer pool to all database pages in the buffer pool smaller than
+the following number. But it is not guaranteed that the value stays below
+that during a time of heavy update/insert activity. */
+
+ulong srv_max_buf_pool_modified_pct = 90;
+
+/* variable counts amount of data read in total (in bytes) */
+ulint srv_data_read = 0;
+
+/* here we count the amount of data written in total (in bytes) */
+ulint srv_data_written = 0;
+
+/* the number of the log write requests done */
+ulint srv_log_write_requests = 0;
+
+/* the number of physical writes to the log performed */
+ulint srv_log_writes = 0;
+
+/* amount of data written to the log files in bytes */
+ulint srv_os_log_written = 0;
+
+/* amount of writes being done to the log files */
+ulint srv_os_log_pending_writes = 0;
+
+/* we increase this counter, when there we don't have enough space in the
+log buffer and have to flush it */
+ulint srv_log_waits = 0;
+
+/* this variable counts the amount of times, when the doublewrite buffer
+was flushed */
+ulint srv_dblwr_writes = 0;
+
+/* here we store the number of pages that have been flushed to the
+doublewrite buffer */
+ulint srv_dblwr_pages_written = 0;
+
+/* in this variable we store the number of write requests issued */
+ulint srv_buf_pool_write_requests = 0;
+
+/* here we store the number of times when we had to wait for a free page
+in the buffer pool. It happens when the buffer pool is full and we need
+to make a flush, in order to be able to read or create a page. */
+ulint srv_buf_pool_wait_free = 0;
+
+/* variable to count the number of pages that were written from buffer
+pool to the disk */
+ulint srv_buf_pool_flushed = 0;
+
+/* variable to count the number of buffer pool reads that led to the
+reading of a disk page */
+ulint srv_buf_pool_reads = 0;
+
+/* variable to count the number of sequential read-aheads */
+ulint srv_read_ahead_seq = 0;
+
+/* variable to count the number of random read-aheads */
+ulint srv_read_ahead_rnd = 0;
+
+/* structure to pass status variables to MySQL */
+export_struc export_vars;
+
+/* If the following is != 0 we do not allow inserts etc. This protects
+the user from forgetting the innodb_force_recovery keyword to my.cnf */
+
+ulint srv_force_recovery = 0;
+/*-----------------------*/
+/* We are prepared for a situation that we have this many threads waiting for
+a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the
+value. */
+
+ulint srv_max_n_threads = 0;
+
+/* The following controls how many threads we let inside InnoDB concurrently:
+threads waiting for locks are not counted into the number because otherwise
+we could get a deadlock. MySQL creates a thread for each user session, and
+semaphore contention and convoy problems can occur withput this restriction.
+Value 10 should be good if there are less than 4 processors + 4 disks in the
+computer. Bigger computers need bigger values. */
+
+ulong srv_thread_concurrency = 8;
+
+os_fast_mutex_t srv_conc_mutex; /* this mutex protects srv_conc data
+ structures */
+lint srv_conc_n_threads = 0; /* number of OS threads currently
+ inside InnoDB; it is not an error
+ if this drops temporarily below zero
+ because we do not demand that every
+ thread increments this, but a thread
+ waiting for a lock decrements this
+ temporarily */
+ulint srv_conc_n_waiting_threads = 0; /* number of OS threads waiting in the
+ FIFO for a permission to enter InnoDB
+ */
+
+typedef struct srv_conc_slot_struct srv_conc_slot_t;
+struct srv_conc_slot_struct{
+ os_event_t event; /* event to wait */
+ ibool reserved; /* TRUE if slot
+ reserved */
+ ibool wait_ended; /* TRUE when another
+ thread has already set
+ the event and the
+ thread in this slot is
+ free to proceed; but
+ reserved may still be
+ TRUE at that point */
+ UT_LIST_NODE_T(srv_conc_slot_t) srv_conc_queue; /* queue node */
+};
+
+UT_LIST_BASE_NODE_T(srv_conc_slot_t) srv_conc_queue; /* queue of threads
+ waiting to get in */
+srv_conc_slot_t* srv_conc_slots; /* array of wait
+ slots */
+
+/* Number of times a thread is allowed to enter InnoDB within the same
+SQL query after it has once got the ticket at srv_conc_enter_innodb */
+#define SRV_FREE_TICKETS_TO_ENTER srv_n_free_tickets_to_enter
+#define SRV_THREAD_SLEEP_DELAY srv_thread_sleep_delay
+/*-----------------------*/
+/* If the following is set TRUE then we do not run purge and insert buffer
+merge to completion before shutdown */
+
+ibool srv_fast_shutdown = FALSE;
+
+ibool srv_very_fast_shutdown = FALSE; /* if this TRUE, do not flush the
+ buffer pool to data files at the
+ shutdown; we effectively 'crash'
+ InnoDB */
+/* Generate a innodb_status.<pid> file */
+ibool srv_innodb_status = FALSE;
+
+ibool srv_use_doublewrite_buf = TRUE;
+ibool srv_use_checksums = TRUE;
+
+ibool srv_set_thread_priorities = TRUE;
+int srv_query_thread_priority = 0;
+
+/* TRUE if the Address Windowing Extensions of Windows are used; then we must
+disable adaptive hash indexes */
+ibool srv_use_awe = FALSE;
+ibool srv_use_adaptive_hash_indexes = TRUE;
+
+/*-------------------------------------------*/
+ulong srv_n_spin_wait_rounds = 20;
+ulong srv_n_free_tickets_to_enter = 500;
+ulong srv_thread_sleep_delay = 10000;
+ulint srv_spin_wait_delay = 5;
+ibool srv_priority_boost = TRUE;
+
+ibool srv_print_thread_releases = FALSE;
+ibool srv_print_lock_waits = FALSE;
+ibool srv_print_buf_io = FALSE;
+ibool srv_print_log_io = FALSE;
+ibool srv_print_latch_waits = FALSE;
+
+ulint srv_n_rows_inserted = 0;
+ulint srv_n_rows_updated = 0;
+ulint srv_n_rows_deleted = 0;
+ulint srv_n_rows_read = 0;
+static ulint srv_n_rows_inserted_old = 0;
+static ulint srv_n_rows_updated_old = 0;
+static ulint srv_n_rows_deleted_old = 0;
+static ulint srv_n_rows_read_old = 0;
+
+ulint srv_n_lock_wait_count = 0;
+ulint srv_n_lock_wait_current_count = 0;
+ib_longlong srv_n_lock_wait_time = 0;
+ulint srv_n_lock_max_wait_time = 0;
+
+
+/*
+ Set the following to 0 if you want InnoDB to write messages on
+ stderr on startup/shutdown
+*/
+ibool srv_print_verbose_log = TRUE;
+ibool srv_print_innodb_monitor = FALSE;
+ibool srv_print_innodb_lock_monitor = FALSE;
+ibool srv_print_innodb_tablespace_monitor = FALSE;
+ibool srv_print_innodb_table_monitor = FALSE;
+
+/* The parameters below are obsolete: */
+
+ibool srv_print_parsed_sql = FALSE;
+
+ulint srv_sim_disk_wait_pct = ULINT_MAX;
+ulint srv_sim_disk_wait_len = ULINT_MAX;
+ibool srv_sim_disk_wait_by_yield = FALSE;
+ibool srv_sim_disk_wait_by_wait = FALSE;
+
+ibool srv_measure_contention = FALSE;
+ibool srv_measure_by_spin = FALSE;
+
+ibool srv_test_extra_mutexes = FALSE;
+ibool srv_test_nocache = FALSE;
+ibool srv_test_cache_evict = FALSE;
+
+ibool srv_test_sync = FALSE;
+ulint srv_test_n_threads = ULINT_MAX;
+ulint srv_test_n_loops = ULINT_MAX;
+ulint srv_test_n_free_rnds = ULINT_MAX;
+ulint srv_test_n_reserved_rnds = ULINT_MAX;
+ulint srv_test_array_size = ULINT_MAX;
+ulint srv_test_n_mutexes = ULINT_MAX;
+
+/* Array of English strings describing the current state of an
+i/o handler thread */
+
+const char* srv_io_thread_op_info[SRV_MAX_N_IO_THREADS];
+const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS];
+
+time_t srv_last_monitor_time;
+
+mutex_t srv_innodb_monitor_mutex;
+
+/* Mutex for locking srv_monitor_file */
+mutex_t srv_monitor_file_mutex;
+/* Temporary file for innodb monitor output */
+FILE* srv_monitor_file;
+
+ulint srv_main_thread_process_no = 0;
+ulint srv_main_thread_id = 0;
+
+/*
+ IMPLEMENTATION OF THE SERVER MAIN PROGRAM
+ =========================================
+
+There is the following analogue between this database
+server and an operating system kernel:
+
+DB concept equivalent OS concept
+---------- ---------------------
+transaction -- process;
+
+query thread -- thread;
+
+lock -- semaphore;
+
+transaction set to
+the rollback state -- kill signal delivered to a process;
+
+kernel -- kernel;
+
+query thread execution:
+(a) without kernel mutex
+reserved -- process executing in user mode;
+(b) with kernel mutex reserved
+ -- process executing in kernel mode;
+
+The server is controlled by a master thread which runs at
+a priority higher than normal, that is, higher than user threads.
+It sleeps most of the time, and wakes up, say, every 300 milliseconds,
+to check whether there is anything happening in the server which
+requires intervention of the master thread. Such situations may be,
+for example, when flushing of dirty blocks is needed in the buffer
+pool or old version of database rows have to be cleaned away.
+
+The threads which we call user threads serve the queries of
+the clients and input from the console of the server.
+They run at normal priority. The server may have several
+communications endpoints. A dedicated set of user threads waits
+at each of these endpoints ready to receive a client request.
+Each request is taken by a single user thread, which then starts
+processing and, when the result is ready, sends it to the client
+and returns to wait at the same endpoint the thread started from.
+
+So, we do not have dedicated communication threads listening at
+the endpoints and dealing the jobs to dedicated worker threads.
+Our architecture saves one thread swithch per request, compared
+to the solution with dedicated communication threads
+which amounts to 15 microseconds on 100 MHz Pentium
+running NT. If the client
+is communicating over a network, this saving is negligible, but
+if the client resides in the same machine, maybe in an SMP machine
+on a different processor from the server thread, the saving
+can be important as the threads can communicate over shared
+memory with an overhead of a few microseconds.
+
+We may later implement a dedicated communication thread solution
+for those endpoints which communicate over a network.
+
+Our solution with user threads has two problems: for each endpoint
+there has to be a number of listening threads. If there are many
+communication endpoints, it may be difficult to set the right number
+of concurrent threads in the system, as many of the threads
+may always be waiting at less busy endpoints. Another problem
+is queuing of the messages, as the server internally does not
+offer any queue for jobs.
+
+Another group of user threads is intended for splitting the
+queries and processing them in parallel. Let us call these
+parallel communication threads. These threads are waiting for
+parallelized tasks, suspended on event semaphores.
+
+A single user thread waits for input from the console,
+like a command to shut the database.
+
+Utility threads are a different group of threads which takes
+care of the buffer pool flushing and other, mainly background
+operations, in the server.
+Some of these utility threads always run at a lower than normal
+priority, so that they are always in background. Some of them
+may dynamically boost their priority by the pri_adjust function,
+even to higher than normal priority, if their task becomes urgent.
+The running of utilities is controlled by high- and low-water marks
+of urgency. The urgency may be measured by the number of dirty blocks
+in the buffer pool, in the case of the flush thread, for example.
+When the high-water mark is exceeded, an utility starts running, until
+the urgency drops under the low-water mark. Then the utility thread
+suspend itself to wait for an event. The master thread is
+responsible of signaling this event when the utility thread is
+again needed.
+
+For each individual type of utility, some threads always remain
+at lower than normal priority. This is because pri_adjust is implemented
+so that the threads at normal or higher priority control their
+share of running time by calling sleep. Thus, if the load of the
+system sudenly drops, these threads cannot necessarily utilize
+the system fully. The background priority threads make up for this,
+starting to run when the load drops.
+
+When there is no activity in the system, also the master thread
+suspends itself to wait for an event making
+the server totally silent. The responsibility to signal this
+event is on the user thread which again receives a message
+from a client.
+
+There is still one complication in our server design. If a
+background utility thread obtains a resource (e.g., mutex) needed by a user
+thread, and there is also some other user activity in the system,
+the user thread may have to wait indefinitely long for the
+resource, as the OS does not schedule a background thread if
+there is some other runnable user thread. This problem is called
+priority inversion in real-time programming.
+
+One solution to the priority inversion problem would be to
+keep record of which thread owns which resource and
+in the above case boost the priority of the background thread
+so that it will be scheduled and it can release the resource.
+This solution is called priority inheritance in real-time programming.
+A drawback of this solution is that the overhead of acquiring a mutex
+increases slightly, maybe 0.2 microseconds on a 100 MHz Pentium, because
+the thread has to call os_thread_get_curr_id.
+This may be compared to 0.5 microsecond overhead for a mutex lock-unlock
+pair. Note that the thread
+cannot store the information in the resource, say mutex, itself,
+because competing threads could wipe out the information if it is
+stored before acquiring the mutex, and if it stored afterwards,
+the information is outdated for the time of one machine instruction,
+at least. (To be precise, the information could be stored to
+lock_word in mutex if the machine supports atomic swap.)
+
+The above solution with priority inheritance may become actual in the
+future, but at the moment we plan to implement a more coarse solution,
+which could be called a global priority inheritance. If a thread
+has to wait for a long time, say 300 milliseconds, for a resource,
+we just guess that it may be waiting for a resource owned by a background
+thread, and boost the the priority of all runnable background threads
+to the normal level. The background threads then themselves adjust
+their fixed priority back to background after releasing all resources
+they had (or, at some fixed points in their program code).
+
+What is the performance of the global priority inheritance solution?
+We may weigh the length of the wait time 300 milliseconds, during
+which the system processes some other thread
+to the cost of boosting the priority of each runnable background
+thread, rescheduling it, and lowering the priority again.
+On 100 MHz Pentium + NT this overhead may be of the order 100
+microseconds per thread. So, if the number of runnable background
+threads is not very big, say < 100, the cost is tolerable.
+Utility threads probably will access resources used by
+user threads not very often, so collisions of user threads
+to preempted utility threads should not happen very often.
+
+The thread table contains
+information of the current status of each thread existing in the system,
+and also the event semaphores used in suspending the master thread
+and utility and parallel communication threads when they have nothing to do.
+The thread table can be seen as an analogue to the process table
+in a traditional Unix implementation.
+
+The thread table is also used in the global priority inheritance
+scheme. This brings in one additional complication: threads accessing
+the thread table must have at least normal fixed priority,
+because the priority inheritance solution does not work if a background
+thread is preempted while possessing the mutex protecting the thread table.
+So, if a thread accesses the thread table, its priority has to be
+boosted at least to normal. This priority requirement can be seen similar to
+the privileged mode used when processing the kernel calls in traditional
+Unix.*/
+
+/* Thread slot in the thread table */
+struct srv_slot_struct{
+ os_thread_id_t id; /* thread id */
+ os_thread_t handle; /* thread handle */
+ ulint type; /* thread type: user, utility etc. */
+ ibool in_use; /* TRUE if this slot is in use */
+ ibool suspended; /* TRUE if the thread is waiting
+ for the event of this slot */
+ ib_time_t suspend_time; /* time when the thread was
+ suspended */
+ os_event_t event; /* event used in suspending the
+ thread when it has nothing to do */
+ que_thr_t* thr; /* suspended query thread (only
+ used for MySQL threads) */
+};
+
+/* Table for MySQL threads where they will be suspended to wait for locks */
+srv_slot_t* srv_mysql_table = NULL;
+
+os_event_t srv_lock_timeout_thread_event;
+
+srv_sys_t* srv_sys = NULL;
+
+byte srv_pad1[64]; /* padding to prevent other memory update
+ hotspots from residing on the same memory
+ cache line */
+mutex_t* kernel_mutex_temp;/* mutex protecting the server, trx structs,
+ query threads, and lock table */
+byte srv_pad2[64]; /* padding to prevent other memory update
+ hotspots from residing on the same memory
+ cache line */
+
+/* The following three values measure the urgency of the jobs of
+buffer, version, and insert threads. They may vary from 0 - 1000.
+The server mutex protects all these variables. The low-water values
+tell that the server can acquiesce the utility when the value
+drops below this low-water mark. */
+
+ulint srv_meter[SRV_MASTER + 1];
+ulint srv_meter_low_water[SRV_MASTER + 1];
+ulint srv_meter_high_water[SRV_MASTER + 1];
+ulint srv_meter_high_water2[SRV_MASTER + 1];
+ulint srv_meter_foreground[SRV_MASTER + 1];
+
+/* The following values give info about the activity going on in
+the database. They are protected by the server mutex. The arrays
+are indexed by the type of the thread. */
+
+ulint srv_n_threads_active[SRV_MASTER + 1];
+ulint srv_n_threads[SRV_MASTER + 1];
+
+/*************************************************************************
+Sets the info describing an i/o thread current state. */
+
+void
+srv_set_io_thread_op_info(
+/*======================*/
+ ulint i, /* in: the 'segment' of the i/o thread */
+ const char* str) /* in: constant char string describing the
+ state */
+{
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+
+ srv_io_thread_op_info[i] = str;
+}
+
+/*************************************************************************
+Accessor function to get pointer to n'th slot in the server thread
+table. */
+static
+srv_slot_t*
+srv_table_get_nth_slot(
+/*===================*/
+ /* out: pointer to the slot */
+ ulint index) /* in: index of the slot */
+{
+ ut_a(index < OS_THREAD_MAX_N);
+
+ return(srv_sys->threads + index);
+}
+
+/*************************************************************************
+Gets the number of threads in the system. */
+
+ulint
+srv_get_n_threads(void)
+/*===================*/
+{
+ ulint i;
+ ulint n_threads = 0;
+
+ mutex_enter(&kernel_mutex);
+
+ for (i = SRV_COM; i < SRV_MASTER + 1; i++) {
+
+ n_threads += srv_n_threads[i];
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ return(n_threads);
+}
+
+/*************************************************************************
+Reserves a slot in the thread table for the current thread. Also creates the
+thread local storage struct for the current thread. NOTE! The server mutex
+has to be reserved by the caller! */
+static
+ulint
+srv_table_reserve_slot(
+/*===================*/
+ /* out: reserved slot index */
+ ulint type) /* in: type of the thread: one of SRV_COM, ... */
+{
+ srv_slot_t* slot;
+ ulint i;
+
+ ut_a(type > 0);
+ ut_a(type <= SRV_MASTER);
+
+ i = 0;
+ slot = srv_table_get_nth_slot(i);
+
+ while (slot->in_use) {
+ i++;
+ slot = srv_table_get_nth_slot(i);
+ }
+
+ ut_a(slot->in_use == FALSE);
+
+ slot->in_use = TRUE;
+ slot->suspended = FALSE;
+ slot->id = os_thread_get_curr_id();
+ slot->handle = os_thread_get_curr();
+ slot->type = type;
+
+ thr_local_create();
+
+ thr_local_set_slot_no(os_thread_get_curr_id(), i);
+
+ return(i);
+}
+
+/*************************************************************************
+Suspends the calling thread to wait for the event in its thread slot.
+NOTE! The server mutex has to be reserved by the caller! */
+static
+os_event_t
+srv_suspend_thread(void)
+/*====================*/
+ /* out: event for the calling thread to wait */
+{
+ srv_slot_t* slot;
+ os_event_t event;
+ ulint slot_no;
+ ulint type;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+ slot_no = thr_local_get_slot_no(os_thread_get_curr_id());
+
+ if (srv_print_thread_releases) {
+ fprintf(stderr,
+ "Suspending thread %lu to slot %lu meter %lu\n",
+ (ulong) os_thread_get_curr_id(), (ulong) slot_no,
+ (ulong) srv_meter[SRV_RECOVERY]);
+ }
+
+ slot = srv_table_get_nth_slot(slot_no);
+
+ type = slot->type;
+
+ ut_ad(type >= SRV_WORKER);
+ ut_ad(type <= SRV_MASTER);
+
+ event = slot->event;
+
+ slot->suspended = TRUE;
+
+ ut_ad(srv_n_threads_active[type] > 0);
+
+ srv_n_threads_active[type]--;
+
+ os_event_reset(event);
+
+ return(event);
+}
+
+/*************************************************************************
+Releases threads of the type given from suspension in the thread table.
+NOTE! The server mutex has to be reserved by the caller! */
+
+ulint
+srv_release_threads(
+/*================*/
+ /* out: number of threads released: this may be
+ < n if not enough threads were suspended at the
+ moment */
+ ulint type, /* in: thread type */
+ ulint n) /* in: number of threads to release */
+{
+ srv_slot_t* slot;
+ ulint i;
+ ulint count = 0;
+
+ ut_ad(type >= SRV_WORKER);
+ ut_ad(type <= SRV_MASTER);
+ ut_ad(n > 0);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+
+ slot = srv_table_get_nth_slot(i);
+
+ if (slot->in_use && slot->type == type && slot->suspended) {
+
+ slot->suspended = FALSE;
+
+ srv_n_threads_active[type]++;
+
+ os_event_set(slot->event);
+
+ if (srv_print_thread_releases) {
+ fprintf(stderr,
+ "Releasing thread %lu type %lu from slot %lu meter %lu\n",
+ (ulong) slot->id, (ulong) type, (ulong) i,
+ (ulong) srv_meter[SRV_RECOVERY]);
+ }
+
+ count++;
+
+ if (count == n) {
+ break;
+ }
+ }
+ }
+
+ return(count);
+}
+
+/*************************************************************************
+Returns the calling thread type. */
+
+ulint
+srv_get_thread_type(void)
+/*=====================*/
+ /* out: SRV_COM, ... */
+{
+ ulint slot_no;
+ srv_slot_t* slot;
+ ulint type;
+
+ mutex_enter(&kernel_mutex);
+
+ slot_no = thr_local_get_slot_no(os_thread_get_curr_id());
+
+ slot = srv_table_get_nth_slot(slot_no);
+
+ type = slot->type;
+
+ ut_ad(type >= SRV_WORKER);
+ ut_ad(type <= SRV_MASTER);
+
+ mutex_exit(&kernel_mutex);
+
+ return(type);
+}
+
+/*************************************************************************
+Initializes the server. */
+
+void
+srv_init(void)
+/*==========*/
+{
+ srv_conc_slot_t* conc_slot;
+ srv_slot_t* slot;
+ dict_table_t* table;
+ ulint i;
+
+ srv_sys = mem_alloc(sizeof(srv_sys_t));
+
+ kernel_mutex_temp = mem_alloc(sizeof(mutex_t));
+ mutex_create(&kernel_mutex);
+ mutex_set_level(&kernel_mutex, SYNC_KERNEL);
+
+ mutex_create(&srv_innodb_monitor_mutex);
+ mutex_set_level(&srv_innodb_monitor_mutex, SYNC_NO_ORDER_CHECK);
+
+ srv_sys->threads = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_slot_t));
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+ slot = srv_table_get_nth_slot(i);
+ slot->in_use = FALSE;
+ slot->type=0; /* Avoid purify errors */
+ slot->event = os_event_create(NULL);
+ ut_a(slot->event);
+ }
+
+ srv_mysql_table = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_slot_t));
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+ slot = srv_mysql_table + i;
+ slot->in_use = FALSE;
+ slot->type = 0;
+ slot->event = os_event_create(NULL);
+ ut_a(slot->event);
+ }
+
+ srv_lock_timeout_thread_event = os_event_create(NULL);
+
+ for (i = 0; i < SRV_MASTER + 1; i++) {
+ srv_n_threads_active[i] = 0;
+ srv_n_threads[i] = 0;
+ srv_meter[i] = 30;
+ srv_meter_low_water[i] = 50;
+ srv_meter_high_water[i] = 100;
+ srv_meter_high_water2[i] = 200;
+ srv_meter_foreground[i] = 250;
+ }
+
+ srv_sys->operational = os_event_create(NULL);
+
+ ut_a(srv_sys->operational);
+
+ UT_LIST_INIT(srv_sys->tasks);
+
+ /* create dummy table and index for old-style infimum and supremum */
+ table = dict_mem_table_create("SYS_DUMMY1",
+ DICT_HDR_SPACE, 1, FALSE);
+ dict_mem_table_add_col(table, "DUMMY", DATA_CHAR,
+ DATA_ENGLISH | DATA_NOT_NULL, 8, 0);
+
+ srv_sys->dummy_ind1 = dict_mem_index_create("SYS_DUMMY1",
+ "SYS_DUMMY1", DICT_HDR_SPACE, 0, 1);
+ dict_index_add_col(srv_sys->dummy_ind1,
+ dict_table_get_nth_col(table, 0), 0, 0);
+ srv_sys->dummy_ind1->table = table;
+ /* create dummy table and index for new-style infimum and supremum */
+ table = dict_mem_table_create("SYS_DUMMY2",
+ DICT_HDR_SPACE, 1, TRUE);
+ dict_mem_table_add_col(table, "DUMMY", DATA_CHAR,
+ DATA_ENGLISH | DATA_NOT_NULL, 8, 0);
+ srv_sys->dummy_ind2 = dict_mem_index_create("SYS_DUMMY2",
+ "SYS_DUMMY2", DICT_HDR_SPACE, 0, 1);
+ dict_index_add_col(srv_sys->dummy_ind2,
+ dict_table_get_nth_col(table, 0), 0, 0);
+ srv_sys->dummy_ind2->table = table;
+
+ /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+ srv_sys->dummy_ind1->cached = srv_sys->dummy_ind2->cached = TRUE;
+
+ /* Init the server concurrency restriction data structures */
+
+ os_fast_mutex_init(&srv_conc_mutex);
+
+ UT_LIST_INIT(srv_conc_queue);
+
+ srv_conc_slots = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_conc_slot_t));
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+ conc_slot = srv_conc_slots + i;
+ conc_slot->reserved = FALSE;
+ conc_slot->event = os_event_create(NULL);
+ ut_a(conc_slot->event);
+ }
+}
+
+/*************************************************************************
+Frees the OS fast mutex created in srv_init(). */
+
+void
+srv_free(void)
+/*==========*/
+{
+ os_fast_mutex_free(&srv_conc_mutex);
+}
+
+/*************************************************************************
+Initializes the synchronization primitives, memory system, and the thread
+local storage. */
+
+void
+srv_general_init(void)
+/*==================*/
+{
+ os_sync_init();
+ sync_init();
+ mem_init(srv_mem_pool_size);
+ thr_local_init();
+}
+
+/*======================= InnoDB Server FIFO queue =======================*/
+
+/* Maximum allowable purge history length. <=0 means 'infinite'. */
+ulong srv_max_purge_lag = 0;
+
+/*************************************************************************
+Puts an OS thread to wait if there are too many concurrent threads
+(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
+
+void
+srv_conc_enter_innodb(
+/*==================*/
+ trx_t* trx) /* in: transaction object associated with the
+ thread */
+{
+ ibool has_slept = FALSE;
+ srv_conc_slot_t* slot = NULL;
+ ulint i;
+
+ if (srv_thread_concurrency >= 500) {
+ /* Disable the concurrency check */
+
+ return;
+ }
+
+ /* If trx has 'free tickets' to enter the engine left, then use one
+ such ticket */
+
+ if (trx->n_tickets_to_enter_innodb > 0) {
+ trx->n_tickets_to_enter_innodb--;
+
+ return;
+ }
+
+ os_fast_mutex_lock(&srv_conc_mutex);
+retry:
+ if (trx->declared_to_be_inside_innodb) {
+ ut_print_timestamp(stderr);
+ fputs(
+" InnoDB: Error: trying to declare trx to enter InnoDB, but\n"
+"InnoDB: it already is declared.\n", stderr);
+ trx_print(stderr, trx);
+ putc('\n', stderr);
+ os_fast_mutex_unlock(&srv_conc_mutex);
+
+ return;
+ }
+
+ if (srv_conc_n_threads < (lint)srv_thread_concurrency) {
+
+ srv_conc_n_threads++;
+ trx->declared_to_be_inside_innodb = TRUE;
+ trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER;
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+
+ return;
+ }
+
+ /* If the transaction is not holding resources,
+ let it sleep for SRV_THREAD_SLEEP_DELAY microseconds, and try again then */
+
+ if (!has_slept && !trx->has_search_latch
+ && NULL == UT_LIST_GET_FIRST(trx->trx_locks)) {
+
+ has_slept = TRUE; /* We let is sleep only once to avoid
+ starvation */
+
+ srv_conc_n_waiting_threads++;
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+
+ trx->op_info = "sleeping before joining InnoDB queue";
+
+ /* Peter Zaitsev suggested that we take the sleep away
+ altogether. But the sleep may be good in pathological
+ situations of lots of thread switches. Simply put some
+ threads aside for a while to reduce the number of thread
+ switches. */
+ if (SRV_THREAD_SLEEP_DELAY > 0)
+ {
+ os_thread_sleep(SRV_THREAD_SLEEP_DELAY);
+ }
+
+ trx->op_info = "";
+
+ os_fast_mutex_lock(&srv_conc_mutex);
+
+ srv_conc_n_waiting_threads--;
+
+ goto retry;
+ }
+
+ /* Too many threads inside: put the current thread to a queue */
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+ slot = srv_conc_slots + i;
+
+ if (!slot->reserved) {
+
+ break;
+ }
+ }
+
+ if (i == OS_THREAD_MAX_N) {
+ /* Could not find a free wait slot, we must let the
+ thread enter */
+
+ srv_conc_n_threads++;
+ trx->declared_to_be_inside_innodb = TRUE;
+ trx->n_tickets_to_enter_innodb = 0;
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+
+ return;
+ }
+
+ /* Release possible search system latch this thread has */
+ if (trx->has_search_latch) {
+ trx_search_latch_release_if_reserved(trx);
+ }
+
+ /* Add to the queue */
+ slot->reserved = TRUE;
+ slot->wait_ended = FALSE;
+
+ UT_LIST_ADD_LAST(srv_conc_queue, srv_conc_queue, slot);
+
+ os_event_reset(slot->event);
+
+ srv_conc_n_waiting_threads++;
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+
+ /* Go to wait for the event; when a thread leaves InnoDB it will
+ release this thread */
+
+ trx->op_info = "waiting in InnoDB queue";
+
+ os_event_wait(slot->event);
+
+ trx->op_info = "";
+
+ os_fast_mutex_lock(&srv_conc_mutex);
+
+ srv_conc_n_waiting_threads--;
+
+ /* NOTE that the thread which released this thread already
+ incremented the thread counter on behalf of this thread */
+
+ slot->reserved = FALSE;
+
+ UT_LIST_REMOVE(srv_conc_queue, srv_conc_queue, slot);
+
+ trx->declared_to_be_inside_innodb = TRUE;
+ trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER;
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+}
+
+/*************************************************************************
+This lets a thread enter InnoDB regardless of the number of threads inside
+InnoDB. This must be called when a thread ends a lock wait. */
+
+void
+srv_conc_force_enter_innodb(
+/*========================*/
+ trx_t* trx) /* in: transaction object associated with the
+ thread */
+{
+ if (srv_thread_concurrency >= 500) {
+
+ return;
+ }
+
+ os_fast_mutex_lock(&srv_conc_mutex);
+
+ srv_conc_n_threads++;
+ trx->declared_to_be_inside_innodb = TRUE;
+ trx->n_tickets_to_enter_innodb = 0;
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+}
+
+/*************************************************************************
+This must be called when a thread exits InnoDB in a lock wait or at the
+end of an SQL statement. */
+
+void
+srv_conc_force_exit_innodb(
+/*=======================*/
+ trx_t* trx) /* in: transaction object associated with the
+ thread */
+{
+ srv_conc_slot_t* slot = NULL;
+
+ if (srv_thread_concurrency >= 500) {
+
+ return;
+ }
+
+ if (trx->declared_to_be_inside_innodb == FALSE) {
+
+ return;
+ }
+
+ os_fast_mutex_lock(&srv_conc_mutex);
+
+ srv_conc_n_threads--;
+ trx->declared_to_be_inside_innodb = FALSE;
+ trx->n_tickets_to_enter_innodb = 0;
+
+ if (srv_conc_n_threads < (lint)srv_thread_concurrency) {
+ /* Look for a slot where a thread is waiting and no other
+ thread has yet released the thread */
+
+ slot = UT_LIST_GET_FIRST(srv_conc_queue);
+
+ while (slot && slot->wait_ended == TRUE) {
+ slot = UT_LIST_GET_NEXT(srv_conc_queue, slot);
+ }
+
+ if (slot != NULL) {
+ slot->wait_ended = TRUE;
+
+ /* We increment the count on behalf of the released
+ thread */
+
+ srv_conc_n_threads++;
+ }
+ }
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+
+ if (slot != NULL) {
+ os_event_set(slot->event);
+ }
+}
+
+/*************************************************************************
+This must be called when a thread exits InnoDB. */
+
+void
+srv_conc_exit_innodb(
+/*=================*/
+ trx_t* trx) /* in: transaction object associated with the
+ thread */
+{
+ if (srv_thread_concurrency >= 500) {
+
+ return;
+ }
+
+ if (trx->n_tickets_to_enter_innodb > 0) {
+ /* We will pretend the thread is still inside InnoDB though it
+ now leaves the InnoDB engine. In this way we save
+ a lot of semaphore operations. srv_conc_force_exit_innodb is
+ used to declare the thread definitely outside InnoDB. It
+ should be called when there is a lock wait or an SQL statement
+ ends. */
+
+ return;
+ }
+
+ srv_conc_force_exit_innodb(trx);
+}
+
+/*========================================================================*/
+
+/*************************************************************************
+Normalizes init parameter values to use units we use inside InnoDB. */
+static
+ulint
+srv_normalize_init_values(void)
+/*===========================*/
+ /* out: DB_SUCCESS or error code */
+{
+ ulint n;
+ ulint i;
+
+ n = srv_n_data_files;
+
+ for (i = 0; i < n; i++) {
+ srv_data_file_sizes[i] = srv_data_file_sizes[i]
+ * ((1024 * 1024) / UNIV_PAGE_SIZE);
+ }
+
+ srv_last_file_size_max = srv_last_file_size_max
+ * ((1024 * 1024) / UNIV_PAGE_SIZE);
+
+ srv_log_file_size = srv_log_file_size / UNIV_PAGE_SIZE;
+
+ srv_log_buffer_size = srv_log_buffer_size / UNIV_PAGE_SIZE;
+
+ srv_pool_size = srv_pool_size / (UNIV_PAGE_SIZE / 1024);
+
+ srv_awe_window_size = srv_awe_window_size / UNIV_PAGE_SIZE;
+
+ if (srv_use_awe) {
+ /* If we are using AWE we must save memory in the 32-bit
+ address space of the process, and cannot bind the lock
+ table size to the real buffer pool size. */
+
+ srv_lock_table_size = 20 * srv_awe_window_size;
+ } else {
+ srv_lock_table_size = 5 * srv_pool_size;
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*************************************************************************
+Boots the InnoDB server. */
+
+ulint
+srv_boot(void)
+/*==========*/
+ /* out: DB_SUCCESS or error code */
+{
+ ulint err;
+
+ /* Transform the init parameter values given by MySQL to
+ use units we use inside InnoDB: */
+
+ err = srv_normalize_init_values();
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Initialize synchronization primitives, memory management, and thread
+ local storage */
+
+ srv_general_init();
+
+ /* Initialize this module */
+
+ srv_init();
+
+ return(DB_SUCCESS);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************************
+Reserves a slot in the thread table for the current MySQL OS thread.
+NOTE! The kernel mutex has to be reserved by the caller! */
+static
+srv_slot_t*
+srv_table_reserve_slot_for_mysql(void)
+/*==================================*/
+ /* out: reserved slot */
+{
+ srv_slot_t* slot;
+ ulint i;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+ i = 0;
+ slot = srv_mysql_table + i;
+
+ while (slot->in_use) {
+ i++;
+
+ if (i >= OS_THREAD_MAX_N) {
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+" InnoDB: There appear to be %lu MySQL threads currently waiting\n"
+"InnoDB: inside InnoDB, which is the upper limit. Cannot continue operation.\n"
+"InnoDB: We intentionally generate a seg fault to print a stack trace\n"
+"InnoDB: on Linux. But first we print a list of waiting threads.\n", (ulong) i);
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+
+ slot = srv_mysql_table + i;
+
+ fprintf(stderr,
+"Slot %lu: thread id %lu, type %lu, in use %lu, susp %lu, time %lu\n",
+ (ulong) i, (ulong) os_thread_pf(slot->id),
+ (ulong) slot->type, (ulong) slot->in_use,
+ (ulong) slot->suspended,
+ (ulong) difftime(ut_time(), slot->suspend_time));
+ }
+
+ ut_error;
+ }
+
+ slot = srv_mysql_table + i;
+ }
+
+ ut_a(slot->in_use == FALSE);
+
+ slot->in_use = TRUE;
+ slot->id = os_thread_get_curr_id();
+ slot->handle = os_thread_get_curr();
+
+ return(slot);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************
+Puts a MySQL OS thread to wait for a lock to be released. If an error
+occurs during the wait trx->error_state associated with thr is
+!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
+are possible errors. DB_DEADLOCK is returned if selective deadlock
+resolution chose this transaction as a victim. */
+
+void
+srv_suspend_mysql_thread(
+/*=====================*/
+ que_thr_t* thr) /* in: query thread associated with the MySQL
+ OS thread */
+{
+#ifndef UNIV_HOTBACKUP
+ srv_slot_t* slot;
+ os_event_t event;
+ double wait_time;
+ trx_t* trx;
+ ibool had_dict_lock = FALSE;
+ ibool was_declared_inside_innodb = FALSE;
+ ib_longlong start_time = 0;
+ ib_longlong finish_time;
+ ulint diff_time;
+ ulint sec;
+ ulint ms;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+ trx = thr_get_trx(thr);
+
+ os_event_set(srv_lock_timeout_thread_event);
+
+ mutex_enter(&kernel_mutex);
+
+ trx->error_state = DB_SUCCESS;
+
+ if (thr->state == QUE_THR_RUNNING) {
+
+ ut_ad(thr->is_active == TRUE);
+
+ /* The lock has already been released or this transaction
+ was chosen as a deadlock victim: no need to suspend */
+
+ if (trx->was_chosen_as_deadlock_victim) {
+
+ trx->error_state = DB_DEADLOCK;
+ trx->was_chosen_as_deadlock_victim = FALSE;
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ return;
+ }
+
+ ut_ad(thr->is_active == FALSE);
+
+ slot = srv_table_reserve_slot_for_mysql();
+
+ event = slot->event;
+
+ slot->thr = thr;
+
+ os_event_reset(event);
+
+ slot->suspend_time = ut_time();
+
+ if (thr->lock_state == QUE_THR_LOCK_ROW) {
+ srv_n_lock_wait_count++;
+ srv_n_lock_wait_current_count++;
+
+ ut_usectime(&sec, &ms);
+ start_time = (ib_longlong)sec * 1000000 + ms;
+ }
+ /* Wake the lock timeout monitor thread, if it is suspended */
+
+ os_event_set(srv_lock_timeout_thread_event);
+
+ mutex_exit(&kernel_mutex);
+
+ if (trx->declared_to_be_inside_innodb) {
+
+ was_declared_inside_innodb = TRUE;
+
+ /* We must declare this OS thread to exit InnoDB, since a
+ possible other thread holding a lock which this thread waits
+ for must be allowed to enter, sooner or later */
+
+ srv_conc_force_exit_innodb(trx);
+ }
+
+ /* Release possible foreign key check latch */
+ if (trx->dict_operation_lock_mode == RW_S_LATCH) {
+
+ had_dict_lock = TRUE;
+
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ /* Wait for the release */
+
+ os_event_wait(event);
+
+ if (had_dict_lock) {
+
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ if (was_declared_inside_innodb) {
+
+ /* Return back inside InnoDB */
+
+ srv_conc_force_enter_innodb(trx);
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ /* Release the slot for others to use */
+
+ slot->in_use = FALSE;
+
+ wait_time = ut_difftime(ut_time(), slot->suspend_time);
+
+ if (thr->lock_state == QUE_THR_LOCK_ROW) {
+ ut_usectime(&sec, &ms);
+ finish_time = (ib_longlong)sec * 1000000 + ms;
+
+ diff_time = (ulint) (finish_time - start_time);
+
+ srv_n_lock_wait_current_count--;
+ srv_n_lock_wait_time = srv_n_lock_wait_time + diff_time;
+ if (diff_time > srv_n_lock_max_wait_time) {
+ srv_n_lock_max_wait_time = diff_time;
+ }
+ }
+
+ if (trx->was_chosen_as_deadlock_victim) {
+
+ trx->error_state = DB_DEADLOCK;
+ trx->was_chosen_as_deadlock_victim = FALSE;
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ if (srv_lock_wait_timeout < 100000000 &&
+ wait_time > (double)srv_lock_wait_timeout) {
+
+ trx->error_state = DB_LOCK_WAIT_TIMEOUT;
+ }
+#else /* UNIV_HOTBACKUP */
+ /* This function depends on MySQL code that is not included in
+ InnoDB Hot Backup builds. Besides, this function should never
+ be called in InnoDB Hot Backup. */
+ ut_error;
+#endif /* UNIV_HOTBACKUP */
+}
+
+/************************************************************************
+Releases a MySQL OS thread waiting for a lock to be released, if the
+thread is already suspended. */
+
+void
+srv_release_mysql_thread_if_suspended(
+/*==================================*/
+ que_thr_t* thr) /* in: query thread associated with the
+ MySQL OS thread */
+{
+#ifndef UNIV_HOTBACKUP
+ srv_slot_t* slot;
+ ulint i;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+
+ slot = srv_mysql_table + i;
+
+ if (slot->in_use && slot->thr == thr) {
+ /* Found */
+
+ os_event_set(slot->event);
+
+ return;
+ }
+ }
+
+ /* not found */
+#else /* UNIV_HOTBACKUP */
+ /* This function depends on MySQL code that is not included in
+ InnoDB Hot Backup builds. Besides, this function should never
+ be called in InnoDB Hot Backup. */
+ ut_error;
+#endif /* UNIV_HOTBACKUP */
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************
+Refreshes the values used to calculate per-second averages. */
+static
+void
+srv_refresh_innodb_monitor_stats(void)
+/*==================================*/
+{
+ mutex_enter(&srv_innodb_monitor_mutex);
+
+ srv_last_monitor_time = time(NULL);
+
+ os_aio_refresh_stats();
+
+ btr_cur_n_sea_old = btr_cur_n_sea;
+ btr_cur_n_non_sea_old = btr_cur_n_non_sea;
+
+ log_refresh_stats();
+
+ buf_refresh_io_stats();
+
+ srv_n_rows_inserted_old = srv_n_rows_inserted;
+ srv_n_rows_updated_old = srv_n_rows_updated;
+ srv_n_rows_deleted_old = srv_n_rows_deleted;
+ srv_n_rows_read_old = srv_n_rows_read;
+
+ mutex_exit(&srv_innodb_monitor_mutex);
+}
+
+/**********************************************************************
+Outputs to a file the output of the InnoDB Monitor. */
+
+void
+srv_printf_innodb_monitor(
+/*======================*/
+ FILE* file) /* in: output stream */
+{
+ double time_elapsed;
+ time_t current_time;
+ ulint n_reserved;
+
+ mutex_enter(&srv_innodb_monitor_mutex);
+
+ current_time = time(NULL);
+
+ /* We add 0.001 seconds to time_elapsed to prevent division
+ by zero if two users happen to call SHOW INNODB STATUS at the same
+ time */
+
+ time_elapsed = difftime(current_time, srv_last_monitor_time)
+ + 0.001;
+
+ srv_last_monitor_time = time(NULL);
+
+ fputs("\n=====================================\n", file);
+
+ ut_print_timestamp(file);
+ fprintf(file,
+ " INNODB MONITOR OUTPUT\n"
+ "=====================================\n"
+ "Per second averages calculated from the last %lu seconds\n",
+ (ulong)time_elapsed);
+
+ fputs("----------\n"
+ "SEMAPHORES\n"
+ "----------\n", file);
+ sync_print(file);
+
+ /* Conceptually, srv_innodb_monitor_mutex has a very high latching
+ order level in sync0sync.h, while dict_foreign_err_mutex has a very
+ low level 135. Therefore we can reserve the latter mutex here without
+ a danger of a deadlock of threads. */
+
+ mutex_enter(&dict_foreign_err_mutex);
+
+ if (ftell(dict_foreign_err_file) != 0L) {
+ fputs("------------------------\n"
+ "LATEST FOREIGN KEY ERROR\n"
+ "------------------------\n", file);
+ ut_copy_file(file, dict_foreign_err_file);
+ }
+
+ mutex_exit(&dict_foreign_err_mutex);
+
+ lock_print_info(file);
+ fputs("--------\n"
+ "FILE I/O\n"
+ "--------\n", file);
+ os_aio_print(file);
+
+ fputs("-------------------------------------\n"
+ "INSERT BUFFER AND ADAPTIVE HASH INDEX\n"
+ "-------------------------------------\n", file);
+ ibuf_print(file);
+
+ ha_print_info(file, btr_search_sys->hash_index);
+
+ fprintf(file,
+ "%.2f hash searches/s, %.2f non-hash searches/s\n",
+ (btr_cur_n_sea - btr_cur_n_sea_old)
+ / time_elapsed,
+ (btr_cur_n_non_sea - btr_cur_n_non_sea_old)
+ / time_elapsed);
+ btr_cur_n_sea_old = btr_cur_n_sea;
+ btr_cur_n_non_sea_old = btr_cur_n_non_sea;
+
+ fputs("---\n"
+ "LOG\n"
+ "---\n", file);
+ log_print(file);
+
+ fputs("----------------------\n"
+ "BUFFER POOL AND MEMORY\n"
+ "----------------------\n", file);
+ fprintf(file,
+ "Total memory allocated " ULINTPF
+ "; in additional pool allocated " ULINTPF "\n",
+ ut_total_allocated_memory,
+ mem_pool_get_reserved(mem_comm_pool));
+
+ if (srv_use_awe) {
+ fprintf(file,
+ "In addition to that %lu MB of AWE memory allocated\n",
+ (ulong) (srv_pool_size / ((1024 * 1024) / UNIV_PAGE_SIZE)));
+ }
+
+ buf_print_io(file);
+
+ fputs("--------------\n"
+ "ROW OPERATIONS\n"
+ "--------------\n", file);
+ fprintf(file, "%ld queries inside InnoDB, %lu queries in queue\n",
+ (long) srv_conc_n_threads,
+ (ulong) srv_conc_n_waiting_threads);
+ n_reserved = fil_space_get_n_reserved_extents(0);
+ if (n_reserved > 0) {
+ fprintf(file,
+ "%lu tablespace extents now reserved for B-tree split operations\n",
+ (ulong) n_reserved);
+ }
+
+#ifdef UNIV_LINUX
+ fprintf(file, "Main thread process no. %lu, id %lu, state: %s\n",
+ (ulong) srv_main_thread_process_no,
+ (ulong) srv_main_thread_id,
+ srv_main_thread_op_info);
+#else
+ fprintf(file, "Main thread id %lu, state: %s\n",
+ (ulong) srv_main_thread_id,
+ srv_main_thread_op_info);
+#endif
+ fprintf(file,
+ "Number of rows inserted " ULINTPF
+ ", updated " ULINTPF ", deleted " ULINTPF ", read " ULINTPF "\n",
+ srv_n_rows_inserted,
+ srv_n_rows_updated,
+ srv_n_rows_deleted,
+ srv_n_rows_read);
+ fprintf(file,
+ "%.2f inserts/s, %.2f updates/s, %.2f deletes/s, %.2f reads/s\n",
+ (srv_n_rows_inserted - srv_n_rows_inserted_old)
+ / time_elapsed,
+ (srv_n_rows_updated - srv_n_rows_updated_old)
+ / time_elapsed,
+ (srv_n_rows_deleted - srv_n_rows_deleted_old)
+ / time_elapsed,
+ (srv_n_rows_read - srv_n_rows_read_old)
+ / time_elapsed);
+
+ srv_n_rows_inserted_old = srv_n_rows_inserted;
+ srv_n_rows_updated_old = srv_n_rows_updated;
+ srv_n_rows_deleted_old = srv_n_rows_deleted;
+ srv_n_rows_read_old = srv_n_rows_read;
+
+ fputs("----------------------------\n"
+ "END OF INNODB MONITOR OUTPUT\n"
+ "============================\n", file);
+ mutex_exit(&srv_innodb_monitor_mutex);
+ fflush(file);
+}
+
+/**********************************************************************
+Function to pass InnoDB status variables to MySQL */
+
+void
+srv_export_innodb_status(void)
+{
+
+ mutex_enter(&srv_innodb_monitor_mutex);
+ export_vars.innodb_data_pending_reads= os_n_pending_reads;
+ export_vars.innodb_data_pending_writes= os_n_pending_writes;
+ export_vars.innodb_data_pending_fsyncs=
+ fil_n_pending_log_flushes + fil_n_pending_tablespace_flushes;
+ export_vars.innodb_data_fsyncs= os_n_fsyncs;
+ export_vars.innodb_data_read= srv_data_read;
+ export_vars.innodb_data_reads= os_n_file_reads;
+ export_vars.innodb_data_writes= os_n_file_writes;
+ export_vars.innodb_data_written= srv_data_written;
+ export_vars.innodb_buffer_pool_read_requests= buf_pool->n_page_gets;
+ export_vars.innodb_buffer_pool_write_requests= srv_buf_pool_write_requests;
+ export_vars.innodb_buffer_pool_wait_free= srv_buf_pool_wait_free;
+ export_vars.innodb_buffer_pool_pages_flushed= srv_buf_pool_flushed;
+ export_vars.innodb_buffer_pool_reads= srv_buf_pool_reads;
+ export_vars.innodb_buffer_pool_read_ahead_rnd= srv_read_ahead_rnd;
+ export_vars.innodb_buffer_pool_read_ahead_seq= srv_read_ahead_seq;
+ export_vars.innodb_buffer_pool_pages_data= UT_LIST_GET_LEN(buf_pool->LRU);
+ export_vars.innodb_buffer_pool_pages_dirty= UT_LIST_GET_LEN(buf_pool->flush_list);
+ export_vars.innodb_buffer_pool_pages_free= UT_LIST_GET_LEN(buf_pool->free);
+ export_vars.innodb_buffer_pool_pages_latched= buf_get_latched_pages_number();
+ export_vars.innodb_buffer_pool_pages_total= buf_pool->curr_size;
+ export_vars.innodb_buffer_pool_pages_misc= buf_pool->max_size -
+ UT_LIST_GET_LEN(buf_pool->LRU) - UT_LIST_GET_LEN(buf_pool->free);
+ export_vars.innodb_page_size= UNIV_PAGE_SIZE;
+ export_vars.innodb_log_waits= srv_log_waits;
+ export_vars.innodb_os_log_written= srv_os_log_written;
+ export_vars.innodb_os_log_fsyncs= fil_n_log_flushes;
+ export_vars.innodb_os_log_pending_fsyncs= fil_n_pending_log_flushes;
+ export_vars.innodb_os_log_pending_writes= srv_os_log_pending_writes;
+ export_vars.innodb_log_write_requests= srv_log_write_requests;
+ export_vars.innodb_log_writes= srv_log_writes;
+ export_vars.innodb_dblwr_pages_written= srv_dblwr_pages_written;
+ export_vars.innodb_dblwr_writes= srv_dblwr_writes;
+ export_vars.innodb_pages_created= buf_pool->n_pages_created;
+ export_vars.innodb_pages_read= buf_pool->n_pages_read;
+ export_vars.innodb_pages_written= buf_pool->n_pages_written;
+ export_vars.innodb_row_lock_waits= srv_n_lock_wait_count;
+ export_vars.innodb_row_lock_current_waits= srv_n_lock_wait_current_count;
+ export_vars.innodb_row_lock_time= srv_n_lock_wait_time / 10000;
+ if (srv_n_lock_wait_count > 0) {
+ export_vars.innodb_row_lock_time_avg = (ulint)
+ (srv_n_lock_wait_time / 10000 / srv_n_lock_wait_count);
+ } else {
+ export_vars.innodb_row_lock_time_avg = 0;
+ }
+ export_vars.innodb_row_lock_time_max= srv_n_lock_max_wait_time / 10000;
+ export_vars.innodb_rows_read= srv_n_rows_read;
+ export_vars.innodb_rows_inserted= srv_n_rows_inserted;
+ export_vars.innodb_rows_updated= srv_n_rows_updated;
+ export_vars.innodb_rows_deleted= srv_n_rows_deleted;
+ mutex_exit(&srv_innodb_monitor_mutex);
+
+}
+
+/*************************************************************************
+A thread which wakes up threads whose lock wait may have lasted too long.
+This also prints the info output by various InnoDB monitors. */
+
+#ifndef __WIN__
+void*
+#else
+ulint
+#endif
+srv_lock_timeout_and_monitor_thread(
+/*================================*/
+ /* out: a dummy parameter */
+ void* arg __attribute__((unused)))
+ /* in: a dummy parameter required by
+ os_thread_create */
+{
+ srv_slot_t* slot;
+ double time_elapsed;
+ time_t current_time;
+ time_t last_table_monitor_time;
+ time_t last_monitor_time;
+ ibool some_waits;
+ double wait_time;
+ ulint i;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Lock timeout thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif
+ UT_NOT_USED(arg);
+ srv_last_monitor_time = time(NULL);
+ last_table_monitor_time = time(NULL);
+ last_monitor_time = time(NULL);
+loop:
+ srv_lock_timeout_and_monitor_active = TRUE;
+
+ /* When someone is waiting for a lock, we wake up every second
+ and check if a timeout has passed for a lock wait */
+
+ os_thread_sleep(1000000);
+
+ /* In case mutex_exit is not a memory barrier, it is
+ theoretically possible some threads are left waiting though
+ the semaphore is already released. Wake up those threads: */
+
+ sync_arr_wake_threads_if_sema_free();
+
+ current_time = time(NULL);
+
+ time_elapsed = difftime(current_time, last_monitor_time);
+
+ if (time_elapsed > 15) {
+ last_monitor_time = time(NULL);
+
+ if (srv_print_innodb_monitor) {
+ srv_printf_innodb_monitor(stderr);
+ }
+
+ if (srv_innodb_status) {
+ mutex_enter(&srv_monitor_file_mutex);
+ rewind(srv_monitor_file);
+ srv_printf_innodb_monitor(srv_monitor_file);
+ os_file_set_eof(srv_monitor_file);
+ mutex_exit(&srv_monitor_file_mutex);
+ }
+
+ if (srv_print_innodb_tablespace_monitor
+ && difftime(current_time, last_table_monitor_time) > 60) {
+
+ last_table_monitor_time = time(NULL);
+
+ fputs("================================================\n",
+ stderr);
+
+ ut_print_timestamp(stderr);
+
+ fputs(" INNODB TABLESPACE MONITOR OUTPUT\n"
+ "================================================\n",
+ stderr);
+
+ fsp_print(0);
+ fputs("Validating tablespace\n", stderr);
+ fsp_validate(0);
+ fputs("Validation ok\n"
+ "---------------------------------------\n"
+ "END OF INNODB TABLESPACE MONITOR OUTPUT\n"
+ "=======================================\n",
+ stderr);
+ }
+
+ if (srv_print_innodb_table_monitor
+ && difftime(current_time, last_table_monitor_time) > 60) {
+
+ last_table_monitor_time = time(NULL);
+
+ fputs("===========================================\n", stderr);
+
+ ut_print_timestamp(stderr);
+
+ fputs(" INNODB TABLE MONITOR OUTPUT\n"
+ "===========================================\n",
+ stderr);
+ dict_print();
+
+ fputs("-----------------------------------\n"
+ "END OF INNODB TABLE MONITOR OUTPUT\n"
+ "==================================\n",
+ stderr);
+ }
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ some_waits = FALSE;
+
+ /* Check of all slots if a thread is waiting there, and if it
+ has exceeded the time limit */
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+
+ slot = srv_mysql_table + i;
+
+ if (slot->in_use) {
+ some_waits = TRUE;
+
+ wait_time = ut_difftime(ut_time(), slot->suspend_time);
+
+ if (srv_lock_wait_timeout < 100000000 &&
+ (wait_time > (double) srv_lock_wait_timeout
+ || wait_time < 0)) {
+
+ /* Timeout exceeded or a wrap-around in system
+ time counter: cancel the lock request queued
+ by the transaction and release possible
+ other transactions waiting behind; it is
+ possible that the lock has already been
+ granted: in that case do nothing */
+
+ if (thr_get_trx(slot->thr)->wait_lock) {
+ lock_cancel_waiting_and_release(
+ thr_get_trx(slot->thr)->wait_lock);
+ }
+ }
+ }
+ }
+
+ os_event_reset(srv_lock_timeout_thread_event);
+
+ mutex_exit(&kernel_mutex);
+
+ if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
+ goto exit_func;
+ }
+
+ if (some_waits || srv_print_innodb_monitor
+ || srv_print_innodb_lock_monitor
+ || srv_print_innodb_tablespace_monitor
+ || srv_print_innodb_table_monitor) {
+ goto loop;
+ }
+
+ /* No one was waiting for a lock and no monitor was active:
+ suspend this thread */
+
+ srv_lock_timeout_and_monitor_active = FALSE;
+
+#if 0
+ /* The following synchronisation is disabled, since
+ the InnoDB monitor output is to be updated every 15 seconds. */
+ os_event_wait(srv_lock_timeout_thread_event);
+#endif
+ goto loop;
+
+exit_func:
+ srv_lock_timeout_and_monitor_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit(NULL);
+#ifndef __WIN__
+ return(NULL);
+#else
+ return(0);
+#endif
+}
+
+/*************************************************************************
+A thread which prints warnings about semaphore waits which have lasted
+too long. These can be used to track bugs which cause hangs. */
+
+#ifndef __WIN__
+void*
+#else
+ulint
+#endif
+srv_error_monitor_thread(
+/*=====================*/
+ /* out: a dummy parameter */
+ void* arg __attribute__((unused)))
+ /* in: a dummy parameter required by
+ os_thread_create */
+{
+ /* number of successive fatal timeouts observed */
+ ulint fatal_cnt = 0;
+ dulint old_lsn;
+ dulint new_lsn;
+
+ old_lsn = srv_start_lsn;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Error monitor thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif
+loop:
+ srv_error_monitor_active = TRUE;
+
+ /* Try to track a strange bug reported by Harald Fuchs and others,
+ where the lsn seems to decrease at times */
+
+ new_lsn = log_get_lsn();
+
+ if (ut_dulint_cmp(new_lsn, old_lsn) < 0) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Error: old log sequence number %lu %lu was greater\n"
+"InnoDB: than the new log sequence number %lu %lu!\n"
+"InnoDB: Please send a bug report to mysql@lists.mysql.com\n",
+ (ulong) ut_dulint_get_high(old_lsn),
+ (ulong) ut_dulint_get_low(old_lsn),
+ (ulong) ut_dulint_get_high(new_lsn),
+ (ulong) ut_dulint_get_low(new_lsn));
+ }
+
+ old_lsn = new_lsn;
+
+ if (difftime(time(NULL), srv_last_monitor_time) > 60) {
+ /* We referesh InnoDB Monitor values so that averages are
+ printed from at most 60 last seconds */
+
+ srv_refresh_innodb_monitor_stats();
+ }
+
+ if (sync_array_print_long_waits()) {
+ fatal_cnt++;
+ if (fatal_cnt > 5) {
+
+ fprintf(stderr,
+"InnoDB: Error: semaphore wait has lasted > %lu seconds\n"
+"InnoDB: We intentionally crash the server, because it appears to be hung.\n",
+ srv_fatal_semaphore_wait_threshold);
+
+ ut_error;
+ }
+ } else {
+ fatal_cnt = 0;
+ }
+
+ /* Flush stderr so that a database user gets the output
+ to possible MySQL error file */
+
+ fflush(stderr);
+
+ os_thread_sleep(2000000);
+
+ if (srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE) {
+
+ goto loop;
+ }
+
+ srv_error_monitor_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit(NULL);
+
+#ifndef __WIN__
+ return(NULL);
+#else
+ return(0);
+#endif
+}
+
+/***********************************************************************
+Tells the InnoDB server that there has been activity in the database
+and wakes up the master thread if it is suspended (not sleeping). Used
+in the MySQL interface. Note that there is a small chance that the master
+thread stays suspended (we do not protect our operation with the kernel
+mutex, for performace reasons). */
+
+void
+srv_active_wake_master_thread(void)
+/*===============================*/
+{
+ srv_activity_count++;
+
+ if (srv_n_threads_active[SRV_MASTER] == 0) {
+
+ mutex_enter(&kernel_mutex);
+
+ srv_release_threads(SRV_MASTER, 1);
+
+ mutex_exit(&kernel_mutex);
+ }
+}
+
+/***********************************************************************
+Wakes up the master thread if it is suspended or being suspended. */
+
+void
+srv_wake_master_thread(void)
+/*========================*/
+{
+ srv_activity_count++;
+
+ mutex_enter(&kernel_mutex);
+
+ srv_release_threads(SRV_MASTER, 1);
+
+ mutex_exit(&kernel_mutex);
+}
+
+/*************************************************************************
+The master thread controlling the server. */
+
+#ifndef __WIN__
+void*
+#else
+ulint
+#endif
+srv_master_thread(
+/*==============*/
+ /* out: a dummy parameter */
+ void* arg __attribute__((unused)))
+ /* in: a dummy parameter required by
+ os_thread_create */
+{
+ os_event_t event;
+ time_t last_flush_time;
+ time_t current_time;
+ ulint old_activity_count;
+ ulint n_pages_purged;
+ ulint n_bytes_merged;
+ ulint n_pages_flushed;
+ ulint n_bytes_archived;
+ ulint n_tables_to_drop;
+ ulint n_ios;
+ ulint n_ios_old;
+ ulint n_ios_very_old;
+ ulint n_pend_ios;
+ ibool skip_sleep = FALSE;
+ ulint i;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Master thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif
+ srv_main_thread_process_no = os_proc_get_number();
+ srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());
+
+ srv_table_reserve_slot(SRV_MASTER);
+
+ mutex_enter(&kernel_mutex);
+
+ srv_n_threads_active[SRV_MASTER]++;
+
+ mutex_exit(&kernel_mutex);
+
+ os_event_set(srv_sys->operational);
+loop:
+ /*****************************************************************/
+ /* ---- When there is database activity by users, we cycle in this
+ loop */
+
+ srv_main_thread_op_info = "reserving kernel mutex";
+
+ n_ios_very_old = log_sys->n_log_ios + buf_pool->n_pages_read
+ + buf_pool->n_pages_written;
+ mutex_enter(&kernel_mutex);
+
+ /* Store the user activity counter at the start of this loop */
+ old_activity_count = srv_activity_count;
+
+ mutex_exit(&kernel_mutex);
+
+ if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND) {
+
+ goto suspend_thread;
+ }
+
+ /* ---- We run the following loop approximately once per second
+ when there is database activity */
+
+ skip_sleep = FALSE;
+
+ for (i = 0; i < 10; i++) {
+ n_ios_old = log_sys->n_log_ios + buf_pool->n_pages_read
+ + buf_pool->n_pages_written;
+ srv_main_thread_op_info = "sleeping";
+
+ if (!skip_sleep) {
+
+ os_thread_sleep(1000000);
+ }
+
+ skip_sleep = FALSE;
+
+ /* ALTER TABLE in MySQL requires on Unix that the table handler
+ can drop tables lazily after there no longer are SELECT
+ queries to them. */
+
+ srv_main_thread_op_info = "doing background drop tables";
+
+ row_drop_tables_for_mysql_in_background();
+
+ srv_main_thread_op_info = "";
+
+ if (srv_fast_shutdown && srv_shutdown_state > 0) {
+
+ goto background_loop;
+ }
+
+ /* We flush the log once in a second even if no commit
+ is issued or the we have specified in my.cnf no flush
+ at transaction commit */
+
+ srv_main_thread_op_info = "flushing log";
+ log_buffer_flush_to_disk();
+
+ srv_main_thread_op_info = "making checkpoint";
+ log_free_check();
+
+ /* If there were less than 5 i/os during the
+ one second sleep, we assume that there is free
+ disk i/o capacity available, and it makes sense to
+ do an insert buffer merge. */
+
+ n_pend_ios = buf_get_n_pending_ios()
+ + log_sys->n_pending_writes;
+ n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
+ + buf_pool->n_pages_written;
+ if (n_pend_ios < 3 && (n_ios - n_ios_old < 5)) {
+ srv_main_thread_op_info = "doing insert buffer merge";
+ ibuf_contract_for_n_pages(TRUE, 5);
+
+ srv_main_thread_op_info = "flushing log";
+
+ log_buffer_flush_to_disk();
+ }
+
+ if (buf_get_modified_ratio_pct() >
+ srv_max_buf_pool_modified_pct) {
+
+ /* Try to keep the number of modified pages in the
+ buffer pool under the limit wished by the user */
+
+ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
+ ut_dulint_max);
+
+ /* If we had to do the flush, it may have taken
+ even more than 1 second, and also, there may be more
+ to flush. Do not sleep 1 second during the next
+ iteration of this loop. */
+
+ skip_sleep = TRUE;
+ }
+
+ if (srv_activity_count == old_activity_count) {
+
+ /* There is no user activity at the moment, go to
+ the background loop */
+
+ goto background_loop;
+ }
+ }
+
+ /* ---- We perform the following code approximately once per
+ 10 seconds when there is database activity */
+
+#ifdef MEM_PERIODIC_CHECK
+ /* Check magic numbers of every allocated mem block once in 10
+ seconds */
+ mem_validate_all_blocks();
+#endif
+ /* If there were less than 200 i/os during the 10 second period,
+ we assume that there is free disk i/o capacity available, and it
+ makes sense to flush 100 pages. */
+
+ n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes;
+ n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
+ + buf_pool->n_pages_written;
+ if (n_pend_ios < 3 && (n_ios - n_ios_very_old < 200)) {
+
+ srv_main_thread_op_info = "flushing buffer pool pages";
+ buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
+
+ srv_main_thread_op_info = "flushing log";
+ log_buffer_flush_to_disk();
+ }
+
+ /* We run a batch of insert buffer merge every 10 seconds,
+ even if the server were active */
+
+ srv_main_thread_op_info = "doing insert buffer merge";
+ ibuf_contract_for_n_pages(TRUE, 5);
+
+ srv_main_thread_op_info = "flushing log";
+ log_buffer_flush_to_disk();
+
+ /* We run a full purge every 10 seconds, even if the server
+ were active */
+
+ n_pages_purged = 1;
+
+ last_flush_time = time(NULL);
+
+ while (n_pages_purged) {
+
+ if (srv_fast_shutdown && srv_shutdown_state > 0) {
+
+ goto background_loop;
+ }
+
+ srv_main_thread_op_info = "purging";
+ n_pages_purged = trx_purge();
+
+ current_time = time(NULL);
+
+ if (difftime(current_time, last_flush_time) > 1) {
+ srv_main_thread_op_info = "flushing log";
+
+ log_buffer_flush_to_disk();
+ last_flush_time = current_time;
+ }
+ }
+
+ srv_main_thread_op_info = "flushing buffer pool pages";
+
+ /* Flush a few oldest pages to make a new checkpoint younger */
+
+ if (buf_get_modified_ratio_pct() > 70) {
+
+ /* If there are lots of modified pages in the buffer pool
+ (> 70 %), we assume we can afford reserving the disk(s) for
+ the time it requires to flush 100 pages */
+
+ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
+ ut_dulint_max);
+ } else {
+ /* Otherwise, we only flush a small number of pages so that
+ we do not unnecessarily use much disk i/o capacity from
+ other work */
+
+ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 10,
+ ut_dulint_max);
+ }
+
+ srv_main_thread_op_info = "making checkpoint";
+
+ /* Make a new checkpoint about once in 10 seconds */
+
+ log_checkpoint(TRUE, FALSE);
+
+ srv_main_thread_op_info = "reserving kernel mutex";
+
+ mutex_enter(&kernel_mutex);
+
+ /* ---- When there is database activity, we jump from here back to
+ the start of loop */
+
+ if (srv_activity_count != old_activity_count) {
+ mutex_exit(&kernel_mutex);
+ goto loop;
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ /* If the database is quiet, we enter the background loop */
+
+ /*****************************************************************/
+background_loop:
+ /* ---- In this loop we run background operations when the server
+ is quiet from user activity. Also in the case of a shutdown, we
+ loop here, flushing the buffer pool to the data files. */
+
+ /* The server has been quiet for a while: start running background
+ operations */
+
+ srv_main_thread_op_info = "doing background drop tables";
+
+ n_tables_to_drop = row_drop_tables_for_mysql_in_background();
+
+ if (n_tables_to_drop > 0) {
+ /* Do not monopolize the CPU even if there are tables waiting
+ in the background drop queue. (It is essentially a bug if
+ MySQL tries to drop a table while there are still open handles
+ to it and we had to put it to the background drop queue.) */
+
+ os_thread_sleep(100000);
+ }
+
+ srv_main_thread_op_info = "purging";
+
+ /* Run a full purge */
+
+ n_pages_purged = 1;
+
+ last_flush_time = time(NULL);
+
+ while (n_pages_purged) {
+ if (srv_fast_shutdown && srv_shutdown_state > 0) {
+
+ break;
+ }
+
+ srv_main_thread_op_info = "purging";
+ n_pages_purged = trx_purge();
+
+ current_time = time(NULL);
+
+ if (difftime(current_time, last_flush_time) > 1) {
+ srv_main_thread_op_info = "flushing log";
+
+ log_buffer_flush_to_disk();
+ last_flush_time = current_time;
+ }
+ }
+
+ srv_main_thread_op_info = "reserving kernel mutex";
+
+ mutex_enter(&kernel_mutex);
+ if (srv_activity_count != old_activity_count) {
+ mutex_exit(&kernel_mutex);
+ goto loop;
+ }
+ mutex_exit(&kernel_mutex);
+
+ srv_main_thread_op_info = "doing insert buffer merge";
+
+ if (srv_fast_shutdown && srv_shutdown_state > 0) {
+ n_bytes_merged = 0;
+ } else {
+ n_bytes_merged = ibuf_contract_for_n_pages(TRUE, 20);
+ }
+
+ srv_main_thread_op_info = "reserving kernel mutex";
+
+ mutex_enter(&kernel_mutex);
+ if (srv_activity_count != old_activity_count) {
+ mutex_exit(&kernel_mutex);
+ goto loop;
+ }
+ mutex_exit(&kernel_mutex);
+
+flush_loop:
+ srv_main_thread_op_info = "flushing buffer pool pages";
+
+ if (!srv_very_fast_shutdown) {
+ n_pages_flushed =
+ buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
+ } else {
+ /* In a 'very fast' shutdown we do not flush the buffer pool
+ to data files: we set n_pages_flushed to 0 artificially. */
+
+ n_pages_flushed = 0;
+ }
+
+ srv_main_thread_op_info = "reserving kernel mutex";
+
+ mutex_enter(&kernel_mutex);
+ if (srv_activity_count != old_activity_count) {
+ mutex_exit(&kernel_mutex);
+ goto loop;
+ }
+ mutex_exit(&kernel_mutex);
+
+ srv_main_thread_op_info = "waiting for buffer pool flush to end";
+ buf_flush_wait_batch_end(BUF_FLUSH_LIST);
+
+ srv_main_thread_op_info = "flushing log";
+
+ log_buffer_flush_to_disk();
+
+ srv_main_thread_op_info = "making checkpoint";
+
+ log_checkpoint(TRUE, FALSE);
+
+ if (buf_get_modified_ratio_pct() > srv_max_buf_pool_modified_pct) {
+
+ /* Try to keep the number of modified pages in the
+ buffer pool under the limit wished by the user */
+
+ goto flush_loop;
+ }
+
+ srv_main_thread_op_info = "reserving kernel mutex";
+
+ mutex_enter(&kernel_mutex);
+ if (srv_activity_count != old_activity_count) {
+ mutex_exit(&kernel_mutex);
+ goto loop;
+ }
+ mutex_exit(&kernel_mutex);
+/*
+ srv_main_thread_op_info = "archiving log (if log archive is on)";
+
+ log_archive_do(FALSE, &n_bytes_archived);
+*/
+ n_bytes_archived = 0;
+
+ /* Keep looping in the background loop if still work to do */
+
+ if (srv_fast_shutdown && srv_shutdown_state > 0) {
+ if (n_tables_to_drop + n_pages_flushed
+ + n_bytes_archived != 0) {
+
+ /* If we are doing a fast shutdown (= the default)
+ we do not do purge or insert buffer merge. But we
+ flush the buffer pool completely to disk.
+ In a 'very fast' shutdown we do not flush the buffer
+ pool to data files: we have set n_pages_flushed to
+ 0 artificially. */
+
+ goto background_loop;
+ }
+ } else if (n_tables_to_drop +
+ n_pages_purged + n_bytes_merged + n_pages_flushed
+ + n_bytes_archived != 0) {
+ /* In a 'slow' shutdown we run purge and the insert buffer
+ merge to completion */
+
+ goto background_loop;
+ }
+
+ /* There is no work for background operations either: suspend
+ master thread to wait for more server activity */
+
+suspend_thread:
+ srv_main_thread_op_info = "suspending";
+
+ mutex_enter(&kernel_mutex);
+
+ if (row_get_background_drop_list_len_low() > 0) {
+ mutex_exit(&kernel_mutex);
+
+ goto loop;
+ }
+
+ event = srv_suspend_thread();
+
+ mutex_exit(&kernel_mutex);
+
+ srv_main_thread_op_info = "waiting for server activity";
+
+ os_event_wait(event);
+
+ if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+ /* This is only extra safety, the thread should exit
+ already when the event wait ends */
+
+ os_thread_exit(NULL);
+ }
+
+ /* When there is user activity, InnoDB will set the event and the main
+ thread goes back to loop: */
+
+ goto loop;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit.
+ The thread actually never comes here because it is exited in an
+ os_event_wait(). */
+
+ os_thread_exit(NULL);
+
+#ifndef __WIN__
+ return(NULL); /* Not reached */
+#else
+ return(0);
+#endif
+}
+#endif /* !UNIV_HOTBACKUP */