diff options
author | Jan Lindström <jplindst@mariadb.org> | 2014-03-03 14:27:56 +0200 |
---|---|---|
committer | Jan Lindström <jplindst@mariadb.org> | 2014-03-03 14:27:56 +0200 |
commit | 96100d6652831d6423cc59bb3f065ab7f6f0cf85 (patch) | |
tree | 3a6c9864c9add695428241931c97899fa7c9a500 /storage/innobase/buf/buf0dblwr.cc | |
parent | b67892cf59872867514709784c54526434784ea9 (diff) | |
parent | 4b3cf4aa26833206c650be2e40caebf6191d5302 (diff) | |
download | mariadb-git-96100d6652831d6423cc59bb3f065ab7f6f0cf85.tar.gz |
Merge: lp:maria/10.0 latest.
Diffstat (limited to 'storage/innobase/buf/buf0dblwr.cc')
-rw-r--r-- | storage/innobase/buf/buf0dblwr.cc | 198 |
1 files changed, 121 insertions, 77 deletions
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index 2ae67d8a41e..305e2ebba94 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -1,7 +1,7 @@ /***************************************************************************** -Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, SkySQL Ab. All Rights Reserved. +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -39,11 +39,6 @@ Created 2011/12/19 #ifndef UNIV_HOTBACKUP -/** Time in milliseconds that we sleep when unable to find a slot in -the doublewrite buffer or when we have to wait for a running batch -to end. */ -#define TRX_DOUBLEWRITE_BATCH_POLL_DELAY 10000 - #ifdef UNIV_PFS_MUTEX /* Key to register the mutex with performance schema */ UNIV_INTERN mysql_pfs_key_t buf_dblwr_mutex_key; @@ -105,6 +100,25 @@ buf_dblwr_get( return(buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE); } +/********************************************************************//** +Flush a batch of writes to the datafiles that have already been +written to the dblwr buffer on disk. */ +UNIV_INLINE +void +buf_dblwr_sync_datafiles() +/*======================*/ +{ + /* Wake possible simulated aio thread to actually post the + writes to the operating system */ + os_aio_simulated_wake_handler_threads(); + + /* Wait that all async writes to tablespaces have been posted to + the OS */ + os_aio_wait_until_no_pending_writes(); + + /* Now we flush the data to disk (for example, with fsync) */ + fil_flush_file_spaces(FIL_TABLESPACE); +} /****************************************************************//** Creates or initialializes the doublewrite buffer at a database start. */ @@ -132,6 +146,8 @@ buf_dblwr_init( mutex_create(buf_dblwr_mutex_key, &buf_dblwr->mutex, SYNC_DOUBLEWRITE); + buf_dblwr->b_event = os_event_create(); + buf_dblwr->s_event = os_event_create(); buf_dblwr->first_free = 0; buf_dblwr->s_reserved = 0; buf_dblwr->b_reserved = 0; @@ -141,8 +157,8 @@ buf_dblwr_init( buf_dblwr->block2 = mach_read_from_4( doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2); - buf_dblwr->in_use = static_cast<ibool*>( - mem_zalloc(buf_size * sizeof(ibool))); + buf_dblwr->in_use = static_cast<bool*>( + mem_zalloc(buf_size * sizeof(bool))); buf_dblwr->write_buf_unaligned = static_cast<byte*>( ut_malloc((1 + buf_size) * UNIV_PAGE_SIZE)); @@ -368,6 +384,7 @@ buf_dblwr_init_or_restore_pages( fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0, UNIV_PAGE_SIZE, read_buf, NULL, 0); + doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) @@ -401,9 +418,10 @@ buf_dblwr_init_or_restore_pages( /* Read the pages from the doublewrite buffer to memory */ - fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block1, 0, + fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block1, 0, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, buf, NULL, 0); + fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block2, 0, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, @@ -474,7 +492,7 @@ buf_dblwr_init_or_restore_pages( ulint zip_size = fil_space_get_zip_size(space_id); /* Read in the actual page from the file */ - fil_io(OS_FILE_READ, TRUE, space_id, zip_size, + fil_io(OS_FILE_READ, true, space_id, zip_size, page_no, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, read_buf, NULL, 0); @@ -526,7 +544,7 @@ buf_dblwr_init_or_restore_pages( doublewrite buffer to the intended position */ - fil_io(OS_FILE_WRITE, TRUE, space_id, + fil_io(OS_FILE_WRITE, true, space_id, zip_size, page_no, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, page, NULL, 0); @@ -558,6 +576,8 @@ buf_dblwr_free(void) ut_ad(buf_dblwr->s_reserved == 0); ut_ad(buf_dblwr->b_reserved == 0); + os_event_free(buf_dblwr->b_event); + os_event_free(buf_dblwr->s_event); ut_free(buf_dblwr->write_buf_unaligned); buf_dblwr->write_buf_unaligned = NULL; @@ -573,38 +593,68 @@ buf_dblwr_free(void) } /********************************************************************//** -Updates the doublewrite buffer when an IO request that is part of an -LRU or flush batch is completed. */ +Updates the doublewrite buffer when an IO request is completed. */ UNIV_INTERN void -buf_dblwr_update(void) -/*==================*/ +buf_dblwr_update( +/*=============*/ + const buf_page_t* bpage, /*!< in: buffer block descriptor */ + buf_flush_t flush_type)/*!< in: flush type */ { if (!srv_use_doublewrite_buf || buf_dblwr == NULL) { return; } - mutex_enter(&buf_dblwr->mutex); + switch (flush_type) { + case BUF_FLUSH_LIST: + case BUF_FLUSH_LRU: + mutex_enter(&buf_dblwr->mutex); - ut_ad(buf_dblwr->batch_running); - ut_ad(buf_dblwr->b_reserved > 0); - ut_ad(buf_dblwr->b_reserved <= buf_dblwr->first_free); + ut_ad(buf_dblwr->batch_running); + ut_ad(buf_dblwr->b_reserved > 0); + ut_ad(buf_dblwr->b_reserved <= buf_dblwr->first_free); - buf_dblwr->b_reserved--; - if (buf_dblwr->b_reserved == 0) { + buf_dblwr->b_reserved--; + + if (buf_dblwr->b_reserved == 0) { + mutex_exit(&buf_dblwr->mutex); + /* This will finish the batch. Sync data files + to the disk. */ + fil_flush_file_spaces(FIL_TABLESPACE); + mutex_enter(&buf_dblwr->mutex); + + /* We can now reuse the doublewrite memory buffer: */ + buf_dblwr->first_free = 0; + buf_dblwr->batch_running = false; + os_event_set(buf_dblwr->b_event); + } mutex_exit(&buf_dblwr->mutex); - /* This will finish the batch. Sync data files - to the disk. */ - fil_flush_file_spaces(FIL_TABLESPACE); - mutex_enter(&buf_dblwr->mutex); + break; + case BUF_FLUSH_SINGLE_PAGE: + { + const ulint size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; + ulint i; + mutex_enter(&buf_dblwr->mutex); + for (i = srv_doublewrite_batch_size; i < size; ++i) { + if (buf_dblwr->buf_block_arr[i] == bpage) { + buf_dblwr->s_reserved--; + buf_dblwr->buf_block_arr[i] = NULL; + buf_dblwr->in_use[i] = false; + break; + } + } - /* We can now reuse the doublewrite memory buffer: */ - buf_dblwr->first_free = 0; - buf_dblwr->batch_running = FALSE; + /* The block we are looking for must exist as a + reserved block. */ + ut_a(i < size); + } + os_event_set(buf_dblwr->s_event); + mutex_exit(&buf_dblwr->mutex); + break; + case BUF_FLUSH_N_TYPES: + ut_error; } - - mutex_exit(&buf_dblwr->mutex); } /********************************************************************//** @@ -699,18 +749,19 @@ static void buf_dblwr_write_block_to_datafile( /*==============================*/ - const buf_page_t* bpage) /*!< in: page to write */ + const buf_page_t* bpage, /*!< in: page to write */ + bool sync) /*!< in: true if sync IO + is requested */ { ut_a(bpage); ut_a(buf_page_in_file(bpage)); - /* Increment the counter of I/O operations used - for selecting LRU policy. */ - buf_LRU_stat_inc_io(); + const ulint flags = sync + ? OS_FILE_WRITE + : OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER; if (bpage->zip.data) { - fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, - FALSE, buf_page_get_space(bpage), + fil_io(flags, sync, buf_page_get_space(bpage), buf_page_get_zip_size(bpage), buf_page_get_page_no(bpage), 0, buf_page_get_zip_size(bpage), @@ -725,8 +776,7 @@ buf_dblwr_write_block_to_datafile( ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); buf_dblwr_check_page_lsn(block->frame); - fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, - FALSE, buf_block_get_space(block), 0, + fil_io(flags, sync, buf_block_get_space(block), 0, buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE, (void*) block->frame, (void*) block, (ulint *)&bpage->write_size); } @@ -748,12 +798,12 @@ buf_dblwr_flush_buffered_writes(void) if (!srv_use_doublewrite_buf || buf_dblwr == NULL) { /* Sync the writes to the disk. */ - buf_flush_sync_datafiles(); + buf_dblwr_sync_datafiles(); return; } try_again: - mutex_enter(&(buf_dblwr->mutex)); + mutex_enter(&buf_dblwr->mutex); /* Write first to doublewrite buffer blocks. We use synchronous aio and thus know that file write has been completed when the @@ -761,17 +811,18 @@ try_again: if (buf_dblwr->first_free == 0) { - mutex_exit(&(buf_dblwr->mutex)); + mutex_exit(&buf_dblwr->mutex); return; } if (buf_dblwr->batch_running) { - mutex_exit(&buf_dblwr->mutex); - /* Another thread is running the batch right now. Wait for it to finish. */ - os_thread_sleep(TRX_DOUBLEWRITE_BATCH_POLL_DELAY); + ib_int64_t sig_count = os_event_reset(buf_dblwr->b_event); + mutex_exit(&buf_dblwr->mutex); + + os_event_wait_low(buf_dblwr->b_event, sig_count); goto try_again; } @@ -780,7 +831,7 @@ try_again: /* Disallow anyone else to post to doublewrite buffer or to start another batch of flushing. */ - buf_dblwr->batch_running = TRUE; + buf_dblwr->batch_running = true; first_free = buf_dblwr->first_free; /* Now safe to release the mutex. Note that though no other @@ -819,7 +870,7 @@ try_again: len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE, buf_dblwr->first_free) * UNIV_PAGE_SIZE; - fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, + fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, buf_dblwr->block1, 0, len, (void*) write_buf, NULL, 0); @@ -835,7 +886,7 @@ try_again: write_buf = buf_dblwr->write_buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; - fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, + fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, buf_dblwr->block2, 0, len, (void*) write_buf, NULL, 0); @@ -865,7 +916,7 @@ flush: ut_ad(first_free == buf_dblwr->first_free); for (ulint i = 0; i < first_free; i++) { buf_dblwr_write_block_to_datafile( - buf_dblwr->buf_block_arr[i]); + buf_dblwr->buf_block_arr[i], false); } /* Wake possible simulated aio thread to actually post the @@ -890,12 +941,11 @@ buf_dblwr_add_to_batch( ut_a(buf_page_in_file(bpage)); try_again: - mutex_enter(&(buf_dblwr->mutex)); + mutex_enter(&buf_dblwr->mutex); ut_a(buf_dblwr->first_free <= srv_doublewrite_batch_size); if (buf_dblwr->batch_running) { - mutex_exit(&buf_dblwr->mutex); /* This not nearly as bad as it looks. There is only page_cleaner thread which does background flushing @@ -903,7 +953,10 @@ try_again: point. The only exception is when a user thread is forced to do a flush batch because of a sync checkpoint. */ - os_thread_sleep(TRX_DOUBLEWRITE_BATCH_POLL_DELAY); + ib_int64_t sig_count = os_event_reset(buf_dblwr->b_event); + mutex_exit(&buf_dblwr->mutex); + + os_event_wait_low(buf_dblwr->b_event, sig_count); goto try_again; } @@ -968,7 +1021,8 @@ UNIV_INTERN void buf_dblwr_write_single_page( /*========================*/ - buf_page_t* bpage) /*!< in: buffer block to write */ + buf_page_t* bpage, /*!< in: buffer block to write */ + bool sync) /*!< in: true if sync IO requested */ { ulint n_slots; ulint size; @@ -1005,11 +1059,12 @@ retry: mutex_enter(&buf_dblwr->mutex); if (buf_dblwr->s_reserved == n_slots) { + /* All slots are reserved. */ + ib_int64_t sig_count = + os_event_reset(buf_dblwr->s_event); mutex_exit(&buf_dblwr->mutex); - /* All slots are reserved. Since it involves two IOs - during the processing a sleep of 10ms should be - enough. */ - os_thread_sleep(TRX_DOUBLEWRITE_BATCH_POLL_DELAY); + os_event_wait_low(buf_dblwr->s_event, sig_count); + goto retry; } @@ -1022,9 +1077,14 @@ retry: /* We are guaranteed to find a slot. */ ut_a(i < size); - buf_dblwr->in_use[i] = TRUE; + buf_dblwr->in_use[i] = true; buf_dblwr->s_reserved++; buf_dblwr->buf_block_arr[i] = bpage; + + /* increment the doublewrite flushed pages counter */ + srv_stats.dblwr_pages_written.inc(); + srv_stats.dblwr_writes.inc(); + mutex_exit(&buf_dblwr->mutex); /* Lets see if we are going to write in the first or second @@ -1054,14 +1114,14 @@ retry: memset(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i + zip_size, 0, UNIV_PAGE_SIZE - zip_size); - fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, + fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, offset, 0, UNIV_PAGE_SIZE, (void*) (buf_dblwr->write_buf + UNIV_PAGE_SIZE * i), NULL, 0); } else { /* It is a regular page. Write it directly to the doublewrite buffer */ - fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, + fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, offset, 0, UNIV_PAGE_SIZE, (void*) ((buf_block_t*) bpage)->frame, NULL, 0); @@ -1073,22 +1133,6 @@ retry: /* We know that the write has been flushed to disk now and during recovery we will find it in the doublewrite buffer blocks. Next do the write to the intended position. */ - buf_dblwr_write_block_to_datafile(bpage); - - /* Sync the writes to the disk. */ - buf_flush_sync_datafiles(); - - mutex_enter(&buf_dblwr->mutex); - - buf_dblwr->s_reserved--; - buf_dblwr->buf_block_arr[i] = NULL; - buf_dblwr->in_use[i] = FALSE; - - /* increment the doublewrite flushed pages counter */ - srv_stats.dblwr_pages_written.inc(); - srv_stats.dblwr_writes.inc(); - - mutex_exit(&(buf_dblwr->mutex)); - + buf_dblwr_write_block_to_datafile(bpage, sync); } #endif /* !UNIV_HOTBACKUP */ |