| author | Vasil Dimov <vasil.dimov@oracle.com> | 2010-04-19 20:53:16 +0300 |
|---|---|---|
| committer | Vasil Dimov <vasil.dimov@oracle.com> | 2010-04-19 20:53:16 +0300 |
| commit | c7525a01306e49897c52ea7e9844697f8f8379a1 (patch) | |
| tree | 0c1af6979d3c01e38c801d71a2fc74150cc30667 /storage/innobase | |
| parent | 49dc3a7b325135c8cd2f93c213a48497d1b57dae (diff) | |
| parent | ed30b504d2fd9c14af243248f5a94ae5488ad026 (diff) | |
| download | mariadb-git-c7525a01306e49897c52ea7e9844697f8f8379a1.tar.gz | |
Merge from innodb-branches-innodb+ (2)
Diffstat (limited to 'storage/innobase')
39 files changed, 2827 insertions, 1408 deletions
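Editorial note: this merge splits InnoDB's single global buffer pool (and its one buf_pool_mutex) into an array of independent buf_pool_t instances, each with its own mutex, zip_mutex, free list, LRU list, flush list and buddy allocator. Throughout the diff, functions therefore gain a buf_pool argument, derived either from a block already at hand or from the page address. A minimal sketch of the address-to-instance mapping, assuming a fold function in the spirit of buf_page_address_fold() (the committed buf_pool_get() is outside this excerpt):

	/* Sketch only: maps a (space, offset) page address to one of the
	buffer pool instances introduced by this merge.  The fold function
	here is a stand-in for the real one. */
	#define MAX_BUFFER_POOLS 64	/* assumed value */

	typedef struct buf_pool_struct buf_pool_t;

	extern buf_pool_t*	buf_pool_ptr[MAX_BUFFER_POOLS];
	extern unsigned long	srv_buf_pool_instances;

	static unsigned long
	page_address_fold(unsigned long space, unsigned long offset)
	{
		return((space << 20) + space + offset);	/* assumed hash */
	}

	static buf_pool_t*
	buf_pool_get_sketch(unsigned long space, unsigned long offset)
	{
		return(buf_pool_ptr[page_address_fold(space, offset)
				    % srv_buf_pool_instances]);
	}

Because the mapping is deterministic, all bookkeeping for a given page happens under one instance's mutex, and pages that hash to different instances no longer contend on a single global lock.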
diff --git a/storage/innobase/btr/btr0btr.c b/storage/innobase/btr/btr0btr.c
index 8589d415131..6cc9b48936a 100644
--- a/storage/innobase/btr/btr0btr.c
+++ b/storage/innobase/btr/btr0btr.c
@@ -952,6 +952,7 @@ btr_page_reorganize_low(
 	dict_index_t*	index,	/*!< in: record descriptor */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(&block->page);
 	page_t*		page = buf_block_get_frame(block);
 	page_zip_des_t*	page_zip = buf_block_get_page_zip(block);
 	buf_block_t*	temp_block;
@@ -982,7 +983,7 @@ btr_page_reorganize_low(
 	log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);

 #ifndef UNIV_HOTBACKUP
-	temp_block = buf_block_alloc(0);
+	temp_block = buf_block_alloc(buf_pool, 0);
 #else /* !UNIV_HOTBACKUP */
 	ut_ad(block == back_block1);
 	temp_block = back_block2;
diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.c
index 3ca2b02bb4b..57d6973f623 100644
--- a/storage/innobase/btr/btr0cur.c
+++ b/storage/innobase/btr/btr0cur.c
@@ -3882,14 +3882,15 @@ btr_blob_free(
 			if there is one */
 	mtr_t*		mtr)	/*!< in: mini-transaction to commit */
 {
-	ulint		space = buf_block_get_space(block);
-	ulint		page_no = buf_block_get_page_no(block);
+	buf_pool_t*	buf_pool = buf_pool_from_block(block);
+	ulint		space = buf_block_get_space(block);
+	ulint		page_no = buf_block_get_page_no(block);

 	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));

 	mtr_commit(mtr);

-	buf_pool_mutex_enter();
+	buf_pool_mutex_enter(buf_pool);
 	mutex_enter(&block->mutex);

 	/* Only free the block if it is still allocated to
@@ -3910,7 +3911,7 @@ btr_blob_free(
 		}
 	}

-	buf_pool_mutex_exit();
+	buf_pool_mutex_exit(buf_pool);
 	mutex_exit(&block->mutex);
 }
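The two files above show the recurring pattern of this patch: code that used to take the global mutex first derives the owning instance from the block it already holds. A compilable sketch of those accessors, with the structs cut down to a minimum (the merge stores the owning instance in every page descriptor; buf_block_init() further down in this diff sets block->page.buf_pool):

	/* Reduced types for illustration; the real definitions are in
	buf0buf.h.  The point is that the owning instance is a single
	pointer load away from any block. */
	typedef struct buf_pool_struct buf_pool_t;

	typedef struct buf_page_struct {
		buf_pool_t*	buf_pool; /* set once in buf_block_init() */
		/* ... space, offset, state, ... */
	} buf_page_t;

	typedef struct buf_block_struct {
		buf_page_t	page;	/* must be the first field */
		/* ... frame, mutex, lock, ... */
	} buf_block_t;

	static buf_pool_t*
	buf_pool_from_bpage(const buf_page_t* bpage)
	{
		return(bpage->buf_pool);
	}

	static buf_pool_t*
	buf_pool_from_block(const buf_block_t* block)
	{
		return(buf_pool_from_bpage(&block->page));
	}

buf_block_alloc(NULL, 0) in btr0sea.c below is the one caller with no block to start from; the NULL asks the allocator to pick an instance itself (see buf_block_alloc() in buf0buf.c later in this diff).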
diff --git a/storage/innobase/btr/btr0sea.c b/storage/innobase/btr/btr0sea.c
index 7f8a9af1dd8..3f130405810 100644
--- a/storage/innobase/btr/btr0sea.c
+++ b/storage/innobase/btr/btr0sea.c
@@ -150,7 +150,7 @@ btr_search_check_free_space_in_heap(void)
 	be enough free space in the hash table. */

 	if (heap->free_block == NULL) {
-		buf_block_t*	block = buf_block_alloc(0);
+		buf_block_t*	block = buf_block_alloc(NULL, 0);

 		rw_lock_x_lock(&btr_search_latch);
@@ -825,6 +825,7 @@ btr_search_guess_on_hash(
 			RW_S_LATCH, RW_X_LATCH, or 0 */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
+	buf_pool_t*	buf_pool;
 	buf_block_t*	block;
 	rec_t*		rec;
 	ulint		fold;
@@ -983,7 +984,7 @@ btr_search_guess_on_hash(
 	/* Increment the page get statistics though we did not really
 	fix the page: for user info only */
-
+	buf_pool = buf_pool_from_bpage(&block->page);
 	buf_pool->stat.n_page_gets++;

 	return(TRUE);
@@ -1760,7 +1761,7 @@ btr_search_validate(void)
 	rec_offs_init(offsets_);

 	rw_lock_x_lock(&btr_search_latch);
-	buf_pool_mutex_enter();
+	buf_pool_mutex_enter_all();

 	cell_count = hash_get_n_cells(btr_search_sys->hash_index);

@@ -1768,11 +1769,11 @@
 		/* We release btr_search_latch every once in a while to
 		give other queries a chance to run. */
 		if ((i != 0) && ((i % chunk_size) == 0)) {
-			buf_pool_mutex_exit();
+			buf_pool_mutex_exit_all();
 			rw_lock_x_unlock(&btr_search_latch);
 			os_thread_yield();
 			rw_lock_x_lock(&btr_search_latch);
-			buf_pool_mutex_enter();
+			buf_pool_mutex_enter_all();
 		}

 		node = hash_get_nth_cell(btr_search_sys->hash_index, i)->node;
@@ -1781,6 +1782,9 @@ btr_search_validate(void)
 			const buf_block_t*	block
 				= buf_block_align(node->data);
 			const buf_block_t*	hash_block;
+			buf_pool_t*		buf_pool;
+
+			buf_pool = buf_pool_from_bpage((buf_page_t*) block);

 			if (UNIV_LIKELY(buf_block_get_state(block)
 					== BUF_BLOCK_FILE_PAGE)) {
@@ -1791,6 +1795,7 @@ btr_search_validate(void)
 				(BUF_BLOCK_REMOVE_HASH, see the
 				assertion and the comment below) */
 				hash_block = buf_block_hash_get(
+					buf_pool,
 					buf_block_get_space(block),
 					buf_block_get_page_no(block));
 			} else {
@@ -1879,11 +1884,11 @@ btr_search_validate(void)
 		/* We release btr_search_latch every once in a while to
 		give other queries a chance to run. */
 		if (i != 0) {
-			buf_pool_mutex_exit();
+			buf_pool_mutex_exit_all();
 			rw_lock_x_unlock(&btr_search_latch);
 			os_thread_yield();
 			rw_lock_x_lock(&btr_search_latch);
-			buf_pool_mutex_enter();
+			buf_pool_mutex_enter_all();
 		}

 		if (!ha_validate(btr_search_sys->hash_index, i, end_index)) {
@@ -1891,7 +1896,7 @@ btr_search_validate(void)
 		}
 	}

-	buf_pool_mutex_exit();
+	buf_pool_mutex_exit_all();
 	rw_lock_x_unlock(&btr_search_latch);
 	if (UNIV_LIKELY_NULL(heap)) {
 		mem_heap_free(heap);
diff --git a/storage/innobase/buf/buf0buddy.c b/storage/innobase/buf/buf0buddy.c
index 7118cb376ab..695aed2d0cb 100644
--- a/storage/innobase/buf/buf0buddy.c
+++ b/storage/innobase/buf/buf0buddy.c
@@ -34,17 +34,6 @@ Created December 2006 by Marko Makela
 #include "buf0flu.h"
 #include "page0zip.h"

-/* Statistic counters */
-
-#ifdef UNIV_DEBUG
-/** Number of frames allocated from the buffer pool to the buddy system.
-Protected by buf_pool_mutex. */
-static ulint buf_buddy_n_frames;
-#endif /* UNIV_DEBUG */
-/** Statistics of the buddy system, indexed by block size.
-Protected by buf_pool_mutex. */
-UNIV_INTERN buf_buddy_stat_t buf_buddy_stat[BUF_BUDDY_SIZES + 1];
-
 /**********************************************************************//**
 Get the offset of the buddy of a compressed page frame.
 @return the buddy relative of page */
@@ -73,8 +62,10 @@ UNIV_INLINE
 void
 buf_buddy_add_to_free(
 /*==================*/
-	buf_page_t*	bpage,	/*!< in,own: block to be freed */
-	ulint		i)	/*!< in: index of buf_pool->zip_free[] */
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	buf_page_t*	bpage,		/*!< in,own: block to be freed */
+	ulint		i)		/*!< in: index of
+					buf_pool->zip_free[] */
 {
 #ifdef UNIV_DEBUG_VALGRIND
 	buf_page_t*	b = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
@@ -82,7 +73,7 @@ buf_buddy_add_to_free(
 	if (b) UNIV_MEM_VALID(b, BUF_BUDDY_LOW << i);
 #endif /* UNIV_DEBUG_VALGRIND */

-	ut_ad(buf_pool_mutex_own());
+	ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
 	ut_ad(buf_pool->zip_free[i].start != bpage);
 	UT_LIST_ADD_FIRST(list, buf_pool->zip_free[i], bpage);
@@ -99,8 +90,10 @@ UNIV_INLINE
 void
 buf_buddy_remove_from_free(
 /*=======================*/
-	buf_page_t*	bpage,	/*!< in: block to be removed */
-	ulint		i)	/*!< in: index of buf_pool->zip_free[] */
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	buf_page_t*	bpage,		/*!< in: block to be removed */
+	ulint		i)		/*!< in: index of
+					buf_pool->zip_free[] */
 {
 #ifdef UNIV_DEBUG_VALGRIND
 	buf_page_t*	prev = UT_LIST_GET_PREV(list, bpage);
@@ -113,7 +106,7 @@ buf_buddy_remove_from_free(
 	ut_ad(!next || buf_page_get_state(next) == BUF_BLOCK_ZIP_FREE);
 #endif /* UNIV_DEBUG_VALGRIND */

-	ut_ad(buf_pool_mutex_own());
+	ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);

 	UT_LIST_REMOVE(list, buf_pool->zip_free[i], bpage);
@@ -130,11 +123,12 @@ static
 void*
 buf_buddy_alloc_zip(
 /*================*/
-	ulint	i)	/*!< in: index of buf_pool->zip_free[] */
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ulint		i)		/*!< in: index of buf_pool->zip_free[] */
 {
 	buf_page_t*	bpage;

-	ut_ad(buf_pool_mutex_own());
+	ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_a(i < BUF_BUDDY_SIZES);

 #ifndef UNIV_DEBUG_VALGRIND
@@ -149,19 +143,19 @@ buf_buddy_alloc_zip(
 		UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
 		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);

-		buf_buddy_remove_from_free(bpage, i);
+		buf_buddy_remove_from_free(buf_pool, bpage, i);
 	} else if (i + 1 < BUF_BUDDY_SIZES) {
 		/* Attempt to split. */
-		bpage = buf_buddy_alloc_zip(i + 1);
+		bpage = buf_buddy_alloc_zip(buf_pool, i + 1);

 		if (bpage) {
 			buf_page_t*	buddy = (buf_page_t*)
 				(((char*) bpage) + (BUF_BUDDY_LOW << i));

-			ut_ad(!buf_pool_contains_zip(buddy));
+			ut_ad(!buf_pool_contains_zip(buf_pool, buddy));
 			ut_d(memset(buddy, i, BUF_BUDDY_LOW << i));
 			buddy->state = BUF_BLOCK_ZIP_FREE;
-			buf_buddy_add_to_free(buddy, i);
+			buf_buddy_add_to_free(buf_pool, buddy, i);
 		}
 	}
@@ -182,14 +176,15 @@ static
 void
 buf_buddy_block_free(
 /*=================*/
-	void*	buf)	/*!< in: buffer frame to deallocate */
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	void*		buf)		/*!< in: buffer frame to deallocate */
 {
 	const ulint	fold = BUF_POOL_ZIP_FOLD_PTR(buf);
 	buf_page_t*	bpage;
 	buf_block_t*	block;

-	ut_ad(buf_pool_mutex_own());
-	ut_ad(!mutex_own(&buf_pool_zip_mutex));
+	ut_ad(buf_pool_mutex_own(buf_pool));
+	ut_ad(!mutex_own(&buf_pool->zip_mutex));
 	ut_a(!ut_align_offset(buf, UNIV_PAGE_SIZE));

 	HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage,
@@ -211,8 +206,8 @@ buf_buddy_block_free(
 	buf_LRU_block_free_non_file_page(block);
 	mutex_exit(&block->mutex);

-	ut_ad(buf_buddy_n_frames > 0);
-	ut_d(buf_buddy_n_frames--);
+	ut_ad(buf_pool->buddy_n_frames > 0);
+	ut_d(buf_pool->buddy_n_frames--);
 }

 /**********************************************************************//**
@@ -223,9 +218,10 @@ buf_buddy_block_register(
 /*=====================*/
 	buf_block_t*	block)	/*!< in: buffer frame to allocate */
 {
+	buf_pool_t*	buf_pool = buf_pool_from_block(block);
 	const ulint	fold = BUF_POOL_ZIP_FOLD(block);
-	ut_ad(buf_pool_mutex_own());
-	ut_ad(!mutex_own(&buf_pool_zip_mutex));
+	ut_ad(buf_pool_mutex_own(buf_pool));
+	ut_ad(!mutex_own(&buf_pool->zip_mutex));
 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE);

 	buf_block_set_state(block, BUF_BLOCK_MEMORY);
@@ -238,7 +234,7 @@ buf_buddy_block_register(
 	ut_d(block->page.in_zip_hash = TRUE);

 	HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page);
-	ut_d(buf_buddy_n_frames++);
+	ut_d(buf_pool->buddy_n_frames++);
 }

 /**********************************************************************//**
@@ -248,10 +244,12 @@ static
 void*
 buf_buddy_alloc_from(
 /*=================*/
-	void*	buf,	/*!< in: a block that is free to use */
-	ulint	i,	/*!< in: index of buf_pool->zip_free[] */
-	ulint	j)	/*!< in: size of buf as an index
-			of buf_pool->zip_free[] */
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	void*		buf,		/*!< in: a block that is free to use */
+	ulint		i,		/*!< in: index of
+					buf_pool->zip_free[] */
+	ulint		j)		/*!< in: size of buf as an index
+					of buf_pool->zip_free[] */
 {
 	ulint	offs	= BUF_BUDDY_LOW << j;
 	ut_ad(j <= BUF_BUDDY_SIZES);
@@ -275,7 +273,7 @@ buf_buddy_alloc_from(
 				      ut_list_node_313)
 			      == BUF_BLOCK_ZIP_FREE)));
 #endif /* !UNIV_DEBUG_VALGRIND */
-		buf_buddy_add_to_free(bpage, j);
+		buf_buddy_add_to_free(buf_pool, bpage, j);
 	}

 	return(buf);
 }

 /**********************************************************************//**
 Allocate a block.  The thread calling this function must hold
-buf_pool_mutex and must not hold buf_pool_zip_mutex or any block->mutex.
-The buf_pool_mutex may only be released and reacquired if lru != NULL.
+buf_pool->mutex and must not hold buf_pool_zip_mutex or any block->mutex.
+The buf_pool->mutex may only be released and reacquired if lru != NULL.
 @return allocated block, possibly NULL if lru==NULL */
UNIV_INTERN
 void*
 buf_buddy_alloc_low(
 /*================*/
-	ulint	i,	/*!< in: index of buf_pool->zip_free[],
-			or BUF_BUDDY_SIZES */
-	ibool*	lru)	/*!< in: pointer to a variable that will be assigned
-			TRUE if storage was allocated from the LRU list
-			and buf_pool_mutex was temporarily released,
-			or NULL if the LRU list should not be used */
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ulint		i,		/*!< in: index of buf_pool->zip_free[],
+					or BUF_BUDDY_SIZES */
+	ibool*		lru)		/*!< in: pointer to a variable that
+					will be assigned TRUE if storage was
+					allocated from the LRU list and
+					buf_pool->mutex was temporarily
+					released, or NULL if the LRU list
+					should not be used */
 {
 	buf_block_t*	block;

-	ut_ad(buf_pool_mutex_own());
-	ut_ad(!mutex_own(&buf_pool_zip_mutex));
+	ut_ad(buf_pool_mutex_own(buf_pool));
+	ut_ad(!mutex_own(&buf_pool->zip_mutex));

 	if (i < BUF_BUDDY_SIZES) {
 		/* Try to allocate from the buddy system. */
-		block = buf_buddy_alloc_zip(i);
+		block = buf_buddy_alloc_zip(buf_pool, i);

 		if (block) {
-
 			goto func_exit;
 		}
 	}

 	/* Try allocating from the buf_pool->free list. */
-	block = buf_LRU_get_free_only();
+	block = buf_LRU_get_free_only(buf_pool);

 	if (block) {
@@ -326,18 +326,19 @@ buf_buddy_alloc_low(
 	}

 	/* Try replacing an uncompressed page in the buffer pool. */
-	buf_pool_mutex_exit();
-	block = buf_LRU_get_free_block(0);
+	buf_pool_mutex_exit(buf_pool);
+	block = buf_LRU_get_free_block(buf_pool, 0);
 	*lru = TRUE;
-	buf_pool_mutex_enter();
+	buf_pool_mutex_enter(buf_pool);

 alloc_big:
 	buf_buddy_block_register(block);

-	block = buf_buddy_alloc_from(block->frame, i, BUF_BUDDY_SIZES);
+	block = buf_buddy_alloc_from(
+		buf_pool, block->frame, i, BUF_BUDDY_SIZES);

 func_exit:
-	buf_buddy_stat[i].used++;
+	buf_pool->buddy_stat[i].used++;
 	return(block);
 }
@@ -352,8 +353,9 @@ buf_buddy_relocate_block(
 	buf_page_t*	dpage)	/*!< in: free block to relocate to */
 {
 	buf_page_t*	b;
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);

-	ut_ad(buf_pool_mutex_own());
+	ut_ad(buf_pool_mutex_own(buf_pool));

 	switch (buf_page_get_state(bpage)) {
 	case BUF_BLOCK_ZIP_FREE:
@@ -371,10 +373,10 @@ buf_buddy_relocate_block(
 		break;
 	}

-	mutex_enter(&buf_pool_zip_mutex);
+	mutex_enter(&buf_pool->zip_mutex);

 	if (!buf_page_can_relocate(bpage)) {
-		mutex_exit(&buf_pool_zip_mutex);
+		mutex_exit(&buf_pool->zip_mutex);
 		return(FALSE);
 	}
@@ -393,7 +395,7 @@ buf_buddy_relocate_block(

 	UNIV_MEM_INVALID(bpage, sizeof *bpage);

-	mutex_exit(&buf_pool_zip_mutex);
+	mutex_exit(&buf_pool->zip_mutex);
 	return(TRUE);
 }
@@ -404,16 +406,18 @@ static
 ibool
 buf_buddy_relocate(
 /*===============*/
-	void*	src,	/*!< in: block to relocate */
-	void*	dst,	/*!< in: free block to relocate to */
-	ulint	i)	/*!< in: index of buf_pool->zip_free[] */
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	void*		src,		/*!< in: block to relocate */
+	void*		dst,		/*!< in: free block to relocate to */
+	ulint		i)		/*!< in: index of
+					buf_pool->zip_free[] */
 {
 	buf_page_t*	bpage;
 	const ulint	size	= BUF_BUDDY_LOW << i;
 	ullint		usec	= ut_time_us(NULL);

-	ut_ad(buf_pool_mutex_own());
-	ut_ad(!mutex_own(&buf_pool_zip_mutex));
+	ut_ad(buf_pool_mutex_own(buf_pool));
+	ut_ad(!mutex_own(&buf_pool->zip_mutex));
 	ut_ad(!ut_align_offset(src, size));
 	ut_ad(!ut_align_offset(dst, size));
 	UNIV_MEM_ASSERT_W(dst, size);
@@ -443,6 +447,7 @@ buf_buddy_relocate(
 	mach_read_from_4() calls here will only trigger bogus
 	Valgrind memcheck warnings in UNIV_DEBUG_VALGRIND builds.
	*/
 	bpage = buf_page_hash_get(
+		buf_pool,
 		mach_read_from_4((const byte*) src
 				 + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID),
 		mach_read_from_4((const byte*) src
@@ -457,7 +462,7 @@ buf_buddy_relocate(
 		return(FALSE);
 	}

-	ut_ad(!buf_pool_watch_is(bpage));
+	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));

 	if (page_zip_get_size(&bpage->zip) != size) {
 		/* The block is of different size.  We would
@@ -486,7 +491,7 @@ success:
 		UNIV_MEM_INVALID(src, size);
 		{
 			buf_buddy_stat_t*	buddy_stat
-				= &buf_buddy_stat[i];
+				= &buf_pool->buddy_stat[i];
 			buddy_stat->relocated++;
 			buddy_stat->relocated_usec
 				+= ut_time_us(NULL) - usec;
@@ -513,32 +518,33 @@ UNIV_INTERN
 void
 buf_buddy_free_low(
 /*===============*/
-	void*	buf,	/*!< in: block to be freed, must not be
-			pointed to by the buffer pool */
-	ulint	i)	/*!< in: index of buf_pool->zip_free[],
-			or BUF_BUDDY_SIZES */
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	void*		buf,		/*!< in: block to be freed, must not be
+					pointed to by the buffer pool */
+	ulint		i)		/*!< in: index of buf_pool->zip_free[],
+					or BUF_BUDDY_SIZES */
 {
 	buf_page_t*	bpage;
 	buf_page_t*	buddy;

-	ut_ad(buf_pool_mutex_own());
-	ut_ad(!mutex_own(&buf_pool_zip_mutex));
+	ut_ad(buf_pool_mutex_own(buf_pool));
+	ut_ad(!mutex_own(&buf_pool->zip_mutex));
 	ut_ad(i <= BUF_BUDDY_SIZES);
-	ut_ad(buf_buddy_stat[i].used > 0);
+	ut_ad(buf_pool->buddy_stat[i].used > 0);

-	buf_buddy_stat[i].used--;
+	buf_pool->buddy_stat[i].used--;
recombine:
 	UNIV_MEM_ASSERT_AND_ALLOC(buf, BUF_BUDDY_LOW << i);
 	ut_d(((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE);

 	if (i == BUF_BUDDY_SIZES) {
-		buf_buddy_block_free(buf);
+		buf_buddy_block_free(buf_pool, buf);
 		return;
 	}

 	ut_ad(i < BUF_BUDDY_SIZES);
 	ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i));
-	ut_ad(!buf_pool_contains_zip(buf));
+	ut_ad(!buf_pool_contains_zip(buf_pool, buf));

 	/* Try to combine adjacent blocks. */
@@ -564,10 +570,10 @@ buf_buddy_free_low(
 	if (bpage == buddy) {
buddy_free:
 		/* The buddy is free: recombine */
-		buf_buddy_remove_from_free(bpage, i);
+		buf_buddy_remove_from_free(buf_pool, bpage, i);
buddy_free2:
 		ut_ad(buf_page_get_state(buddy) == BUF_BLOCK_ZIP_FREE);
-		ut_ad(!buf_pool_contains_zip(buddy));
+		ut_ad(!buf_pool_contains_zip(buf_pool, buddy));
 		i++;
 		buf = ut_align_down(buf, BUF_BUDDY_LOW << i);
@@ -599,16 +605,16 @@ buddy_nonfree:
 		buf_buddy_relocate() will overwrite bpage->list. */

 		UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
-		buf_buddy_remove_from_free(bpage, i);
+		buf_buddy_remove_from_free(buf_pool, bpage, i);

 		/* Try to relocate the buddy of buf to the free block. */
-		if (buf_buddy_relocate(buddy, bpage, i)) {
+		if (buf_buddy_relocate(buf_pool, buddy, bpage, i)) {
 			ut_d(buddy->state = BUF_BLOCK_ZIP_FREE);
 			goto buddy_free2;
 		}

-		buf_buddy_add_to_free(bpage, i);
+		buf_buddy_add_to_free(buf_pool, bpage, i);

 		/* Try to relocate the buddy of the free block to buf.
		*/
 		buddy = (buf_page_t*) buf_buddy_get(((byte*) bpage),
@@ -629,7 +635,7 @@ buddy_nonfree:
 				       && ut_list_node_313 != buddy)));
 #endif /* !UNIV_DEBUG_VALGRIND */

-		if (buf_buddy_relocate(buddy, buf, i)) {
+		if (buf_buddy_relocate(buf_pool, buddy, buf, i)) {

 			buf = bpage;
 			UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
@@ -692,5 +698,5 @@ buddy_nonfree:
 	}
 #endif /* UNIV_DEBUG */
 	bpage->state = BUF_BLOCK_ZIP_FREE;
-	buf_buddy_add_to_free(bpage, i);
+	buf_buddy_add_to_free(buf_pool, bpage, i);
 }
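With that, all of the buddy-system state that buf0buddy.c used to keep in file-scope globals (buf_buddy_n_frames, buf_buddy_stat[]) has moved into buf_pool_t, so each instance maintains its own buddy bookkeeping under its own mutex. A sketch of the relocated fields, with the BUF_BUDDY_SIZES value and the exact layout assumed for illustration:

	/* Per-instance buddy bookkeeping as referenced in the hunks above;
	the authoritative struct definition lives in buf0buf.h. */
	#define BUF_BUDDY_SIZES 5	/* assumed value */

	typedef struct buf_buddy_stat_struct {
		unsigned long		used;		/* blocks in use */
		unsigned long		relocated;	/* relocations done */
		unsigned long long	relocated_usec;	/* time spent relocating */
	} buf_buddy_stat_t;

	struct buf_pool_sketch {
		/* ... mutex, zip_mutex, LRU, free, flush_list, ... */
		unsigned long		buddy_n_frames;	/* debug counter */
		buf_buddy_stat_t	buddy_stat[BUF_BUDDY_SIZES + 1];
	};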
diff --git a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c
index c4b693e3ed2..7a86d12fa69 100644
--- a/storage/innobase/buf/buf0buf.c
+++ b/storage/innobase/buf/buf0buf.c
@@ -84,21 +84,21 @@ in the file along with the file page, resides in the control block.
 The buffer buf_pool contains a single mutex which protects all the
 control data structures of the buf_pool. The content of a buffer frame is
 protected by a separate read-write lock in its control block, though.
-These locks can be locked and unlocked without owning the buf_pool mutex.
+These locks can be locked and unlocked without owning the buf_pool->mutex.
 The OS events in the buf_pool struct can be waited for without owning the
-buf_pool mutex.
+buf_pool->mutex.

-The buf_pool mutex is a hot-spot in main memory, causing a lot of
+The buf_pool->mutex is a hot-spot in main memory, causing a lot of
 memory bus traffic on multiprocessor systems when processors
 alternately access the mutex. On our Pentium, the mutex is accessed
 maybe every 10 microseconds. We gave up the solution to have mutexes
 for each control block, for instance, because it seemed to be
 complicated.

-A solution to reduce mutex contention of the buf_pool mutex is to
+A solution to reduce mutex contention of the buf_pool->mutex is to
 create a separate mutex for the page hash table. On Pentium,
 accessing the hash table takes 2 microseconds, about half
-of the total buf_pool mutex hold time.
+of the total buf_pool->mutex hold time.

 Control blocks
 --------------
@@ -247,22 +247,12 @@ static const int WAIT_FOR_READ	= 5000;
 static const ulint BUF_PAGE_READ_MAX_RETRIES = 100;

 /** The buffer buf_pool of the database */
-UNIV_INTERN buf_pool_t*	buf_pool = NULL;
-
-/** mutex protecting the buffer pool struct and control blocks, except the
-read-write lock in them */
-UNIV_INTERN mutex_t	buf_pool_mutex;
-/** mutex protecting the control blocks of compressed-only pages
-(of type buf_page_t, not buf_block_t) */
-UNIV_INTERN mutex_t	buf_pool_zip_mutex;
+UNIV_INTERN buf_pool_t*	buf_pool_ptr[MAX_BUFFER_POOLS];

 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 static ulint	buf_dbg_counter	= 0; /*!< This is used to insert validation
-					operations in excution in the
+					operations in execution in the
					debug version */
-/** Flag to forbid the release of the buffer pool mutex.
-Protected by buf_pool_mutex. */
-UNIV_INTERN ulint	buf_pool_mutex_exit_forbidden = 0;
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 #ifdef UNIV_DEBUG
 /** If this is set TRUE, the program prints info whenever
@@ -284,7 +274,6 @@
 UNIV_INTERN mysql_pfs_key_t	buffer_block_mutex_key;
 UNIV_INTERN mysql_pfs_key_t	buf_pool_mutex_key;
 UNIV_INTERN mysql_pfs_key_t	buf_pool_zip_mutex_key;
 UNIV_INTERN mysql_pfs_key_t	flush_list_mutex_key;
-UNIV_INTERN mysql_pfs_key_t	flush_order_mutex_key;
 #endif /* UNIV_PFS_MUTEX */

 #if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK
@@ -316,6 +305,140 @@ struct buf_chunk_struct{
 #endif /* !UNIV_HOTBACKUP */

 /********************************************************************//**
+Gets the smallest oldest_modification lsn for any page in the pool. Returns
+zero if all modified pages have been flushed to disk.
+@return oldest modification in pool, zero if none */
+UNIV_INTERN
+ib_uint64_t
+buf_pool_get_oldest_modification(void)
+/*==================================*/
+{
+	ulint		i;
+	buf_page_t*	bpage;
+	ib_uint64_t	lsn = 0;
+	ib_uint64_t	oldest_lsn = 0;
+
+	/* When we traverse all the flush lists we don't want another
+	thread to add a dirty page to any flush list. */
+	log_flush_order_mutex_enter();
+
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_t*	buf_pool;
+
+		buf_pool = buf_pool_from_array(i);
+
+		buf_flush_list_mutex_enter(buf_pool);
+
+		bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+
+		if (bpage != NULL) {
+			ut_ad(bpage->in_flush_list);
+			lsn = bpage->oldest_modification;
+		}
+
+		buf_flush_list_mutex_exit(buf_pool);
+
+		if (!oldest_lsn || oldest_lsn > lsn) {
+			oldest_lsn = lsn;
+		}
+	}
+
+	log_flush_order_mutex_exit();
+
+	/* The returned answer may be out of date: the flush_list can
+	change after the mutex has been released. */
+
+	return(oldest_lsn);
+}
+
+/********************************************************************//**
+Get total buffer pool statistics. */
+UNIV_INTERN
+void
+buf_get_total_list_len(
+/*===================*/
+	ulint*	LRU_len,	/*!< out: length of all LRU lists */
+	ulint*	free_len,	/*!< out: length of all free lists */
+	ulint*	flush_list_len)	/*!< out: length of all flush lists */
+{
+	ulint	i;
+
+	*LRU_len = 0;
+	*free_len = 0;
+	*flush_list_len = 0;
+
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_t*	buf_pool;
+
+		buf_pool = buf_pool_from_array(i);
+		*LRU_len += UT_LIST_GET_LEN(buf_pool->LRU);
+		*free_len += UT_LIST_GET_LEN(buf_pool->free);
+		*flush_list_len += UT_LIST_GET_LEN(buf_pool->flush_list);
+	}
+}
+
+/********************************************************************//**
+Get total buffer pool statistics.
+*/
+UNIV_INTERN
+void
+buf_get_total_stat(
+/*===============*/
+	buf_pool_stat_t*	tot_stat)	/*!< out: buffer pool stats */
+{
+	ulint	i;
+
+	memset(tot_stat, 0, sizeof(*tot_stat));
+
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_stat_t*	buf_stat;
+		buf_pool_t*		buf_pool;
+
+		buf_pool = buf_pool_from_array(i);
+
+		buf_stat = &buf_pool->stat;
+		tot_stat->n_page_gets += buf_stat->n_page_gets;
+		tot_stat->n_pages_read += buf_stat->n_pages_read;
+		tot_stat->n_pages_written += buf_stat->n_pages_written;
+		tot_stat->n_pages_created += buf_stat->n_pages_created;
+		tot_stat->n_ra_pages_read += buf_stat->n_ra_pages_read;
+		tot_stat->n_ra_pages_evicted += buf_stat->n_ra_pages_evicted;
+		tot_stat->n_pages_made_young += buf_stat->n_pages_made_young;
+
+		tot_stat->n_pages_not_made_young +=
+			buf_stat->n_pages_not_made_young;
+	}
+}
+
+/********************************************************************//**
+Allocates a buffer block.
+@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+UNIV_INTERN
+buf_block_t*
+buf_block_alloc(
+/*============*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ulint		zip_size)	/*!< in: compressed page size in bytes,
+					or 0 if uncompressed tablespace */
+{
+	buf_block_t*	block;
+	ulint		index;
+	static ulint	buf_pool_index;
+
+	if (buf_pool == NULL) {
+		/* We are allocating memory from any buffer pool; ensure
+		we spread the allocations over all buffer pool
+		instances. */
+		index = buf_pool_index++ % srv_buf_pool_instances;
+		buf_pool = buf_pool_from_array(index);
+	}
+
+	block = buf_LRU_get_free_block(buf_pool, zip_size);
+
+	buf_block_set_state(block, BUF_BLOCK_MEMORY);
+
+	return(block);
+}
+
+/********************************************************************//**
 Calculates a page checksum which is stored to the page when it is written
 to a file. Note that we must be careful to calculate the same value on
 32-bit and 64-bit architectures.
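A point worth noting in buf_block_alloc() above: when the caller passes buf_pool == NULL, a function-local static cursor spreads successive allocations over all instances round-robin. The cursor is read and incremented without synchronization, which appears deliberate: a rare race merely skews the distribution slightly, and the modulo keeps the index in range. Rendered standalone:

	extern unsigned long	srv_buf_pool_instances;

	/* Round-robin instance selection as in buf_block_alloc() above;
	the unsynchronized static mirrors the committed code. */
	static unsigned long
	pick_buf_pool_instance(void)
	{
		static unsigned long	buf_pool_index;

		return(buf_pool_index++ % srv_buf_pool_instances);
	}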
@@ -727,13 +850,15 @@ static
 void
 buf_block_init(
 /*===========*/
-	buf_block_t*	block,	/*!< in: pointer to control block */
-	byte*		frame)	/*!< in: pointer to buffer frame */
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	buf_block_t*	block,		/*!< in: pointer to control block */
+	byte*		frame)		/*!< in: pointer to buffer frame */
 {
 	UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE, block);

 	block->frame = frame;

+	block->page.buf_pool = buf_pool;
 	block->page.state = BUF_BLOCK_NOT_USED;
 	block->page.buf_fix_count = 0;
 	block->page.io_fix = BUF_IO_NONE;
@@ -789,6 +914,7 @@ static
 buf_chunk_t*
 buf_chunk_init(
 /*===========*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
 	buf_chunk_t*	chunk,		/*!< out: chunk of buffers */
 	ulint		mem_size)	/*!< in: requested size in bytes */
 {
@@ -844,7 +970,7 @@ buf_chunk_init(

 	for (i = chunk->size; i--; ) {

-		buf_block_init(block, frame);
+		buf_block_init(buf_pool, block, frame);

 #ifdef HAVE_purify
 		/* Wipe contents of frame to eliminate a Purify warning */
@@ -852,7 +978,9 @@ buf_chunk_init(
 #endif
 		/* Add the block to the free list */
 		UT_LIST_ADD_LAST(list, buf_pool->free, (&block->page));
+
 		ut_d(block->page.in_free_list = TRUE);
+		ut_ad(buf_pool_from_block(block) == buf_pool);

 		block++;
 		frame += UNIV_PAGE_SIZE;
@@ -879,9 +1007,6 @@ buf_chunk_contains_zip(
 	buf_block_t*	block;
 	ulint		i;

-	ut_ad(buf_pool);
-	ut_ad(buf_pool_mutex_own());
-
 	block = chunk->blocks;

 	for (i = chunk->size; i--; block++) {
@@ -902,12 +1027,16 @@ UNIV_INTERN
 buf_block_t*
 buf_pool_contains_zip(
 /*==================*/
-	const void*	data)	/*!< in: pointer to compressed page */
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	const void*	data)		/*!< in: pointer to compressed page */
 {
 	ulint		n;
 	buf_chunk_t*	chunk = buf_pool->chunks;

+	ut_ad(buf_pool);
+	ut_ad(buf_pool_mutex_own(buf_pool));
 	for (n = buf_pool->n_chunks; n--; chunk++) {
+
 		buf_block_t* block = buf_chunk_contains_zip(chunk, data);

 		if (block) {
@@ -931,9 +1060,6 @@ buf_chunk_not_freed(
 	buf_block_t*	block;
 	ulint		i;

-	ut_ad(buf_pool);
-	ut_ad(buf_pool_mutex_own());
-
 	block = chunk->blocks;

 	for (i = chunk->size; i--; block++) {
@@ -983,9 +1109,6 @@ buf_chunk_all_free(
 	const buf_block_t*	block;
 	ulint			i;

-	ut_ad(buf_pool);
-	ut_ad(buf_pool_mutex_own());
-
 	block = chunk->blocks;

 	for (i = chunk->size; i--; block++) {
@@ -1005,12 +1128,13 @@ static
 void
 buf_chunk_free(
 /*===========*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
 	buf_chunk_t*	chunk)		/*!< out: chunk of buffers */
 {
 	buf_block_t*		block;
 	const buf_block_t*	block_end;

-	ut_ad(buf_pool_mutex_own());
+	ut_ad(buf_pool_mutex_own(buf_pool));

 	block_end = chunk->blocks + chunk->size;
@@ -1038,55 +1162,83 @@ buf_chunk_free(
 }

 /********************************************************************//**
-Creates the buffer pool.
-@return own: buf_pool object, NULL if not enough memory or error */
+Set buffer pool size variables after resizing it */
+static
+void
+buf_pool_set_sizes(void)
+/*====================*/
+{
+	ulint	i;
+	ulint	curr_size = 0;
+
+	buf_pool_mutex_enter_all();
+
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_t*	buf_pool;
+
+		buf_pool = buf_pool_from_array(i);
+		curr_size += buf_pool->curr_pool_size;
+	}
+
+	srv_buf_pool_curr_size = curr_size;
+	srv_buf_pool_old_size = srv_buf_pool_size;
+
+	buf_pool_mutex_exit_all();
+}
+
+/********************************************************************//**
+Initialize a buffer pool instance.
+@return DB_SUCCESS if all goes well. */
UNIV_INTERN
-buf_pool_t*
-buf_pool_init(void)
-/*===============*/
+ulint
+buf_pool_init_instance(
+/*===================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ulint		buf_pool_size,	/*!< in: size in bytes */
+	ulint		instance_no)	/*!< in: id of the instance */
 {
-	buf_chunk_t*	chunk;
 	ulint		i;
-
-	buf_pool = mem_zalloc(sizeof(buf_pool_t));
+	buf_chunk_t*	chunk;

 	/* 1. Initialize general fields
 	------------------------------- */
 	mutex_create(buf_pool_mutex_key,
-		     &buf_pool_mutex, SYNC_BUF_POOL);
+		     &buf_pool->mutex, SYNC_BUF_POOL);
 	mutex_create(buf_pool_zip_mutex_key,
-		     &buf_pool_zip_mutex, SYNC_BUF_BLOCK);
+		     &buf_pool->zip_mutex, SYNC_BUF_BLOCK);

-	buf_pool_mutex_enter();
+	buf_pool_mutex_enter(buf_pool);

-	buf_pool->n_chunks = 1;
-	buf_pool->chunks = chunk = mem_alloc(sizeof *chunk);
+	if (buf_pool_size > 0) {
+		buf_pool->n_chunks = 1;
+		buf_pool->chunks = chunk = mem_zalloc(sizeof *chunk);

-	UT_LIST_INIT(buf_pool->free);
+		UT_LIST_INIT(buf_pool->free);

-	if (!buf_chunk_init(chunk, srv_buf_pool_size)) {
-		mem_free(chunk);
-		mem_free(buf_pool);
-		buf_pool = NULL;
-		return(NULL);
-	}
+		if (!buf_chunk_init(buf_pool, chunk, buf_pool_size)) {
+			mem_free(chunk);
+			mem_free(buf_pool);

-	srv_buf_pool_old_size = srv_buf_pool_size;
-	buf_pool->curr_size = chunk->size;
-	srv_buf_pool_curr_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
+			buf_pool_mutex_exit(buf_pool);

-	buf_pool->page_hash = hash_create(2 * buf_pool->curr_size);
-	buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
+			return(DB_ERROR);
+		}

-	buf_pool->last_printout_time = time(NULL);
+		buf_pool->instance_no = instance_no;
+		buf_pool->old_pool_size = buf_pool_size;
+		buf_pool->curr_size = chunk->size;
+		buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE;

+		buf_pool->page_hash = hash_create(2 * buf_pool->curr_size);
+		buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
+
+		buf_pool->last_printout_time = ut_time();
+	}
 	/* 2. Initialize flushing fields
 	-------------------------------- */

 	mutex_create(flush_list_mutex_key, &buf_pool->flush_list_mutex,
 		     SYNC_BUF_FLUSH_LIST);
-	mutex_create(flush_order_mutex_key, &buf_pool->flush_order_mutex,
-		     SYNC_BUF_FLUSH_ORDER);

 	for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
 		buf_pool->no_flush[i] = os_event_create(NULL);
@@ -1094,26 +1246,22 @@ buf_pool_init(void)
 	/* 3. Initialize LRU fields
 	--------------------------- */

-	/* All fields are initialized by mem_zalloc(). */
-
-	buf_pool_mutex_exit();
-
-	btr_search_sys_create(buf_pool->curr_size
-			      * UNIV_PAGE_SIZE / sizeof(void*) / 64);
-
 	/* 4. Initialize the buddy allocator fields */

 	/* All fields are initialized by mem_zalloc(). */

-	return(buf_pool);
+	buf_pool_mutex_exit(buf_pool);
+
+	return(DB_SUCCESS);
 }

 /********************************************************************//**
-Frees the buffer pool at shutdown. This must not be invoked before
-freeing all mutexes. */
-UNIV_INTERN
+free one buffer pool instance */
+static
 void
-buf_pool_free(void)
-/*===============*/
+buf_pool_free_instance(
+/*===================*/
+	buf_pool_t*	buf_pool)	/* in,own: buffer pool instance
+					to free */
 {
 	buf_chunk_t*	chunk;
 	buf_chunk_t*	chunks;
@@ -1135,6 +1283,139 @@ buf_pool_free(void)
 }

 /********************************************************************//**
 Creates the buffer pool.
+@return DB_SUCCESS if success, DB_ERROR if not enough memory or error */
+UNIV_INTERN
+ulint
+buf_pool_init(
+/*==========*/
+	ulint	total_size,	/*!< in: size of the total pool in bytes */
+	ulint	n_instances)	/*!< in: number of instances */
+{
+	ulint	i;
+
+	/* We create an extra buffer pool instance; it is used for
+	flushing the flush lists, to keep track of n_flush for all
+	the buffer pools, and as a waiting object during flushing. */
+	for (i = 0; i < n_instances; i++) {
+		buf_pool_t*	ptr;
+		ulint		size;
+
+		ptr = mem_zalloc(sizeof(*ptr));
+
+		size = total_size / n_instances;
+
+		buf_pool_ptr[i] = ptr;
+
+		if (buf_pool_init_instance(ptr, size, i) != DB_SUCCESS) {
+
+			mem_free(buf_pool_ptr[i]);
+
+			/* Free all the instances created so far. */
+			buf_pool_free(i);
+
+			return(DB_ERROR);
+		}
+	}
+
+	buf_pool_set_sizes();
+	buf_LRU_old_ratio_update(100 * 3 / 8, FALSE);
+
+	btr_search_sys_create(buf_pool_get_curr_size() / sizeof(void*) / 64);
+
+	return(DB_SUCCESS);
+}
+
+/********************************************************************//**
+Frees the buffer pool at shutdown. This must not be invoked before
+freeing all mutexes. */
+UNIV_INTERN
+void
+buf_pool_free(
+/*==========*/
+	ulint	n_instances)	/*!< in: number of instances to free */
+{
+	ulint	i;
+
+	for (i = 0; i < n_instances; i++) {
+		buf_pool_free_instance(buf_pool_from_array(i));
+		buf_pool_ptr[i] = NULL;
+	}
+}
+
+/********************************************************************//**
+Drops adaptive hash index for a buffer pool instance. */
+static
+void
+buf_pool_drop_hash_index_instance(
+/*==============================*/
+	buf_pool_t*	buf_pool,		/*!< in: buffer pool instance */
+	ibool*		released_search_latch)	/*!< out: flag for signalling
+						whether the search latch was
+						released */
+{
+	buf_chunk_t*	chunks	= buf_pool->chunks;
+	buf_chunk_t*	chunk	= chunks + buf_pool->n_chunks;
+
+	while (--chunk >= chunks) {
+		ulint		i;
+		buf_block_t*	block	= chunk->blocks;
+
+		for (i = chunk->size; i--; block++) {
+			/* block->is_hashed cannot be modified
+			when we have an x-latch on btr_search_latch;
+			see the comment in buf0buf.h */
+
+			if (!block->is_hashed) {
+				continue;
+			}
+
+			/* To follow the latching order, we
+			have to release btr_search_latch
+			before acquiring block->latch. */
+			rw_lock_x_unlock(&btr_search_latch);
+			/* When we release the search latch,
+			we must rescan all blocks, because
+			some may become hashed again. */
+			*released_search_latch = TRUE;
+
+			rw_lock_x_lock(&block->lock);
+
+			/* This should be guaranteed by the
+			callers, which will be holding
+			btr_search_enabled_mutex. */
+			ut_ad(!btr_search_enabled);
+
+			/* Because we did not buffer-fix the
+			block by calling buf_block_get_gen(),
+			it is possible that the block has been
+			allocated for some other use after
+			btr_search_latch was released above.
+			We do not care which file page the
+			block is mapped to.  All we want to do
+			is to drop any hash entries referring
+			to the page. */
+
+			/* It is possible that
+			block->page.state != BUF_FILE_PAGE.
+			Even that does not matter, because
+			btr_search_drop_page_hash_index() will
+			check block->is_hashed before doing
+			anything.  block->is_hashed can only
+			be set on uncompressed file pages. */
+
+			btr_search_drop_page_hash_index(block);
+
+			rw_lock_x_unlock(&block->lock);
+
+			rw_lock_x_lock(&btr_search_latch);
+
+			ut_ad(!btr_search_enabled);
+		}
+	}
+}
+
+/********************************************************************//**
 Drops the adaptive hash index.
 To prevent a livelock, this function is only to be called while
 holding btr_search_latch and while btr_search_enabled == FALSE. */
@@ -1151,69 +1432,19 @@ buf_pool_drop_hash_index(void)
 	ut_ad(!btr_search_enabled);

 	do {
-		buf_chunk_t*	chunks	= buf_pool->chunks;
-		buf_chunk_t*	chunk	= chunks + buf_pool->n_chunks;
+		ulint	i;

 		released_search_latch = FALSE;

-		while (--chunk >= chunks) {
-			buf_block_t*	block	= chunk->blocks;
-			ulint		i	= chunk->size;
-
-			for (; i--; block++) {
-				/* block->is_hashed cannot be modified
-				when we have an x-latch on btr_search_latch;
-				see the comment in buf0buf.h */
+		for (i = 0; i < srv_buf_pool_instances; i++) {
+			buf_pool_t*	buf_pool;

-				if (buf_block_get_state(block)
-				    != BUF_BLOCK_FILE_PAGE
-				    || !block->is_hashed) {
-					continue;
-				}
+			buf_pool = buf_pool_from_array(i);

-				/* To follow the latching order, we
-				have to release btr_search_latch
-				before acquiring block->latch. */
-				rw_lock_x_unlock(&btr_search_latch);
-				/* When we release the search latch,
-				we must rescan all blocks, because
-				some may become hashed again. */
-				released_search_latch = TRUE;
-
-				rw_lock_x_lock(&block->lock);
-
-				/* This should be guaranteed by the
-				callers, which will be holding
-				btr_search_enabled_mutex. */
-				ut_ad(!btr_search_enabled);
-
-				/* Because we did not buffer-fix the
-				block by calling buf_block_get_gen(),
-				it is possible that the block has been
-				allocated for some other use after
-				btr_search_latch was released above.
-				We do not care which file page the
-				block is mapped to.  All we want to do
-				is to drop any hash entries referring
-				to the page. */
-
-				/* It is possible that
-				block->page.state != BUF_FILE_PAGE.
-				Even that does not matter, because
-				btr_search_drop_page_hash_index() will
-				check block->is_hashed before doing
-				anything.  block->is_hashed can only
-				be set on uncompressed file pages. */
-
-				btr_search_drop_page_hash_index(block);
-
-				rw_lock_x_unlock(&block->lock);
-
-				rw_lock_x_lock(&btr_search_latch);
-
-				ut_ad(!btr_search_enabled);
-			}
+			buf_pool_drop_hash_index_instance(
+				buf_pool, &released_search_latch);
 		}
+
 	} while (released_search_latch);
 }
@@ -1232,16 +1463,18 @@ buf_relocate(
 {
 	buf_page_t*	b;
 	ulint		fold;
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);

-	ut_ad(buf_pool_mutex_own());
+	ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
 	ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
 	ut_a(bpage->buf_fix_count == 0);
 	ut_ad(bpage->in_LRU_list);
 	ut_ad(!bpage->in_zip_hash);
 	ut_ad(bpage->in_page_hash);
-	ut_ad(bpage == buf_page_hash_get(bpage->space, bpage->offset));
-	ut_ad(!buf_pool_watch_is(bpage));
+	ut_ad(bpage == buf_page_hash_get(buf_pool,
+					 bpage->space, bpage->offset));
+	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
 #ifdef UNIV_DEBUG
 	switch (buf_page_get_state(bpage)) {
 	case BUF_BLOCK_ZIP_FREE:
@@ -1300,12 +1533,13 @@ buf_relocate(
 }

 /********************************************************************//**
-Shrinks the buffer pool. */
+Shrinks a buffer pool instance.
*/
static
 void
-buf_pool_shrink(
-/*============*/
-	ulint	chunk_size)	/*!< in: number of pages to remove */
+buf_pool_shrink_instance(
+/*=====================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ulint		chunk_size)	/*!< in: number of pages to remove */
 {
 	buf_chunk_t*	chunks;
 	buf_chunk_t*	chunk;
@@ -1314,11 +1548,11 @@ buf_pool_shrink_instance(
 	buf_chunk_t*	max_chunk;
 	buf_chunk_t*	max_free_chunk;

-	ut_ad(!buf_pool_mutex_own());
+	ut_ad(!buf_pool_mutex_own(buf_pool));
try_again:
 	btr_search_disable(); /* Empty the adaptive hash index again */
-	buf_pool_mutex_enter();
+	buf_pool_mutex_enter(buf_pool);

shrink_again:
 	if (buf_pool->n_chunks <= 1) {
@@ -1381,7 +1615,7 @@ shrink_again:

 			mutex_enter(&block->mutex);
 			/* The following calls will temporarily
-			release block->mutex and buf_pool_mutex.
+			release block->mutex and buf_pool->mutex.
 			Therefore, we have to always retry,
 			even if !dirty && !nonfree. */
@@ -1397,7 +1631,7 @@ shrink_again:
 			mutex_exit(&block->mutex);
 		}

-		buf_pool_mutex_exit();
+		buf_pool_mutex_exit(buf_pool);

 		/* Request for a flush of the chunk if it helps.
 		Do not flush if there are non-free blocks, since
@@ -1406,10 +1640,10 @@ shrink_again:
 			/* Avoid busy-waiting. */
 			os_thread_sleep(100000);
 		} else if (dirty
-			   && buf_flush_batch(BUF_FLUSH_LRU, dirty, 0)
-			   == ULINT_UNDEFINED) {
+			   && buf_flush_LRU(buf_pool, dirty)
+			      == ULINT_UNDEFINED) {

-			buf_flush_wait_batch_end(BUF_FLUSH_LRU);
+			buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
 		}

 		goto try_again;
@@ -1418,7 +1652,7 @@ shrink_again:
 	max_size = max_free_size;
 	max_chunk = max_free_chunk;

-	srv_buf_pool_old_size = srv_buf_pool_size;
+	buf_pool->old_pool_size = buf_pool->curr_pool_size;

 	/* Rewrite buf_pool->chunks.  Copy everything but max_chunk. */
 	chunks = mem_alloc((buf_pool->n_chunks - 1) * sizeof *chunks);
@@ -1430,9 +1664,9 @@ shrink_again:
	       - (max_chunk + 1));
 	ut_a(buf_pool->curr_size > max_chunk->size);
 	buf_pool->curr_size -= max_chunk->size;
-	srv_buf_pool_curr_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
+	buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
 	chunk_size -= max_chunk->size;
-	buf_chunk_free(max_chunk);
+	buf_chunk_free(buf_pool, max_chunk);
 	mem_free(buf_pool->chunks);
 	buf_pool->chunks = chunks;
 	buf_pool->n_chunks--;
@@ -1442,29 +1676,53 @@ shrink_again:
 		goto shrink_again;
 	}
+	goto func_exit;

func_done:
-	srv_buf_pool_old_size = srv_buf_pool_size;
+	buf_pool->old_pool_size = buf_pool->curr_pool_size;
func_exit:
-	buf_pool_mutex_exit();
+	buf_pool_mutex_exit(buf_pool);
 	btr_search_enable();
 }

 /********************************************************************//**
-Rebuild buf_pool->page_hash. */
+Shrinks the buffer pool. */
static
 void
-buf_pool_page_hash_rebuild(void)
-/*============================*/
+buf_pool_shrink(
+/*============*/
+	ulint	chunk_size)	/*!< in: number of pages to remove */
+{
+	ulint	i;
+
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_t*	buf_pool;
+		ulint		instance_chunk_size;
+
+		instance_chunk_size = chunk_size / srv_buf_pool_instances;
+		buf_pool = buf_pool_from_array(i);
+		buf_pool_shrink_instance(buf_pool, instance_chunk_size);
+	}
+
+	buf_pool_set_sizes();
+}
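Note that buf_pool_shrink() above splits the requested reduction with integer division, so up to srv_buf_pool_instances - 1 pages of the request are silently dropped. A worked example:

	#include <assert.h>

	static void
	shrink_share_example(void)
	{
		unsigned long	chunk_size = 1000; /* pages requested removed */
		unsigned long	instances = 3;	   /* srv_buf_pool_instances */
		unsigned long	share = chunk_size / instances;	/* == 333 */

		/* one page of the request is lost to integer division */
		assert(chunk_size - share * instances == 1);
	}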
+
+/********************************************************************//**
+Rebuild buf_pool->page_hash for a buffer pool instance. */
+static
+void
+buf_pool_page_hash_rebuild_instance(
+/*================================*/
+	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
 {
 	ulint		i;
-	ulint		n_chunks;
+	buf_page_t*	b;
 	buf_chunk_t*	chunk;
-	hash_table_t*	page_hash;
+	ulint		n_chunks;
 	hash_table_t*	zip_hash;
-	buf_page_t*	b;
+	hash_table_t*	page_hash;

-	buf_pool_mutex_enter();
+	buf_pool_mutex_enter(buf_pool);

 	/* Free, create, and populate the hash table. */
 	hash_table_free(buf_pool->page_hash);
@@ -1517,7 +1775,7 @@ buf_pool_page_hash_rebuild_instance(
			    buf_page_address_fold(b->space, b->offset), b);
 	}

-	buf_flush_list_mutex_enter();
+	buf_flush_list_mutex_enter(buf_pool);
 	for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
	     b = UT_LIST_GET_NEXT(list, b)) {
 		ut_ad(b->in_flush_list);
@@ -1545,85 +1803,24 @@ buf_pool_page_hash_rebuild_instance(
 		}
 	}

-	buf_flush_list_mutex_exit();
-	buf_pool_mutex_exit();
+	buf_flush_list_mutex_exit(buf_pool);
+	buf_pool_mutex_exit(buf_pool);
 }

-/********************************************************************//**
-Resizes the buffer pool. */
-UNIV_INTERN
-void
-buf_pool_resize(void)
-/*=================*/
-{
-	buf_pool_mutex_enter();
-
-	if (srv_buf_pool_old_size == srv_buf_pool_size) {
-
-		buf_pool_mutex_exit();
-		return;
-	}
-
-	if (srv_buf_pool_curr_size + 1048576 > srv_buf_pool_size) {
-
-		buf_pool_mutex_exit();
-
-		/* Disable adaptive hash indexes and empty the index
-		in order to free up memory in the buffer pool chunks. */
-		buf_pool_shrink((srv_buf_pool_curr_size - srv_buf_pool_size)
-				/ UNIV_PAGE_SIZE);
-	} else if (srv_buf_pool_curr_size + 1048576 < srv_buf_pool_size) {
-
-		/* Enlarge the buffer pool by at least one megabyte */
-
-		ulint		mem_size
-			= srv_buf_pool_size - srv_buf_pool_curr_size;
-		buf_chunk_t*	chunks;
-		buf_chunk_t*	chunk;
-
-		chunks = mem_alloc((buf_pool->n_chunks + 1) * sizeof *chunks);
-
-		memcpy(chunks, buf_pool->chunks, buf_pool->n_chunks
		       * sizeof *chunks);
-
-		chunk = &chunks[buf_pool->n_chunks];
-
-		if (!buf_chunk_init(chunk, mem_size)) {
-			mem_free(chunks);
-		} else {
-			buf_pool->curr_size += chunk->size;
-			srv_buf_pool_curr_size = buf_pool->curr_size
				* UNIV_PAGE_SIZE;
-			mem_free(buf_pool->chunks);
-			buf_pool->chunks = chunks;
-			buf_pool->n_chunks++;
-		}
-
-		srv_buf_pool_old_size = srv_buf_pool_size;
-		buf_pool_mutex_exit();
-	}
-
-	buf_pool_page_hash_rebuild();
-}
-
-/** Maximum number of concurrent buffer pool watches */
-#define BUF_POOL_WATCH_SIZE 1
-/** Sentinel records for buffer pool watches. Protected by buf_pool_mutex. */
-static buf_page_t buf_pool_watch[BUF_POOL_WATCH_SIZE];
-
 /********************************************************************
 Determine if a block is a sentinel for a buffer pool watch.
 @return TRUE if a sentinel for a buffer pool watch, FALSE if not */
UNIV_INTERN
 ibool
-buf_pool_watch_is(
-/*==============*/
-	const buf_page_t*	bpage)	/*!< in: block */
+buf_pool_watch_is_sentinel(
+/*=======================*/
+	buf_pool_t*		buf_pool,	/*!< buffer pool instance */
+	const buf_page_t*	bpage)		/*!< in: block */
 {
 	ut_ad(buf_page_in_file(bpage));

-	if (UNIV_LIKELY(bpage < &buf_pool_watch[0]
-			|| bpage >= &buf_pool_watch[BUF_POOL_WATCH_SIZE])) {
+	if (bpage < &buf_pool->watch[0]
	    || bpage >= &buf_pool->watch[BUF_POOL_WATCH_SIZE]) {

 		ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_PAGE
		      || bpage->zip.data != NULL);
@@ -1653,13 +1850,14 @@ buf_pool_watch_set(
 {
 	buf_page_t*	bpage;
 	ulint		i;
+	buf_pool_t*	buf_pool = buf_pool_get(space, offset);

-	ut_ad(buf_pool_mutex_own());
+	ut_ad(buf_pool_mutex_own(buf_pool));

-	bpage = buf_page_hash_get_low(space, offset, fold);
+	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);

 	if (UNIV_LIKELY_NULL(bpage)) {
-		if (!buf_pool_watch_is(bpage)) {
+		if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) {
 			/* The page was loaded meanwhile. */
 			return(bpage);
 		}
@@ -1669,7 +1867,7 @@ buf_pool_watch_set(
 	}

 	for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
-		bpage = &buf_pool_watch[i];
+		bpage = &buf_pool->watch[i];

 		ut_ad(bpage->access_time == 0);
 		ut_ad(bpage->newest_modification == 0);
@@ -1685,7 +1883,7 @@ buf_pool_watch_set(
 			/* bpage is pointing to buf_pool_watch[],
 			which is protected by buf_pool_mutex.
 			Normally, buf_page_t objects are protected by
-			buf_block_t::mutex or buf_pool_zip_mutex or both. */
+			buf_block_t::mutex or buf_pool->zip_mutex or both. */

 			bpage->state = BUF_BLOCK_ZIP_PAGE;
 			bpage->space = space;
@@ -1715,6 +1913,123 @@ buf_pool_watch_set(
 	return(NULL);
 }

+/********************************************************************//**
+Rebuild buf_pool->page_hash. */
+static
+void
+buf_pool_page_hash_rebuild(void)
+/*============================*/
+{
+	ulint	i;
+
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_page_hash_rebuild_instance(buf_pool_from_array(i));
+	}
+}
+
+/********************************************************************//**
+Increase the buffer pool size of one buffer pool instance. */
+static
+void
+buf_pool_increase_instance(
+/*=======================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ulint		change_size)	/*!< in: new size of the pool */
+{
+	buf_chunk_t*	chunks;
+	buf_chunk_t*	chunk;
+
+	buf_pool_mutex_enter(buf_pool);
+	chunks = mem_alloc((buf_pool->n_chunks + 1) * sizeof *chunks);
+
+	memcpy(chunks, buf_pool->chunks, buf_pool->n_chunks * sizeof *chunks);
+
+	chunk = &chunks[buf_pool->n_chunks];
+
+	if (!buf_chunk_init(buf_pool, chunk, change_size)) {
+		mem_free(chunks);
+	} else {
+		buf_pool->old_pool_size = buf_pool->curr_pool_size;
+		buf_pool->curr_size += chunk->size;
+		buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
+		mem_free(buf_pool->chunks);
+		buf_pool->chunks = chunks;
+		buf_pool->n_chunks++;
+	}
+
+	buf_pool_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Increase the buffer pool size. */
+static
+void
+buf_pool_increase(
+/*==============*/
+	ulint	change_size)
+{
+	ulint	i;
+
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_increase_instance(
+			buf_pool_from_array(i),
+			change_size / srv_buf_pool_instances);
+	}
+
+	buf_pool_set_sizes();
+}
+
+/********************************************************************//**
+Resizes the buffer pool.
+*/
+UNIV_INTERN
+void
+buf_pool_resize(void)
+/*=================*/
+{
+	ulint	change_size;
+	ulint	min_change_size = 1048576 * srv_buf_pool_instances;
+
+	buf_pool_mutex_enter_all();
+
+	if (srv_buf_pool_old_size == srv_buf_pool_size) {
+
+		buf_pool_mutex_exit_all();
+
+		return;
+
+	} else if (srv_buf_pool_curr_size + min_change_size
		   > srv_buf_pool_size) {
+
+		change_size = (srv_buf_pool_curr_size - srv_buf_pool_size)
			      / UNIV_PAGE_SIZE;
+
+		buf_pool_mutex_exit_all();
+
+		/* Disable adaptive hash indexes and empty the index
+		in order to free up memory in the buffer pool chunks. */
+		buf_pool_shrink(change_size);
+
+	} else if (srv_buf_pool_curr_size + min_change_size
		   < srv_buf_pool_size) {
+
+		/* Enlarge the buffer pool by at least one megabyte */
+
+		change_size = srv_buf_pool_size - srv_buf_pool_curr_size;
+
+		buf_pool_mutex_exit_all();
+
+		buf_pool_increase(change_size);
+	} else {
+		srv_buf_pool_size = srv_buf_pool_old_size;
+
+		buf_pool_mutex_exit_all();
+
+		return;
+	}
+
+	buf_pool_page_hash_rebuild();
+}
+
 /****************************************************************//**
 Remove the sentinel block for the watch before replacing it with a real block.
 buf_page_watch_clear() or buf_page_watch_occurred() will notice that
@@ -1724,10 +2039,12 @@ static
 void
 buf_pool_watch_remove(
 /*==================*/
-	ulint		fold,	/*!< in: buf_page_address_fold(space, offset) */
-	buf_page_t*	watch)	/*!< in/out: sentinel for watch */
+	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
+	ulint		fold,		/*!< in: buf_page_address_fold(
					space, offset) */
+	buf_page_t*	watch)		/*!< in/out: sentinel for watch */
 {
-	ut_ad(buf_pool_mutex_own());
+	ut_ad(buf_pool_mutex_own(buf_pool));

 	HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, watch);
 	ut_d(watch->in_page_hash = FALSE);
@@ -1746,16 +2063,18 @@ buf_pool_watch_unset(
 	ulint	offset)	/*!< in: page number */
 {
 	buf_page_t*	bpage;
-	ulint		fold = buf_page_address_fold(space, offset);
+	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
+	ulint		fold = buf_page_address_fold(space, offset);

-	buf_pool_mutex_enter();
-	bpage = buf_page_hash_get_low(space, offset, fold);
+	buf_pool_mutex_enter(buf_pool);
+	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
 	/* The page must exist because buf_pool_watch_set()
 	increments buf_fix_count. */
 	ut_a(bpage);

-	if (UNIV_UNLIKELY(!buf_pool_watch_is(bpage))) {
+	if (UNIV_UNLIKELY(!buf_pool_watch_is_sentinel(buf_pool, bpage))) {
 		mutex_t* mutex = buf_page_get_mutex(bpage);
+
 		mutex_enter(mutex);
 		ut_a(bpage->buf_fix_count > 0);
 		bpage->buf_fix_count--;
@@ -1764,11 +2083,11 @@ buf_pool_watch_unset(
 		ut_a(bpage->buf_fix_count > 0);

 		if (UNIV_LIKELY(!--bpage->buf_fix_count)) {
-			buf_pool_watch_remove(fold, bpage);
+			buf_pool_watch_remove(buf_pool, fold, bpage);
 		}
 	}

-	buf_pool_mutex_exit();
+	buf_pool_mutex_exit(buf_pool);
 }
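The watch functions above (buf_pool_watch_set(), buf_pool_watch_unset(), and buf_pool_watch_occurred() just below) form a small API: a caller plants a sentinel descriptor in the page hash, lets concurrent activity proceed, and later asks whether a real page replaced the sentinel. A hedged usage sketch, assuming the declarations from buf0buf.h; the real callers are outside this excerpt:

	#include "buf0buf.h"

	/* Sketch only: illustrates the calling convention, not a
	committed caller. */
	static void
	watch_usage_sketch(ulint space, ulint offset, ulint fold)
	{
		buf_pool_t*	buf_pool = buf_pool_get(space, offset);

		/* buf_pool_watch_set() asserts buf_pool_mutex_own(). */
		buf_pool_mutex_enter(buf_pool);
		buf_pool_watch_set(space, offset, fold);
		buf_pool_mutex_exit(buf_pool);

		/* ... other threads may read the page into the pool ... */

		if (buf_pool_watch_occurred(space, offset)) {
			/* a real page appeared under the watched address */
		}

		buf_pool_watch_unset(space, offset);
	}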
 /****************************************************************//**
@@ -1783,18 +2102,19 @@ buf_pool_watch_occurred(
 	ulint	space,	/*!< in: space id */
 	ulint	offset)	/*!< in: page number */
 {
+	ibool		ret;
 	buf_page_t*	bpage;
+	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
 	ulint		fold	= buf_page_address_fold(space, offset);
-	ibool		ret;

-	buf_pool_mutex_enter();
+	buf_pool_mutex_enter(buf_pool);

-	bpage = buf_page_hash_get_low(space, offset, fold);
+	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
 	/* The page must exist because buf_pool_watch_set()
 	increments buf_fix_count. */
 	ut_a(bpage);
-	ret = !buf_pool_watch_is(bpage);
-	buf_pool_mutex_exit();
+	ret = !buf_pool_watch_is_sentinel(buf_pool, bpage);
+	buf_pool_mutex_exit(buf_pool);

 	return(ret);
 }
@@ -1809,13 +2129,15 @@ buf_page_make_young(
 /*================*/
 	buf_page_t*	bpage)	/*!< in: buffer block of a file page */
 {
-	buf_pool_mutex_enter();
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+
+	buf_pool_mutex_enter(buf_pool);

 	ut_a(buf_page_in_file(bpage));

 	buf_LRU_make_block_young(bpage);

-	buf_pool_mutex_exit();
+	buf_pool_mutex_exit(buf_pool);
 }

 /********************************************************************//**
@@ -1833,18 +2155,20 @@ buf_page_set_accessed_make_young(
					read under mutex protection,
					or 0 if unknown */
 {
-	ut_ad(!buf_pool_mutex_own());
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+
+	ut_ad(!buf_pool_mutex_own(buf_pool));
 	ut_a(buf_page_in_file(bpage));

 	if (buf_page_peek_if_too_old(bpage)) {
-		buf_pool_mutex_enter();
+		buf_pool_mutex_enter(buf_pool);
 		buf_LRU_make_block_young(bpage);
-		buf_pool_mutex_exit();
+		buf_pool_mutex_exit(buf_pool);
 	} else if (!access_time) {
 		ulint	time_ms = ut_time_ms();
-		buf_pool_mutex_enter();
+		buf_pool_mutex_enter(buf_pool);
 		buf_page_set_accessed(bpage, time_ms);
-		buf_pool_mutex_exit();
+		buf_pool_mutex_exit(buf_pool);
 	}
 }

@@ -1859,17 +2183,18 @@ buf_reset_check_index_page_at_flush(
 	ulint	offset)	/*!< in: page number */
 {
 	buf_block_t*	block;
+	buf_pool_t*	buf_pool = buf_pool_get(space, offset);

-	buf_pool_mutex_enter();
+	buf_pool_mutex_enter(buf_pool);

-	block = (buf_block_t*) buf_page_hash_get(space, offset);
+	block = (buf_block_t*) buf_page_hash_get(buf_pool, space, offset);

 	if (block && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE) {
-		ut_ad(!buf_pool_watch_is(&block->page));
+		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));
 		block->check_index_page_at_flush = FALSE;
 	}

-	buf_pool_mutex_exit();
+	buf_pool_mutex_exit(buf_pool);
 }

 /********************************************************************//**
@@ -1886,19 +2211,20 @@ buf_page_peek_if_search_hashed(
 {
 	buf_block_t*	block;
 	ibool		is_hashed;
+	buf_pool_t*	buf_pool = buf_pool_get(space, offset);

-	buf_pool_mutex_enter();
+	buf_pool_mutex_enter(buf_pool);

-	block = (buf_block_t*) buf_page_hash_get(space, offset);
+	block = (buf_block_t*) buf_page_hash_get(buf_pool, space, offset);

 	if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
 		is_hashed = FALSE;
 	} else {
-		ut_ad(!buf_pool_watch_is(&block->page));
+		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));
 		is_hashed = block->is_hashed;
 	}

-	buf_pool_mutex_exit();
+	buf_pool_mutex_exit(buf_pool);

 	return(is_hashed);
 }
@@ -1918,17 +2244,18 @@ buf_page_set_file_page_was_freed(
 	ulint	offset)	/*!< in: page number */
 {
 	buf_page_t*	bpage;
+	buf_pool_t*	buf_pool = buf_pool_get(space, offset);

-	buf_pool_mutex_enter();
+	buf_pool_mutex_enter(buf_pool);

-	bpage = buf_page_hash_get(space, offset);
+	bpage = buf_page_hash_get(buf_pool, space, offset);

 	if (bpage) {
-		ut_ad(!buf_pool_watch_is(bpage));
+		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
 		bpage->file_page_was_freed = TRUE;
 	}

-	buf_pool_mutex_exit();
+	buf_pool_mutex_exit(buf_pool);

 	return(bpage);
 }
@@ -1947,17 +2274,18 @@ buf_page_reset_file_page_was_freed(
 	ulint	offset)	/*!< in: page number */
 {
 	buf_page_t*	bpage;
+	buf_pool_t*	buf_pool = buf_pool_get(space, offset);

-	buf_pool_mutex_enter();
+	buf_pool_mutex_enter(buf_pool);

-	bpage = buf_page_hash_get(space, offset);
+	bpage = buf_page_hash_get(buf_pool, space, offset);

 	if (bpage) {
-		ut_ad(!buf_pool_watch_is(bpage));
+		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
 		bpage->file_page_was_freed = FALSE;
 	}

-	buf_pool_mutex_exit();
+	buf_pool_mutex_exit(buf_pool);

 	return(bpage);
 }
@@ -1984,6 +2312,7 @@ buf_page_get_zip(
 	mutex_t*	block_mutex;
 	ibool		must_read;
 	unsigned	access_time;
+	buf_pool_t*	buf_pool = buf_pool_get(space, offset);

 #ifndef UNIV_LOG_DEBUG
 	ut_ad(!ibuf_inside());
 #endif
 	buf_pool->stat.n_page_gets++;

 	for (;;) {
-		buf_pool_mutex_enter();
+		buf_pool_mutex_enter(buf_pool);
lookup:
-		bpage = buf_page_hash_get(space, offset);
+		bpage = buf_page_hash_get(buf_pool, space, offset);
 		if (bpage) {
-			ut_ad(!buf_pool_watch_is(bpage));
+			ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
 			break;
 		}

 		/* Page not in buf_pool: needs to be read from file */

-		buf_pool_mutex_exit();
+		buf_pool_mutex_exit(buf_pool);

 		buf_read_page(space, zip_size, offset);
@@ -2013,11 +2342,11 @@ lookup:
 	if (UNIV_UNLIKELY(!bpage->zip.data)) {
 		/* There is no compressed page. */
err_exit:
-		buf_pool_mutex_exit();
+		buf_pool_mutex_exit(buf_pool);
 		return(NULL);
 	}

-	ut_ad(!buf_pool_watch_is(bpage));
+	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));

 	switch (buf_page_get_state(bpage)) {
 	case BUF_BLOCK_NOT_USED:
@@ -2028,7 +2357,7 @@ err_exit:
 		break;
 	case BUF_BLOCK_ZIP_PAGE:
 	case BUF_BLOCK_ZIP_DIRTY:
-		block_mutex = &buf_pool_zip_mutex;
+		block_mutex = &buf_pool->zip_mutex;
 		mutex_enter(block_mutex);
 		bpage->buf_fix_count++;
 		goto got_block;
@@ -2056,7 +2385,7 @@ got_block:
 	must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ;
 	access_time = buf_page_is_accessed(bpage);

-	buf_pool_mutex_exit();
+	buf_pool_mutex_exit(buf_pool);

 	mutex_exit(block_mutex);
@@ -2186,13 +2515,16 @@ buf_zip_decompress(
 #ifndef UNIV_HOTBACKUP
 /*******************************************************************//**
-Gets the block to whose frame the pointer is pointing to.
-@return pointer to block, never NULL */
+Gets the block to whose frame the pointer is pointing to if found
+in this buffer pool instance.
+@return pointer to block */
UNIV_INTERN
 buf_block_t*
-buf_block_align(
-/*============*/
-	const byte*	ptr)	/*!< in: pointer to a frame */
+buf_block_align_instance(
+/*=====================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer in which the block
					resides */
+	const byte*	ptr)		/*!< in: pointer to a frame */
 {
 	buf_chunk_t*	chunk;
 	ulint		i;
@@ -2218,7 +2550,7 @@ buf_block_align_instance(
 			ut_ad(block->frame == page_align(ptr));
 #ifdef UNIV_DEBUG
 			/* A thread that updates these fields must
-			hold buf_pool_mutex and block->mutex.  Acquire
+			hold buf_pool->mutex and block->mutex.  Acquire
 			only the latter. */
 			mutex_enter(&block->mutex);
@@ -2267,6 +2599,30 @@ buf_block_align_instance(
 		}
 	}

+	return(NULL);
+}
+
+/*******************************************************************//**
+Gets the block to whose frame the pointer is pointing to.
+@return pointer to block, never NULL */
+UNIV_INTERN
+buf_block_t*
+buf_block_align(
+/*============*/
+	const byte*	ptr)	/*!< in: pointer to a frame */
+{
+	ulint	i;
+
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+		buf_block_t*	block;
+
+		block = buf_block_align_instance(
			buf_pool_from_array(i), ptr);
+		if (block) {
+			return(block);
+		}
+	}
+
 	/* The block should always be found. */
 	ut_error;
 	return(NULL);
 }

 /********************************************************************//**
It can be a pointer to -the buf_block_t itself or a member of it +the buf_block_t itself or a member of it. This functions checks one of +the buffer pool instances. @return TRUE if ptr belongs to a buf_block_t struct */ -UNIV_INTERN +static ibool -buf_pointer_is_block_field( -/*=======================*/ - const void* ptr) /*!< in: pointer not - dereferenced */ +buf_pointer_is_block_field_instance( +/*================================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + const void* ptr) /*!< in: pointer not dereferenced */ { const buf_chunk_t* chunk = buf_pool->chunks; const buf_chunk_t* const echunk = chunk + buf_pool->n_chunks; @@ -2302,23 +2659,49 @@ buf_pointer_is_block_field( } /********************************************************************//** +Find out if a pointer belongs to a buf_block_t. It can be a pointer to +the buf_block_t itself or a member of it +@return TRUE if ptr belongs to a buf_block_t struct */ +UNIV_INTERN +ibool +buf_pointer_is_block_field( +/*=======================*/ + const void* ptr) /*!< in: pointer not dereferenced */ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + ibool found; + + found = buf_pointer_is_block_field_instance( + buf_pool_from_array(i), ptr); + if (found) { + return(TRUE); + } + } + + return(FALSE); +} + +/********************************************************************//** Find out if a buffer block was created by buf_chunk_init(). @return TRUE if "block" has been added to buf_pool->free by buf_chunk_init() */ static ibool buf_block_is_uncompressed( /*======================*/ - const buf_block_t* block) /*!< in: pointer to block, - not dereferenced */ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + const buf_block_t* block) /*!< in: pointer to block, + not dereferenced */ { - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); if (UNIV_UNLIKELY((((ulint) block) % sizeof *block) != 0)) { /* The pointer should be aligned. */ return(FALSE); } - return(buf_pointer_is_block_field((void *)block)); + return(buf_pointer_is_block_field_instance(buf_pool, (void *)block)); } /********************************************************************//** @@ -2347,6 +2730,7 @@ buf_page_get_gen( ulint fix_type; ibool must_read; ulint retries = 0; + buf_pool_t* buf_pool = buf_pool_get(space, offset); ut_ad(mtr); ut_ad(mtr->state == MTR_ACTIVE); @@ -2367,7 +2751,7 @@ buf_page_get_gen( fold = buf_page_address_fold(space, offset); loop: block = guess; - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); if (block) { /* If the guess is a compressed page descriptor that @@ -2378,7 +2762,7 @@ loop: the guess may be pointing to a buffer pool chunk that has been released when resizing the buffer pool. 
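What makes such a stale guess safe to use is the validation just below: buf_block_is_uncompressed() requires the pointer to be aligned to sizeof(buf_block_t) and, via buf_pointer_is_block_field_instance(), to lie inside one of this instance's chunks. The chunk membership test is, in outline (a sketch only, not code from this patch; the buf_chunk_t field names are assumptions based on their use in this file):

	ibool
	chunk_contains(const buf_chunk_t* chunk, const void* ptr)
	{
		const buf_block_t*	block = (const buf_block_t*) ptr;

		/* TRUE iff ptr falls within this chunk's array of
		block descriptors; only then may it be dereferenced. */
		return(block >= chunk->blocks
		       && block < chunk->blocks + chunk->size);
	}

chunk_contains is an illustrative name, not a function introduced by the patch.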
*/ - if (!buf_block_is_uncompressed(block) + if (!buf_block_is_uncompressed(buf_pool, block) || offset != block->page.offset || space != block->page.space || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { @@ -2391,12 +2775,12 @@ loop: } if (block == NULL) { - block = (buf_block_t*) buf_page_hash_get_low(space, offset, - fold); + block = (buf_block_t*) buf_page_hash_get_low( + buf_pool, space, offset, fold); } loop2: - if (block && buf_pool_watch_is(&block->page)) { + if (block && buf_pool_watch_is_sentinel(buf_pool, &block->page)) { block = NULL; } @@ -2413,7 +2797,7 @@ loop2: } } - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); if (mode == BUF_GET_IF_IN_POOL || mode == BUF_GET_IF_IN_POOL_OR_WATCH) { @@ -2461,7 +2845,7 @@ got_block: /* The page is being read to buffer pool, but we cannot wait around for the read to complete. */ - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); return(NULL); } @@ -2477,40 +2861,42 @@ got_block: case BUF_BLOCK_ZIP_DIRTY: bpage = &block->page; /* Protect bpage->buf_fix_count. */ - mutex_enter(&buf_pool_zip_mutex); + mutex_enter(&buf_pool->zip_mutex); if (bpage->buf_fix_count || buf_page_get_io_fix(bpage) != BUF_IO_NONE) { /* This condition often occurs when the buffer is not buffer-fixed, but I/O-fixed by buf_page_init_for_read(). */ - mutex_exit(&buf_pool_zip_mutex); + mutex_exit(&buf_pool->zip_mutex); wait_until_unfixed: /* The block is buffer-fixed or I/O-fixed. Try again later. */ - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); os_thread_sleep(WAIT_FOR_READ); - + goto loop; } /* Allocate an uncompressed page. */ - buf_pool_mutex_exit(); - mutex_exit(&buf_pool_zip_mutex); + buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->zip_mutex); - block = buf_LRU_get_free_block(0); + block = buf_LRU_get_free_block(buf_pool, 0); ut_a(block); - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); mutex_enter(&block->mutex); { - buf_page_t* hash_bpage - = buf_page_hash_get_low(space, offset, fold); + buf_page_t* hash_bpage; + + hash_bpage = buf_page_hash_get_low( + buf_pool, space, offset, fold); if (UNIV_UNLIKELY(bpage != hash_bpage)) { /* The buf_pool->page_hash was modified - while buf_pool_mutex was released. + while buf_pool->mutex was released. Free the block that was allocated. */ buf_LRU_block_free_non_file_page(block); @@ -2526,7 +2912,7 @@ wait_until_unfixed: || buf_page_get_io_fix(bpage) != BUF_IO_NONE)) { /* The block was buffer-fixed or I/O-fixed - while buf_pool_mutex was not held by this thread. + while buf_pool->mutex was not held by this thread. Free the block that was allocated and try again. This should be extremely unlikely. */ @@ -2539,7 +2925,7 @@ wait_until_unfixed: /* Move the compressed page from bpage to block, and uncompress it. */ - mutex_enter(&buf_pool_zip_mutex); + mutex_enter(&buf_pool->zip_mutex); buf_relocate(bpage, &block->page); buf_block_init_low(block); @@ -2574,15 +2960,15 @@ wait_until_unfixed: UNIV_MEM_INVALID(bpage, sizeof *bpage); mutex_exit(&block->mutex); - mutex_exit(&buf_pool_zip_mutex); + mutex_exit(&buf_pool->zip_mutex); buf_pool->n_pend_unzip++; - buf_buddy_free(bpage, sizeof *bpage); + buf_buddy_free(buf_pool, bpage, sizeof *bpage); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); /* Decompress the page and apply buffered operations - while not holding buf_pool_mutex or block->mutex. */ + while not holding buf_pool->mutex or block->mutex. 
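The code above is one instance of the release-and-revalidate discipline used throughout this function: any potentially blocking step (getting a free block, decompressing, merging buffered changes) runs with the instance mutex released, and every pointer obtained from the page_hash is looked up again once the mutex is retaken. Schematically (a sketch, not code from the patch; do_blocking_work() is a placeholder):

	buf_pool_mutex_exit(buf_pool);

	do_blocking_work();		/* may sleep or do I/O */

	buf_pool_mutex_enter(buf_pool);

	/* The pool may have changed while unlocked: repeat the
	lookup before trusting any previously fetched pointer. */
	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);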
*/ success = buf_zip_decompress(block, srv_use_checksums); if (UNIV_LIKELY(success && !recv_no_ibuf_operations)) { @@ -2591,7 +2977,7 @@ wait_until_unfixed: } /* Unfix and unlatch the block. */ - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); mutex_enter(&block->mutex); block->page.buf_fix_count--; buf_block_set_io_fix(block, BUF_IO_NONE); @@ -2601,7 +2987,7 @@ wait_until_unfixed: if (UNIV_UNLIKELY(!success)) { - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); return(NULL); } @@ -2629,7 +3015,7 @@ wait_until_unfixed: access_time = buf_page_is_accessed(&block->page); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); buf_page_set_accessed_make_young(&block->page, access_time); @@ -2714,6 +3100,7 @@ buf_page_optimistic_get( ulint line, /*!< in: line where called */ mtr_t* mtr) /*!< in: mini-transaction */ { + buf_pool_t* buf_pool; unsigned access_time; ibool success; ulint fix_type; @@ -2807,6 +3194,7 @@ buf_page_optimistic_get( ut_a(ibuf_count_get(buf_block_get_space(block), buf_block_get_page_no(block)) == 0); #endif + buf_pool = buf_pool_from_block(block); buf_pool->stat.n_page_gets++; return(TRUE); @@ -2828,6 +3216,7 @@ buf_page_get_known_nowait( ulint line, /*!< in: line where called */ mtr_t* mtr) /*!< in: mini-transaction */ { + buf_pool_t* buf_pool; ibool success; ulint fix_type; @@ -2856,10 +3245,12 @@ buf_page_get_known_nowait( mutex_exit(&block->mutex); + buf_pool = buf_pool_from_block(block); + if (mode == BUF_MAKE_YOUNG && buf_page_peek_if_too_old(&block->page)) { - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); buf_LRU_make_block_young(&block->page); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); } else if (!buf_page_is_accessed(&block->page)) { /* Above, we do a dirty read on purpose, to avoid mutex contention. The field buf_page_t::access_time @@ -2867,9 +3258,9 @@ buf_page_get_known_nowait( field must be protected by mutex, however. 
*/ ulint time_ms = ut_time_ms(); - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); buf_page_set_accessed(&block->page, time_ms); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); } ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD)); @@ -2931,22 +3322,23 @@ buf_page_try_get_func( buf_block_t* block; ibool success; ulint fix_type; + buf_pool_t* buf_pool = buf_pool_get(space_id, page_no); ut_ad(mtr); ut_ad(mtr->state == MTR_ACTIVE); - buf_pool_mutex_enter(); - block = buf_block_hash_get(space_id, page_no); + buf_pool_mutex_enter(buf_pool); + block = buf_block_hash_get(buf_pool, space_id, page_no); if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); return(NULL); } - ut_ad(!buf_pool_watch_is(&block->page)); + ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page)); mutex_enter(&block->mutex); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); @@ -3033,8 +3425,9 @@ buf_page_init( buf_block_t* block) /*!< in: block to init */ { buf_page_t* hash_page; + buf_pool_t* buf_pool = buf_pool_get(space, offset); - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(mutex_own(&(block->mutex))); ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE); @@ -3052,21 +3445,22 @@ buf_page_init( buf_block_init_low(block); - block->lock_hash_val = lock_rec_hash(space, offset); + block->lock_hash_val = lock_rec_hash(space, offset); buf_page_init_low(&block->page); /* Insert into the hash table of file pages */ - hash_page = buf_page_hash_get_low(space, offset, fold); + hash_page = buf_page_hash_get_low(buf_pool, space, offset, fold); if (UNIV_LIKELY(!hash_page)) { - } else if (UNIV_LIKELY(buf_pool_watch_is(hash_page))) { + } else if (buf_pool_watch_is_sentinel(buf_pool, hash_page)) { /* Preserve the reference count. 
*/ ulint buf_fix_count = hash_page->buf_fix_count; + ut_a(buf_fix_count > 0); block->page.buf_fix_count += buf_fix_count; - buf_pool_watch_remove(fold, hash_page); + buf_pool_watch_remove(buf_pool, fold, hash_page); } else { fprintf(stderr, "InnoDB: Error: page %lu %lu already found" @@ -3076,7 +3470,7 @@ buf_page_init( (const void*) hash_page, (const void*) block); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG mutex_exit(&block->mutex); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); buf_print(); buf_LRU_print(); buf_validate(); @@ -3111,7 +3505,8 @@ buf_page_init_for_read( ulint space, /*!< in: space id */ ulint zip_size,/*!< in: compressed page size, or 0 */ ibool unzip, /*!< in: TRUE=request uncompressed page */ - ib_int64_t tablespace_version,/*!< in: prevents reading from a wrong + ib_int64_t tablespace_version, + /*!< in: prevents reading from a wrong version of the tablespace in case we have done DISCARD + IMPORT */ ulint offset) /*!< in: page number */ @@ -3123,6 +3518,7 @@ buf_page_init_for_read( ulint fold; ibool lru = FALSE; void* data; + buf_pool_t* buf_pool = buf_pool_get(space, offset); ut_ad(buf_pool); @@ -3151,16 +3547,17 @@ buf_page_init_for_read( && UNIV_LIKELY(!recv_recovery_is_on())) { block = NULL; } else { - block = buf_LRU_get_free_block(0); + block = buf_LRU_get_free_block(buf_pool, 0); ut_ad(block); + ut_ad(buf_pool_from_block(block) == buf_pool); } fold = buf_page_address_fold(space, offset); - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); - watch_page = buf_page_hash_get_low(space, offset, fold); - if (watch_page && !buf_pool_watch_is(watch_page)) { + watch_page = buf_page_hash_get_low(buf_pool, space, offset, fold); + if (watch_page && !buf_pool_watch_is_sentinel(buf_pool, watch_page)) { /* The page is already in the buffer pool. */ watch_page = NULL; err_exit: @@ -3187,6 +3584,8 @@ err_exit: bpage = &block->page; mutex_enter(&block->mutex); + ut_ad(buf_pool_from_bpage(bpage) == buf_pool); + buf_page_init(space, offset, fold, block); /* The block must be put to the LRU list, to the old blocks */ @@ -3207,16 +3606,16 @@ err_exit: if (UNIV_UNLIKELY(zip_size)) { page_zip_set_size(&block->page.zip, zip_size); - /* buf_pool_mutex may be released and + /* buf_pool->mutex may be released and reacquired by buf_buddy_alloc(). Thus, we must release block->mutex in order not to break the latching order in the reacquisition - of buf_pool_mutex. We also must defer this + of buf_pool->mutex. We also must defer this operation until after the block descriptor has been added to buf_pool->LRU and buf_pool->page_hash. */ mutex_exit(&block->mutex); - data = buf_buddy_alloc(zip_size, &lru); + data = buf_buddy_alloc(buf_pool, zip_size, &lru); mutex_enter(&block->mutex); block->page.zip.data = data; @@ -3240,21 +3639,28 @@ err_exit: control block (bpage), in order to avoid the invocation of buf_buddy_relocate_block() on uninitialized data. */ - data = buf_buddy_alloc(zip_size, &lru); - bpage = buf_buddy_alloc(sizeof *bpage, &lru); + data = buf_buddy_alloc(buf_pool, zip_size, &lru); + bpage = buf_buddy_alloc(buf_pool, sizeof *bpage, &lru); + + /* Initialize the buf_pool pointer. */ + bpage->buf_pool = buf_pool; /* If buf_buddy_alloc() allocated storage from the LRU list, - it released and reacquired buf_pool_mutex. Thus, we must + it released and reacquired buf_pool->mutex. Thus, we must check the page_hash again, as it may have been modified. 
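A related detail sits a few lines up: the descriptor gets an explicit back-pointer, bpage->buf_pool = buf_pool. A compressed-only BUF_BLOCK_ZIP_PAGE descriptor is buddy-allocated rather than embedded in a chunk, so its owning instance cannot be recovered by address arithmetic the way buf_block_align() recovers it for full blocks. Presumably buf_pool_from_bpage(), which is defined outside this hunk, just reads the field back, along these lines:

	buf_pool_t*
	buf_pool_from_bpage(const buf_page_t* bpage)
	{
		/* Sketch only: return the instance recorded when
		the descriptor was created. */
		return(bpage->buf_pool);
	}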
*/ if (UNIV_UNLIKELY(lru)) { - watch_page = buf_page_hash_get_low(space, offset, fold); - if (UNIV_UNLIKELY - (watch_page && !buf_pool_watch_is(watch_page))) { + + watch_page = buf_page_hash_get_low( + buf_pool, space, offset, fold); + + if (watch_page + && !buf_pool_watch_is_sentinel(buf_pool, + watch_page)) { /* The block was added by some other thread. */ watch_page = NULL; - buf_buddy_free(bpage, sizeof *bpage); - buf_buddy_free(data, zip_size); + buf_buddy_free(buf_pool, bpage, sizeof *bpage); + buf_buddy_free(buf_pool, data, zip_size); bpage = NULL; goto func_exit; @@ -3265,7 +3671,7 @@ err_exit: page_zip_set_size(&bpage->zip, zip_size); bpage->zip.data = data; - mutex_enter(&buf_pool_zip_mutex); + mutex_enter(&buf_pool->zip_mutex); UNIV_MEM_DESC(bpage->zip.data, page_zip_get_size(&bpage->zip), bpage); @@ -3291,8 +3697,8 @@ err_exit: ulint buf_fix_count = watch_page->buf_fix_count; ut_a(buf_fix_count > 0); bpage->buf_fix_count += buf_fix_count; - ut_ad(buf_pool_watch_is(watch_page)); - buf_pool_watch_remove(fold, watch_page); + ut_ad(buf_pool_watch_is_sentinel(buf_pool, watch_page)); + buf_pool_watch_remove(buf_pool, fold, watch_page); } HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, @@ -3304,12 +3710,12 @@ err_exit: buf_page_set_io_fix(bpage, BUF_IO_READ); - mutex_exit(&buf_pool_zip_mutex); + mutex_exit(&buf_pool->zip_mutex); } buf_pool->n_pend_reads++; func_exit: - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); if (mode == BUF_READ_IBUF_PAGES_ONLY) { @@ -3338,24 +3744,27 @@ buf_page_create( { buf_frame_t* frame; buf_block_t* block; + ulint fold; buf_block_t* free_block = NULL; ulint time_ms = ut_time_ms(); - ulint fold; + buf_pool_t* buf_pool = buf_pool_get(space, offset); ut_ad(mtr); ut_ad(mtr->state == MTR_ACTIVE); ut_ad(space || !zip_size); - free_block = buf_LRU_get_free_block(0); + free_block = buf_LRU_get_free_block(buf_pool, 0); fold = buf_page_address_fold(space, offset); - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); - block = (buf_block_t*) buf_page_hash_get_low(space, offset, fold); + block = (buf_block_t*) buf_page_hash_get_low( + buf_pool, space, offset, fold); - if (block && buf_page_in_file(&block->page) - && !buf_pool_watch_is(&block->page)) { + if (block + && buf_page_in_file(&block->page) + && !buf_pool_watch_is_sentinel(buf_pool, &block->page)) { #ifdef UNIV_IBUF_COUNT_DEBUG ut_a(ibuf_count_get(space, offset) == 0); #endif @@ -3364,7 +3773,7 @@ buf_page_create( #endif /* UNIV_DEBUG_FILE_ACCESSES */ /* Page can be found in buf_pool */ - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); buf_block_free(free_block); @@ -3398,7 +3807,7 @@ buf_page_create( ibool lru; /* Prevent race conditions during buf_buddy_alloc(), - which may release and reacquire buf_pool_mutex, + which may release and reacquire buf_pool->mutex, by IO-fixing and X-latching the block. */ buf_page_set_io_fix(&block->page, BUF_IO_READ); @@ -3406,13 +3815,13 @@ buf_page_create( page_zip_set_size(&block->page.zip, zip_size); mutex_exit(&block->mutex); - /* buf_pool_mutex may be released and reacquired by + /* buf_pool->mutex may be released and reacquired by buf_buddy_alloc(). Thus, we must release block->mutex in order not to break the latching order in - the reacquisition of buf_pool_mutex. We also must + the reacquisition of buf_pool->mutex. We also must defer this operation until after the block descriptor has been added to buf_pool->LRU and buf_pool->page_hash. 
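The order being preserved is: buf_pool->mutex first, block->mutex second. Holding block->mutex while buf_buddy_alloc() internally releases and retakes buf_pool->mutex would invert that order and could deadlock against a thread locking in the normal sequence, hence the mutex_exit()/mutex_enter() bracket around the allocation. In outline:

	mutex_exit(&block->mutex);	/* give up the inner latch */

	/* May release and reacquire buf_pool->mutex internally. */
	data = buf_buddy_alloc(buf_pool, zip_size, &lru);

	mutex_enter(&block->mutex);	/* retake in the legal order */
	block->page.zip.data = data;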
*/ - data = buf_buddy_alloc(zip_size, &lru); + data = buf_buddy_alloc(buf_pool, zip_size, &lru); mutex_enter(&block->mutex); block->page.zip.data = data; @@ -3430,7 +3839,7 @@ buf_page_create( buf_page_set_accessed(&block->page, time_ms); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX); @@ -3442,7 +3851,7 @@ buf_page_create( ibuf_merge_or_delete_for_page(NULL, space, offset, zip_size, TRUE); /* Flush pages from the end of the LRU list if necessary */ - buf_flush_free_margin(); + buf_flush_free_margin(buf_pool); frame = block->frame; @@ -3478,6 +3887,7 @@ buf_page_io_complete( buf_page_t* bpage) /*!< in: pointer to the block in question */ { enum buf_io_fix io_type; + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); const ibool uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); @@ -3613,7 +4023,7 @@ corrupt: } } - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); mutex_enter(buf_page_get_mutex(bpage)); #ifdef UNIV_IBUF_COUNT_DEBUG @@ -3677,22 +4087,57 @@ corrupt: #endif /* UNIV_DEBUG */ mutex_exit(buf_page_get_mutex(bpage)); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); } /*********************************************************************//** -Invalidates the file pages in the buffer pool when an archive recovery is -completed. All the file pages buffered must be in a replaceable state when -this function is called: not latched and not modified. */ -UNIV_INTERN +Asserts that all file pages in the buffer are in a replaceable state. +@return TRUE */ +static +ibool +buf_all_freed_instance( +/*===================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool instance */ +{ + ulint i; + buf_chunk_t* chunk; + + ut_ad(buf_pool); + + buf_pool_mutex_enter(buf_pool); + + chunk = buf_pool->chunks; + + for (i = buf_pool->n_chunks; i--; chunk++) { + + const buf_block_t* block = buf_chunk_not_freed(chunk); + + if (UNIV_LIKELY_NULL(block)) { + fprintf(stderr, + "Page %lu %lu still fixed or dirty\n", + (ulong) block->page.space, + (ulong) block->page.offset); + ut_error; + } + } + + buf_pool_mutex_exit(buf_pool); + + return(TRUE); +} + +/*********************************************************************//** +Invalidates file pages in one buffer pool instance. */ +static void -buf_pool_invalidate(void) -/*=====================*/ +buf_pool_invalidate_instance( +/*=========================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool instance */ { ibool freed; enum buf_flush i; - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) { @@ -3708,23 +4153,23 @@ buf_pool_invalidate(void) pool invalidation to proceed we must ensure there is NO write activity happening.
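The wait loop below handles that: if a flush batch is running, the mutex is released, the thread blocks in buf_flush_wait_batch_end(), and the check is repeated. Note also that this pair of functions shows the refactoring idiom the whole patch follows: the old global function body becomes a static per-instance helper taking buf_pool_t*, while the public symbol keeps its name and signature and becomes a loop over the instances. As a pattern (buf_do_x is a placeholder name, not a function in the patch):

	void
	buf_do_x(void)			/* public API, unchanged */
	{
		ulint	i;

		for (i = 0; i < srv_buf_pool_instances; i++) {
			buf_do_x_instance(buf_pool_from_array(i));
		}
	}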
*/ if (buf_pool->n_flush[i] > 0) { - buf_pool_mutex_exit(); - buf_flush_wait_batch_end(i); - buf_pool_mutex_enter(); + buf_pool_mutex_exit(buf_pool); + buf_flush_wait_batch_end(buf_pool, i); + buf_pool_mutex_enter(buf_pool); } } - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); - ut_ad(buf_all_freed()); + ut_ad(buf_all_freed_instance(buf_pool)); freed = TRUE; while (freed) { - freed = buf_LRU_search_and_free_block(100); + freed = buf_LRU_search_and_free_block(buf_pool, 100); } - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0); ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0); @@ -3735,19 +4180,36 @@ buf_pool_invalidate(void) buf_pool->LRU_flush_ended = 0; memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat)); - buf_refresh_io_stats(); + buf_refresh_io_stats(buf_pool); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); +} + +/*********************************************************************//** +Invalidates the file pages in the buffer pool when an archive recovery is +completed. All the file pages buffered must be in a replaceable state when +this function is called: not latched and not modified. */ +UNIV_INTERN +void +buf_pool_invalidate(void) +/*=====================*/ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_invalidate_instance(buf_pool_from_array(i)); + } } #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /*********************************************************************//** -Validates the buffer buf_pool data structure. +Validates data in one buffer pool instance @return TRUE */ -UNIV_INTERN +static ibool -buf_validate(void) -/*==============*/ +buf_pool_validate_instance( +/*=======================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool instance */ { buf_page_t* b; buf_chunk_t* chunk; @@ -3762,7 +4224,7 @@ buf_validate(void) ut_ad(buf_pool); - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); chunk = buf_pool->chunks; @@ -3787,7 +4249,8 @@ buf_validate(void) break; case BUF_BLOCK_FILE_PAGE: - ut_a(buf_page_hash_get(buf_block_get_space( + ut_a(buf_page_hash_get(buf_pool, + buf_block_get_space( block), buf_block_get_page_no( block)) @@ -3851,7 +4314,7 @@ buf_validate(void) } } - mutex_enter(&buf_pool_zip_mutex); + mutex_enter(&buf_pool->zip_mutex); /* Check clean compressed-only blocks. */ @@ -3874,10 +4337,10 @@ buf_validate(void) } /* It is OK to read oldest_modification here because - we have acquired buf_pool_zip_mutex above which acts + we have acquired buf_pool->zip_mutex above which acts as the 'block->mutex' for these bpages. */ ut_a(!b->oldest_modification); - ut_a(buf_page_hash_get(b->space, b->offset) == b); + ut_a(buf_page_hash_get(buf_pool, b->space, b->offset) == b); n_lru++; n_zip++; @@ -3885,7 +4348,7 @@ buf_validate(void) /* Check dirty blocks. 
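They are walked under the new per-instance flush list mutex. One invariant worth stating explicitly: the flush list is ordered by oldest_modification, largest first, so a validator may assert monotonicity as it walks. A sketch of that check (illustrative only; the real loop below also verifies block states and counts pages per flush type):

	const buf_page_t*	b;
	const buf_page_t*	next;

	for (b = UT_LIST_GET_FIRST(buf_pool->flush_list);
	     b != NULL;
	     b = next) {

		next = UT_LIST_GET_NEXT(list, b);

		/* oldest_modification never increases down the list. */
		ut_a(!next
		     || b->oldest_modification >= next->oldest_modification);
	}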
*/ - buf_flush_list_mutex_enter(); + buf_flush_list_mutex_enter(buf_pool); for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; b = UT_LIST_GET_NEXT(list, b)) { ut_ad(b->in_flush_list); @@ -3929,14 +4392,14 @@ buf_validate(void) ut_error; break; } - ut_a(buf_page_hash_get(b->space, b->offset) == b); + ut_a(buf_page_hash_get(buf_pool, b->space, b->offset) == b); } ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush); - buf_flush_list_mutex_exit(); + buf_flush_list_mutex_exit(buf_pool); - mutex_exit(&buf_pool_zip_mutex); + mutex_exit(&buf_pool->zip_mutex); if (n_lru + n_free > buf_pool->curr_size + n_zip) { fprintf(stderr, "n LRU %lu, n free %lu, pool %lu zip %lu\n", @@ -3957,22 +4420,44 @@ buf_validate(void) ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush); ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); ut_a(buf_LRU_validate()); - ut_a(buf_flush_validate()); + ut_a(buf_flush_validate(buf_pool)); + + return(TRUE); +} + +/*********************************************************************//** +Validates the buffer buf_pool data structure. +@return TRUE */ +UNIV_INTERN +ibool +buf_validate(void) +/*==============*/ +{ + ulint i; + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + buf_pool_validate_instance(buf_pool); + } return(TRUE); } + #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ #if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /*********************************************************************//** -Prints info of the buffer buf_pool data structure. */ -UNIV_INTERN +Prints info of the buffer buf_pool data structure for one instance. */ +static void -buf_print(void) -/*===========*/ +buf_print_instance( +/*===============*/ + buf_pool_t* buf_pool) { dulint* index_ids; ulint* counts; @@ -3991,8 +4476,8 @@ buf_print(void) index_ids = mem_alloc(sizeof(dulint) * size); counts = mem_alloc(sizeof(ulint) * size); - buf_pool_mutex_enter(); - buf_flush_list_mutex_enter(); + buf_pool_mutex_enter(buf_pool); + buf_flush_list_mutex_enter(buf_pool); fprintf(stderr, "buf_pool size %lu\n" @@ -4019,7 +4504,7 @@ buf_print(void) (ulong) buf_pool->stat.n_pages_created, (ulong) buf_pool->stat.n_pages_written); - buf_flush_list_mutex_exit(); + buf_flush_list_mutex_exit(buf_pool); /* Count the number of blocks belonging to each index in the buffer */ @@ -4061,7 +4546,7 @@ buf_print(void) } } - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); for (i = 0; i < n_found; i++) { index = dict_index_get_if_in_cache(index_ids[i]); @@ -4082,7 +4567,24 @@ buf_print(void) mem_free(index_ids); mem_free(counts); - ut_a(buf_validate()); + ut_a(buf_pool_validate_instance(buf_pool)); +} + +/*********************************************************************//** +Prints info of the buffer buf_pool data structure. */ +UNIV_INTERN +void +buf_print(void) +/*===========*/ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + buf_print_instance(buf_pool); + } } #endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ @@ -4092,15 +4594,16 @@ Returns the number of latched pages in the buffer pool. 
@return number of latched pages */ UNIV_INTERN ulint -buf_get_latched_pages_number(void) -/*==============================*/ +buf_get_latched_pages_number_instance( +/*==================================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool instance */ { - buf_chunk_t* chunk; buf_page_t* b; ulint i; + buf_chunk_t* chunk; ulint fixed_pages_number = 0; - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); chunk = buf_pool->chunks; @@ -4129,7 +4632,7 @@ buf_get_latched_pages_number(void) } } - mutex_enter(&buf_pool_zip_mutex); + mutex_enter(&buf_pool->zip_mutex); /* Traverse the lists of clean and dirty compressed-only blocks. */ @@ -4144,7 +4647,7 @@ buf_get_latched_pages_number(void) } } - buf_flush_list_mutex_enter(); + buf_flush_list_mutex_enter(buf_pool); for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; b = UT_LIST_GET_NEXT(list, b)) { ut_ad(b->in_flush_list); @@ -4170,12 +4673,36 @@ buf_get_latched_pages_number(void) } } - buf_flush_list_mutex_exit(); - mutex_exit(&buf_pool_zip_mutex); - buf_pool_mutex_exit(); + buf_flush_list_mutex_exit(buf_pool); + mutex_exit(&buf_pool->zip_mutex); + buf_pool_mutex_exit(buf_pool); return(fixed_pages_number); } + +/*********************************************************************//** +Returns the number of latched pages in all the buffer pools. +@return number of latched pages */ +UNIV_INTERN +ulint +buf_get_latched_pages_number(void) +/*==============================*/ +{ + ulint i; + ulint total_latched_pages = 0; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + total_latched_pages += buf_get_latched_pages_number_instance( + buf_pool); + } + + return(total_latched_pages); +} + #endif /* UNIV_DEBUG */ /*********************************************************************//** @@ -4186,10 +4713,22 @@ ulint buf_get_n_pending_ios(void) /*=======================*/ { - return(buf_pool->n_pend_reads - + buf_pool->n_flush[BUF_FLUSH_LRU] - + buf_pool->n_flush[BUF_FLUSH_LIST] - + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]); + ulint i; + ulint pend_ios = 0; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + pend_ios += + buf_pool->n_pend_reads + + buf_pool->n_flush[BUF_FLUSH_LRU] + + buf_pool->n_flush[BUF_FLUSH_LIST] + + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]; + } + + return(pend_ios); } /*********************************************************************//** @@ -4201,13 +4740,15 @@ ulint buf_get_modified_ratio_pct(void) /*============================*/ { - ulint ratio; + ulint ratio; + ulint lru_len = 0; + ulint free_len = 0; + ulint flush_list_len = 0; - /* This is for heuristics. No need to grab any mutex here. */ - ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list)) - / (1 + UT_LIST_GET_LEN(buf_pool->LRU) - + UT_LIST_GET_LEN(buf_pool->free)); + buf_get_total_list_len(&lru_len, &free_len, &flush_list_len); + ratio = (100 * flush_list_len) / (1 + lru_len + free_len); + /* 1 + is there to avoid division by zero */ return(ratio); @@ -4217,9 +4758,10 @@ buf_get_modified_ratio_pct(void) Prints info of the buffer i/o. 
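(One helper referenced above but not shown in this hunk: buf_get_modified_ratio_pct() now calls buf_get_total_list_len(), which presumably just sums the three list lengths over all instances, mutex-free, since the result only feeds a heuristic:

	void
	buf_get_total_list_len(
		ulint*	lru_len,
		ulint*	free_len,
		ulint*	flush_list_len)
	{
		ulint	i;

		*lru_len = 0;
		*free_len = 0;
		*flush_list_len = 0;

		for (i = 0; i < srv_buf_pool_instances; i++) {
			buf_pool_t*	buf_pool = buf_pool_from_array(i);

			*lru_len += UT_LIST_GET_LEN(buf_pool->LRU);
			*free_len += UT_LIST_GET_LEN(buf_pool->free);
			*flush_list_len
				+= UT_LIST_GET_LEN(buf_pool->flush_list);
		}
	}

For example, 200 dirty pages against 700 LRU pages and 99 free pages gives 100 * 200 / (1 + 700 + 99) = 25 per cent modified.)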
*/ UNIV_INTERN void -buf_print_io( -/*=========*/ - FILE* file) /*!< in/out: buffer where to print */ +buf_print_io_instance( +/*==================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + FILE* file) /*!< in/out: buffer where to print */ { time_t current_time; double time_elapsed; @@ -4227,8 +4769,8 @@ buf_print_io( ut_ad(buf_pool); - buf_pool_mutex_enter(); - buf_flush_list_mutex_enter(); + buf_pool_mutex_enter(buf_pool); + buf_flush_list_mutex_enter(buf_pool); fprintf(file, "Buffer pool size %lu\n" @@ -4250,7 +4792,7 @@ buf_print_io( + buf_pool->init_flush[BUF_FLUSH_LIST], (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]); - buf_flush_list_mutex_exit(); + buf_flush_list_mutex_exit(buf_pool); current_time = time(NULL); time_elapsed = 0.001 + difftime(current_time, @@ -4282,7 +4824,8 @@ buf_print_io( - buf_pool->old_stat.n_pages_written) / time_elapsed); - n_gets_diff = buf_pool->stat.n_page_gets - buf_pool->old_stat.n_page_gets; + n_gets_diff = buf_pool->stat.n_page_gets + - buf_pool->old_stat.n_page_gets; if (n_gets_diff) { fprintf(file, @@ -4326,56 +4869,81 @@ buf_print_io( buf_LRU_stat_sum.io, buf_LRU_stat_cur.io, buf_LRU_stat_sum.unzip, buf_LRU_stat_cur.unzip); - buf_refresh_io_stats(); - buf_pool_mutex_exit(); + buf_refresh_io_stats(buf_pool); + buf_pool_mutex_exit(buf_pool); +} + +/*********************************************************************//** +Prints info of the buffer i/o. */ +UNIV_INTERN +void +buf_print_io( +/*=========*/ + FILE* file) /*!< in/out: buffer where to print */ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + buf_print_io_instance(buf_pool, file); + } } /**********************************************************************//** Refreshes the statistics used to print per-second averages. */ UNIV_INTERN void -buf_refresh_io_stats(void) -/*======================*/ +buf_refresh_io_stats( +/*=================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool instance */ { - buf_pool->last_printout_time = time(NULL); + buf_pool->last_printout_time = ut_time(); buf_pool->old_stat = buf_pool->stat; } -/*********************************************************************//** -Asserts that all file pages in the buffer are in a replaceable state. -@return TRUE */ +/**********************************************************************//** +Refreshes the statistics used to print per-second averages. */ UNIV_INTERN -ibool -buf_all_freed(void) -/*===============*/ +void +buf_refresh_io_stats_all(void) +/*==========================*/ { - buf_chunk_t* chunk; ulint i; - ut_ad(buf_pool); + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; - buf_pool_mutex_enter(); + buf_pool = buf_pool_from_array(i); - chunk = buf_pool->chunks; + buf_refresh_io_stats(buf_pool); + } +} - for (i = buf_pool->n_chunks; i--; chunk++) { +/**********************************************************************//** +Check if all pages in all buffer pools are in a replacable state. 
+@return FALSE if not */ +UNIV_INTERN +ibool +buf_all_freed(void) +/*===============*/ +{ + ulint i; - const buf_block_t* block = buf_chunk_not_freed(chunk); + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; - if (UNIV_LIKELY_NULL(block)) { - fprintf(stderr, - "Page %lu %lu still fixed or dirty\n", - (ulong) block->page.space, - (ulong) block->page.offset); - ut_error; - } - } + buf_pool = buf_pool_from_array(i); - buf_pool_mutex_exit(); + if (!buf_all_freed_instance(buf_pool)) { + return(FALSE); + } + } return(TRUE); } - + /*********************************************************************//** Checks that there currently are no pending i/o-operations for the buffer pool. @@ -4385,23 +4953,32 @@ ibool buf_pool_check_no_pending_io(void) /*==============================*/ { - ibool ret; + ulint i; + ibool ret = TRUE; - buf_pool_mutex_enter(); + buf_pool_mutex_enter_all(); - if (buf_pool->n_pend_reads + buf_pool->n_flush[BUF_FLUSH_LRU] - + buf_pool->n_flush[BUF_FLUSH_LIST] - + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]) { - ret = FALSE; - } else { - ret = TRUE; + for (i = 0; i < srv_buf_pool_instances && ret; i++) { + const buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + if (buf_pool->n_pend_reads + + buf_pool->n_flush[BUF_FLUSH_LRU] + + buf_pool->n_flush[BUF_FLUSH_LIST] + + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]) { + + ret = FALSE; + } } - buf_pool_mutex_exit(); + buf_pool_mutex_exit_all(); return(ret); } +#if 0 +Code currently not used /*********************************************************************//** Gets the current length of the free list of buffer blocks. @return length of the free list */ @@ -4412,14 +4989,16 @@ buf_get_free_list_len(void) { ulint len; - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); len = UT_LIST_GET_LEN(buf_pool->free); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); return(len); } +#endif + #else /* !UNIV_HOTBACKUP */ /********************************************************************//** Inits a page to the buffer buf_pool, for use in ibbackup --restore. */ diff --git a/storage/innobase/buf/buf0flu.c b/storage/innobase/buf/buf0flu.c index 847f8dd9452..bb126a35867 100644 --- a/storage/innobase/buf/buf0flu.c +++ b/storage/innobase/buf/buf0flu.c @@ -83,8 +83,9 @@ Validates the flush list. @return TRUE if ok */ static ibool -buf_flush_validate_low(void); -/*========================*/ +buf_flush_validate_low( +/*===================*/ + buf_pool_t* buf_pool); /*!< in: Buffer pool instance */ #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ /******************************************************************//** @@ -98,11 +99,12 @@ buf_flush_insert_in_flush_rbt( /*==========================*/ buf_page_t* bpage) /*!< in: bpage to be inserted. */ { - buf_page_t* prev = NULL; const ib_rbt_node_t* c_node; const ib_rbt_node_t* p_node; + buf_page_t* prev = NULL; + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_flush_list_mutex_own()); + ut_ad(buf_flush_list_mutex_own(buf_pool)); /* Insert this buffer into the rbt. */ c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage); @@ -127,10 +129,10 @@ buf_flush_delete_from_flush_rbt( /*============================*/ buf_page_t* bpage) /*!< in: bpage to be removed. 
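(On buf_pool_check_no_pending_io() above: buf_pool_mutex_enter_all() and buf_pool_mutex_exit_all() are not defined in this hunk; from their use they must take, respectively release, the mutex of every instance. A sketch of the enter side, assuming the instances are locked in ascending array order so that two concurrent callers cannot deadlock:

	void
	buf_pool_mutex_enter_all(void)
	{
		ulint	i;

		/* Fixed ascending order: no lock-order cycles. */
		for (i = 0; i < srv_buf_pool_instances; i++) {
			buf_pool_mutex_enter(buf_pool_from_array(i));
		}
	}
)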
*/ { + ibool ret = FALSE; + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ibool ret = FALSE; - - ut_ad(buf_flush_list_mutex_own()); + ut_ad(buf_flush_list_mutex_own(buf_pool)); ret = rbt_delete(buf_pool->flush_rbt, &bpage); ut_ad(ret); @@ -156,22 +158,21 @@ buf_flush_block_cmp( int ret; const buf_page_t* b1 = *(const buf_page_t**) p1; const buf_page_t* b2 = *(const buf_page_t**) p2; +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(b1); +#endif /* UNIV_DEBUG */ ut_ad(b1 != NULL); ut_ad(b2 != NULL); - ut_ad(buf_flush_list_mutex_own()); + ut_ad(buf_flush_list_mutex_own(buf_pool)); ut_ad(b1->in_flush_list); ut_ad(b2->in_flush_list); - if (b2->oldest_modification - > b1->oldest_modification) { + if (b2->oldest_modification > b1->oldest_modification) { return(1); - } - - if (b2->oldest_modification - < b1->oldest_modification) { + } else if (b2->oldest_modification < b1->oldest_modification) { return(-1); } @@ -191,12 +192,21 @@ void buf_flush_init_flush_rbt(void) /*==========================*/ { - buf_flush_list_mutex_enter(); + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); - /* Create red black tree for speedy insertions in flush list. */ - buf_pool->flush_rbt = rbt_create(sizeof(buf_page_t*), - buf_flush_block_cmp); - buf_flush_list_mutex_exit(); + buf_flush_list_mutex_enter(buf_pool); + + /* Create red black tree for speedy insertions in flush list. */ + buf_pool->flush_rbt = rbt_create( + sizeof(buf_page_t*), buf_flush_block_cmp); + + buf_flush_list_mutex_exit(buf_pool); + } } /********************************************************************//** @@ -206,16 +216,24 @@ void buf_flush_free_flush_rbt(void) /*==========================*/ { - buf_flush_list_mutex_enter(); + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + buf_flush_list_mutex_enter(buf_pool); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - ut_a(buf_flush_validate_low()); + ut_a(buf_flush_validate_low(buf_pool)); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - rbt_free(buf_pool->flush_rbt); - buf_pool->flush_rbt = NULL; + rbt_free(buf_pool->flush_rbt); + buf_pool->flush_rbt = NULL; - buf_flush_list_mutex_exit(); + buf_flush_list_mutex_exit(buf_pool); + } } /********************************************************************//** @@ -224,14 +242,15 @@ UNIV_INTERN void buf_flush_insert_into_flush_list( /*=============================*/ - buf_block_t* block, /*!< in/out: block which is modified */ - ib_uint64_t lsn) /*!< in: oldest modification */ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_block_t* block, /*!< in/out: block which is modified */ + ib_uint64_t lsn) /*!< in: oldest modification */ { - ut_ad(!buf_pool_mutex_own()); - ut_ad(buf_flush_order_mutex_own()); + ut_ad(!buf_pool_mutex_own(buf_pool)); + ut_ad(log_flush_order_mutex_own()); ut_ad(mutex_own(&block->mutex)); - buf_flush_list_mutex_enter(); + buf_flush_list_mutex_enter(buf_pool); ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL) || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification @@ -240,8 +259,8 @@ buf_flush_insert_into_flush_list( /* If we are in the recovery then we need to update the flush red-black tree as well. 
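The tree exists only during crash recovery: redo application dirties pages in an order that need not match their oldest_modification LSNs, so without an index each insertion into the sorted flush list would be a linear scan. buf_pool->flush_rbt, ordered by buf_flush_block_cmp() above, finds the predecessor in O(log n). The recovery-time insert is, in outline (the full version is buf_flush_insert_sorted_into_flush_list() below):

	buf_page_t*	prev;

	prev = buf_flush_insert_in_flush_rbt(&block->page);

	if (prev) {
		UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
				     prev, &block->page);
	} else {
		UT_LIST_ADD_FIRST(list, buf_pool->flush_list,
				  &block->page);
	}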
*/ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { - buf_flush_list_mutex_exit(); - buf_flush_insert_sorted_into_flush_list(block, lsn); + buf_flush_list_mutex_exit(buf_pool); + buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn); return; } @@ -253,10 +272,10 @@ buf_flush_insert_into_flush_list( UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - ut_a(buf_flush_validate_low()); + ut_a(buf_flush_validate_low(buf_pool)); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - buf_flush_list_mutex_exit(); + buf_flush_list_mutex_exit(buf_pool); } /********************************************************************//** @@ -267,18 +286,19 @@ UNIV_INTERN void buf_flush_insert_sorted_into_flush_list( /*====================================*/ - buf_block_t* block, /*!< in/out: block which is modified */ - ib_uint64_t lsn) /*!< in: oldest modification */ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_block_t* block, /*!< in/out: block which is modified */ + ib_uint64_t lsn) /*!< in: oldest modification */ { buf_page_t* prev_b; buf_page_t* b; - ut_ad(!buf_pool_mutex_own()); - ut_ad(buf_flush_order_mutex_own()); + ut_ad(!buf_pool_mutex_own(buf_pool)); + ut_ad(log_flush_order_mutex_own()); ut_ad(mutex_own(&block->mutex)); ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); - buf_flush_list_mutex_enter(); + buf_flush_list_mutex_enter(buf_pool); /* The field in_LRU_list is protected by buf_pool_mutex, which we are not holding. However, while a block is in the flush @@ -332,10 +352,10 @@ buf_flush_insert_sorted_into_flush_list( } #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - ut_a(buf_flush_validate_low()); + ut_a(buf_flush_validate_low(buf_pool)); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - buf_flush_list_mutex_exit(); + buf_flush_list_mutex_exit(buf_pool); } /********************************************************************//** @@ -349,7 +369,10 @@ buf_flush_ready_for_replace( buf_page_t* bpage) /*!< in: buffer control block, must be buf_page_in_file(bpage) and in the LRU list */ { - ut_ad(buf_pool_mutex_own()); +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_pool_mutex_own(buf_pool)); +#endif ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(bpage->in_LRU_list); @@ -382,8 +405,11 @@ buf_flush_ready_for_flush( buf_page_in_file(bpage) */ enum buf_flush flush_type)/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */ { +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_pool_mutex_own(buf_pool)); +#endif ut_a(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(flush_type == BUF_FLUSH_LRU || BUF_FLUSH_LIST); @@ -416,15 +442,17 @@ buf_flush_remove( /*=============*/ buf_page_t* bpage) /*!< in: pointer to the block in question */ { - ut_ad(buf_pool_mutex_own()); + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(bpage->in_flush_list); - buf_flush_list_mutex_enter(); + buf_flush_list_mutex_enter(buf_pool); switch (buf_page_get_state(bpage)) { case BUF_BLOCK_ZIP_PAGE: - /* clean compressed pages should not be on the flush list */ + /* Clean compressed pages should not be on the flush list */ case BUF_BLOCK_ZIP_FREE: case BUF_BLOCK_NOT_USED: case BUF_BLOCK_READY_FOR_USE: @@ -442,7 +470,7 @@ buf_flush_remove( break; } - /* If the flush_rbt is active then delete from it as well. 
*/ + /* If the flush_rbt is active then delete from there as well. */ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { buf_flush_delete_from_flush_rbt(bpage); } @@ -454,18 +482,18 @@ buf_flush_remove( bpage->oldest_modification = 0; #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - ut_a(buf_flush_validate_low()); + ut_a(buf_flush_validate_low(buf_pool)); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - buf_flush_list_mutex_exit(); + buf_flush_list_mutex_exit(buf_pool); } /*******************************************************************//** Relocates a buffer control block on the flush_list. -Note that it is assumed that the contents of bpage has already been +Note that it is assumed that the contents of bpage have already been copied to dpage. IMPORTANT: When this function is called bpage and dpage are not -exact copy of each other. For example, they both will have different +exact copies of each other. For example, they both will have different ::state. Also the ::list pointers in dpage may be stale. We need to use the current list node (bpage) to do the list manipulation because the list pointers could have changed between the time that we copied @@ -478,17 +506,20 @@ buf_flush_relocate_on_flush_list( buf_page_t* bpage, /*!< in/out: control block being moved */ buf_page_t* dpage) /*!< in/out: destination block */ { - buf_page_t* prev; - buf_page_t* prev_b = NULL; + buf_page_t* prev; + buf_page_t* prev_b = NULL; + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); + /* Must reside in the same buffer pool. */ + ut_ad(buf_pool == buf_pool_from_bpage(dpage)); ut_ad(mutex_own(buf_page_get_mutex(bpage))); - buf_flush_list_mutex_enter(); + buf_flush_list_mutex_enter(buf_pool); /* FIXME: At this point we have both buf_pool and flush_list - mutexes. Theoratically removal of a block from flush list is + mutexes. Theoretically removal of a block from flush list is only covered by flush_list mutex but currently we do have buf_pool mutex in buf_flush_remove() therefore this block is guaranteed to be in the flush list. We need to check if @@ -529,10 +560,10 @@ buf_flush_relocate_on_flush_list( ut_a(!buf_pool->flush_rbt || prev_b == prev); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - ut_a(buf_flush_validate_low()); + ut_a(buf_flush_validate_low(buf_pool)); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - buf_flush_list_mutex_exit(); + buf_flush_list_mutex_exit(buf_pool); } /********************************************************************//** @@ -544,6 +575,7 @@ buf_flush_write_complete( buf_page_t* bpage) /*!< in: pointer to the block in question */ { enum buf_flush flush_type; + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); ut_ad(bpage); @@ -564,8 +596,8 @@ buf_flush_write_complete( /* fprintf(stderr, "n pending flush %lu\n", buf_pool->n_flush[flush_type]); */ - if ((buf_pool->n_flush[flush_type] == 0) - && (buf_pool->init_flush[flush_type] == FALSE)) { + if (buf_pool->n_flush[flush_type] == 0 + && buf_pool->init_flush[flush_type] == FALSE) { /* The running flush batch has ended */ @@ -979,6 +1011,10 @@ buf_flush_write_block_low( /*======================*/ buf_page_t* bpage) /*!< in: buffer block to write */ { +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(!buf_pool_mutex_own(buf_pool)); +#endif ulint zip_size = buf_page_get_zip_size(bpage); page_t* frame = NULL; #ifdef UNIV_LOG_DEBUG @@ -992,8 +1028,8 @@ buf_flush_write_block_low( io_fixed and oldest_modification != 0. 
Thus, it cannot be relocated in the buffer pool or removed from flush_list or LRU_list. */ - ut_ad(!buf_pool_mutex_own()); - ut_ad(!buf_flush_list_mutex_own()); + ut_ad(!buf_pool_mutex_own(buf_pool)); + ut_ad(!buf_flush_list_mutex_own(buf_pool)); ut_ad(!mutex_own(buf_page_get_mutex(bpage))); ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE); ut_ad(bpage->oldest_modification != 0); @@ -1062,13 +1098,14 @@ buf_flush_write_block_low( Writes a flushable page asynchronously from the buffer pool to a file. NOTE: in simulated aio we must call os_aio_simulated_wake_handler_threads after we have posted a batch of -writes! NOTE: buf_pool_mutex and buf_page_get_mutex(bpage) must be +writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be held upon entering this function, and they will be released by this function. */ static void buf_flush_page( /*===========*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ buf_page_t* bpage, /*!< in: buffer control block */ enum buf_flush flush_type) /*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */ @@ -1077,7 +1114,7 @@ buf_flush_page( ibool is_uncompressed; ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(buf_page_in_file(bpage)); block_mutex = buf_page_get_mutex(bpage); @@ -1097,7 +1134,7 @@ buf_flush_page( buf_pool->n_flush[flush_type]++; is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); - ut_ad(is_uncompressed == (block_mutex != &buf_pool_zip_mutex)); + ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex)); switch (flush_type) { ibool is_s_latched; @@ -1113,7 +1150,7 @@ buf_flush_page( } mutex_exit(block_mutex); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); /* Even though bpage is not protected by any mutex at this point, it is safe to access bpage, because it is @@ -1150,7 +1187,7 @@ buf_flush_page( immediately. */ mutex_exit(block_mutex); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); break; default: @@ -1184,13 +1221,13 @@ buf_flush_try_neighbors( enum buf_flush flush_type) /*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */ { - buf_page_t* bpage; - ulint low, high; - ulint count = 0; ulint i; + ulint low; + ulint high; + ulint count = 0; + buf_pool_t* buf_pool = buf_pool_get(space, offset); - ut_ad(flush_type == BUF_FLUSH_LRU - || flush_type == BUF_FLUSH_LIST); + ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) { /* If there is little space, it is better not to flush @@ -1203,8 +1240,11 @@ buf_flush_try_neighbors( neighborhoods of this size, and flushed along with the original page. */ - ulint buf_flush_area = ut_min(BUF_READ_AHEAD_AREA, - buf_pool->curr_size / 16); + ulint buf_flush_area; + + buf_flush_area = ut_min( + BUF_READ_AHEAD_AREA(buf_pool), + buf_pool->curr_size / 16); low = (offset / buf_flush_area) * buf_flush_area; high = (offset / buf_flush_area + 1) * buf_flush_area; @@ -1216,14 +1256,20 @@ buf_flush_try_neighbors( high = fil_space_get_size(space); } - buf_pool_mutex_enter(); - for (i = low; i < high; i++) { - bpage = buf_page_hash_get(space, i); + buf_page_t* bpage; + + buf_pool = buf_pool_get(space, i); + + buf_pool_mutex_enter(buf_pool); + + /* We only want to flush pages from this buffer pool. */ + bpage = buf_page_hash_get(buf_pool, space, i); if (!bpage) { + buf_pool_mutex_exit(buf_pool); continue; } @@ -1250,19 +1296,18 @@ buf_flush_try_neighbors( doublewrite buffer before we start waiting. 
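(The neighborhood arithmetic in buf_flush_try_neighbors() is unchanged: if buf_flush_area works out to 64, the smaller of BUF_READ_AHEAD_AREA(buf_pool) and curr_size / 16, and offset is 150, then low = (150 / 64) * 64 = 128 and high = 192, so pages 128..191 of the space are candidates. What is new is that consecutive page numbers hash to different buffer pool instances, which is why the loop re-derives buf_pool = buf_pool_get(space, i) and enters that instance's mutex for every single page instead of holding one mutex across the whole scan.)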
*/ - buf_flush_page(bpage, flush_type); + buf_flush_page(buf_pool, bpage, flush_type); ut_ad(!mutex_own(block_mutex)); + ut_ad(!buf_pool_mutex_own(buf_pool)); count++; - - buf_pool_mutex_enter(); + continue; } else { mutex_exit(block_mutex); } } + buf_pool_mutex_exit(buf_pool); } - buf_pool_mutex_exit(); - return(count); } @@ -1285,10 +1330,13 @@ buf_flush_page_and_try_neighbors( ulint* count) /*!< in/out: number of pages flushed */ { - ibool flushed = FALSE; mutex_t* block_mutex; + ibool flushed = FALSE; +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); +#endif /* UNIV_DEBUG */ - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); block_mutex = buf_page_get_mutex(bpage); mutex_enter(block_mutex); @@ -1296,10 +1344,13 @@ buf_flush_page_and_try_neighbors( ut_a(buf_page_in_file(bpage)); if (buf_flush_ready_for_flush(bpage, flush_type)) { - ulint space; - ulint offset; + ulint space; + ulint offset; + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_bpage(bpage); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); /* These fields are protected by both the buffer pool mutex and block mutex. */ @@ -1309,16 +1360,15 @@ buf_flush_page_and_try_neighbors( mutex_exit(block_mutex); /* Try to flush also all the neighbors */ - *count += buf_flush_try_neighbors(space, offset, - flush_type); + *count += buf_flush_try_neighbors(space, offset, flush_type); - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); flushed = TRUE; } else { mutex_exit(block_mutex); } - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); return(flushed); } @@ -1333,12 +1383,13 @@ static ulint buf_flush_LRU_list_batch( /*=====================*/ - ulint max) /*!< in: max of blocks to flush */ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint max) /*!< in: max of blocks to flush */ { buf_page_t* bpage; ulint count = 0; - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); do { /* Start from the end of the list looking for a @@ -1360,7 +1411,7 @@ buf_flush_LRU_list_batch( should be flushed, we factor in this value. */ buf_lru_flush_page_count += count; - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); return(count); } @@ -1375,6 +1426,7 @@ static ulint buf_flush_flush_list_batch( /*=======================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ ulint min_n, /*!< in: wished minimum mumber of blocks flushed (it is not guaranteed that the actual @@ -1389,16 +1441,16 @@ buf_flush_flush_list_batch( buf_page_t* bpage; ulint count = 0; - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); /* If we have flushed enough, leave the loop */ do { /* Start from the end of the list looking for a suitable block to be flushed. */ - buf_flush_list_mutex_enter(); + buf_flush_list_mutex_enter(buf_pool); - /* We use len here because theoratically insertions can + /* We use len here because theoretically insertions can happen in the flush_list below while we are traversing it for a suitable candidate for flushing. 
We'd like to set a limit on how farther we are willing to traverse @@ -1410,11 +1462,10 @@ buf_flush_flush_list_batch( ut_a(bpage->oldest_modification > 0); } - if (!bpage || bpage->oldest_modification >= lsn_limit) { /* We have flushed enough */ - buf_flush_list_mutex_exit(); + buf_flush_list_mutex_exit(buf_pool); break; } @@ -1422,7 +1473,7 @@ buf_flush_flush_list_batch( ut_ad(bpage->in_flush_list); - buf_flush_list_mutex_exit(); + buf_flush_list_mutex_exit(buf_pool); /* The list may change during the flushing and we cannot safely preserve within this function a pointer to a @@ -1432,12 +1483,11 @@ buf_flush_flush_list_batch( && !buf_flush_page_and_try_neighbors( bpage, BUF_FLUSH_LIST, &count)) { - buf_flush_list_mutex_enter(); + buf_flush_list_mutex_enter(buf_pool); - /* If we are here that means that buf_pool - mutex was not released in - buf_flush_page_and_try_neighbors() above and - this guarantees that bpage didn't get + /* If we are here that means that buf_pool->mutex + was not released in buf_flush_page_and_try_neighbors() + above and this guarantees that bpage didn't get relocated since we released the flush_list mutex above. There is a chance, however, that the bpage got removed from flush_list (not @@ -1447,21 +1497,22 @@ buf_flush_flush_list_batch( the oldest_modification and if it is zero we start all over again. */ if (bpage->oldest_modification == 0) { - buf_flush_list_mutex_exit(); + buf_flush_list_mutex_exit(buf_pool); break; } + bpage = UT_LIST_GET_PREV(list, bpage); ut_ad(!bpage || bpage->in_flush_list); - buf_flush_list_mutex_exit(); + buf_flush_list_mutex_exit(buf_pool); --len; } } while (count < min_n && bpage != NULL && len > 0); - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); return(count); } @@ -1474,10 +1525,11 @@ end up waiting for these latches! NOTE 2: in the case of a flush list flush, the calling thread is not allowed to own any latches on pages! @return number of blocks for which the write request was queued; ULINT_UNDEFINED if there was a flush of the same type already running */ -UNIV_INTERN +static ulint buf_flush_batch( /*============*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ enum buf_flush flush_type, /*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST; if BUF_FLUSH_LIST, then the caller must not own any @@ -1485,59 +1537,36 @@ buf_flush_batch( ulint min_n, /*!< in: wished minimum mumber of blocks flushed (it is not guaranteed that the actual number is that big, though) */ - ib_uint64_t lsn_limit) /*!< in the case BUF_FLUSH_LIST all - blocks whose oldest_modification is + ib_uint64_t lsn_limit) /*!< in: in the case of BUF_FLUSH_LIST + all blocks whose oldest_modification is smaller than this should be flushed (if their number does not exceed min_n), otherwise ignored */ { ulint count = 0; - ut_ad(flush_type == BUF_FLUSH_LRU - || flush_type == BUF_FLUSH_LIST); + ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); #ifdef UNIV_SYNC_DEBUG ut_ad((flush_type != BUF_FLUSH_LIST) || sync_thread_levels_empty_gen(TRUE)); #endif /* UNIV_SYNC_DEBUG */ - buf_pool_mutex_enter(); - if (buf_pool->n_flush[flush_type] > 0 - || buf_pool->init_flush[flush_type] == TRUE) { - - /* There is already a flush batch of the same type running */ - - buf_pool_mutex_exit(); - - return(ULINT_UNDEFINED); - } - - buf_pool->init_flush[flush_type] = TRUE; + buf_pool_mutex_enter(buf_pool); /* Note: The buffer pool mutex is released and reacquired within the flush functions. 
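Arbitration between whole batches, on the other hand, moved out of this function into buf_flush_start() and buf_flush_end() below, so every caller follows the same sequence; buf_flush_LRU() further down is the canonical example:

	if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
		/* A batch of this type is already running. */
		return(ULINT_UNDEFINED);
	}

	page_count = buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0);

	buf_flush_end(buf_pool, BUF_FLUSH_LRU);

	buf_flush_common(BUF_FLUSH_LRU, page_count);	/* stats */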
*/ switch(flush_type) { case BUF_FLUSH_LRU: - count = buf_flush_LRU_list_batch(min_n); + count = buf_flush_LRU_list_batch(buf_pool, min_n); break; case BUF_FLUSH_LIST: - count = buf_flush_flush_list_batch(min_n, lsn_limit); + count = buf_flush_flush_list_batch(buf_pool, min_n, lsn_limit); break; default: ut_error; } - ut_ad(buf_pool_mutex_own()); - - buf_pool->init_flush[flush_type] = FALSE; - - if (buf_pool->n_flush[flush_type] == 0) { - - /* The running flush batch has ended */ - - os_event_set(buf_pool->no_flush[flush_type]); - } - - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); buf_flush_buffered_writes(); @@ -1556,18 +1585,207 @@ buf_flush_batch( } /******************************************************************//** +Gather the aggregated stats for both flush list and LRU list flushing */ +static +void +buf_flush_common( +/*=============*/ + enum buf_flush flush_type, /*!< in: type of flush */ + ulint page_count) /*!< in: number of pages flushed */ +{ + buf_flush_buffered_writes(); + + ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); + +#ifdef UNIV_DEBUG + if (buf_debug_prints && page_count > 0) { + fprintf(stderr, flush_type == BUF_FLUSH_LRU + ? "Flushed %lu pages in LRU flush\n" + : "Flushed %lu pages in flush list flush\n", + (ulong) page_count); + } +#endif /* UNIV_DEBUG */ + + srv_buf_pool_flushed += page_count; + + if (flush_type == BUF_FLUSH_LRU) { + /* We keep track of all flushes happening as part of LRU + flush. When estimating the desired rate at which flush_list + should be flushed we factor in this value. */ + buf_lru_flush_page_count += page_count; + } +} + +/******************************************************************//** +Start a buffer flush batch for LRU or flush list */ +static +ibool +buf_flush_start( +/*============*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + enum buf_flush flush_type) /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ +{ + buf_pool_mutex_enter(buf_pool); + + if (buf_pool->n_flush[flush_type] > 0 + || buf_pool->init_flush[flush_type] == TRUE) { + + /* There is already a flush batch of the same type running */ + + buf_pool_mutex_exit(buf_pool); + + return(FALSE); + } + + buf_pool->init_flush[flush_type] = TRUE; + + buf_pool_mutex_exit(buf_pool); + + return(TRUE); +} + +/******************************************************************//** +End a buffer flush batch for LRU or flush list */ +static +void +buf_flush_end( +/*==========*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + enum buf_flush flush_type) /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ +{ + buf_pool_mutex_enter(buf_pool); + + buf_pool->init_flush[flush_type] = FALSE; + + if (buf_pool->n_flush[flush_type] == 0) { + + /* The running flush batch has ended */ + + os_event_set(buf_pool->no_flush[flush_type]); + } + + buf_pool_mutex_exit(buf_pool); +} + +/******************************************************************//** Waits until a flush batch of the given type ends */ UNIV_INTERN void buf_flush_wait_batch_end( /*=====================*/ - enum buf_flush type) /*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + enum buf_flush type) /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ { - ut_ad((type == BUF_FLUSH_LRU) || (type == BUF_FLUSH_LIST)); + ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST); + + if (buf_pool == NULL) { + ulint i; + + for (i = 0; i < srv_buf_pool_instances; ++i) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); - 
os_event_wait(buf_pool->no_flush[type]); + os_event_wait(buf_pool->no_flush[type]); + } + } else { + os_event_wait(buf_pool->no_flush[type]); + } } +/*******************************************************************//** +This utility flushes dirty blocks from the end of the LRU list. +NOTE: The calling thread may own latches to pages: to avoid deadlocks, +this function must be written so that it cannot end up waiting for these +latches! +@return number of blocks for which the write request was queued; +ULINT_UNDEFINED if there was a flush of the same type already running */ +UNIV_INTERN +ulint +buf_flush_LRU( +/*==========*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint min_n) /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ +{ + ulint page_count; + + if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) { + return(ULINT_UNDEFINED); + } + + page_count = buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0); + + buf_flush_end(buf_pool, BUF_FLUSH_LRU); + + buf_flush_common(BUF_FLUSH_LRU, page_count); + + return(page_count); +} + +/*******************************************************************//** +This utility flushes dirty blocks from the end of the flush list of +all buffer pool instances. +NOTE: The calling thread is not allowed to own any latches on pages! +@return number of blocks for which the write request was queued; +ULINT_UNDEFINED if there was a flush of the same type already running */ +UNIV_INTERN +ulint +buf_flush_list( +/*===========*/ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + ib_uint64_t lsn_limit) /*!< in the case BUF_FLUSH_LIST all + blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ +{ + ulint i; + ulint total_page_count = 0; + + if (min_n != ULINT_MAX) { + /* Ensure that flushing is spread evenly amongst the + buffer pool instances. When min_n is ULINT_MAX + we need to flush everything up to the lsn limit + so no limit here. */ + min_n = (min_n + srv_buf_pool_instances - 1) + / srv_buf_pool_instances; + } + + /* We use buffer pool instance 0 to control start and end of + flushing of the flush list since we always flush all instances + at once in this case. */ + + /* Flush to lsn_limit in all buffer pool instances */ + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + ulint page_count = 0; + + buf_pool = buf_pool_from_array(i); + + if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) { + continue; + } + + page_count = buf_flush_batch( + buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit); + + buf_flush_end(buf_pool, BUF_FLUSH_LIST); + + buf_flush_common(BUF_FLUSH_LIST, page_count); + + total_page_count += page_count; + } + + return(total_page_count); +} + /******************************************************************//** Gives a recommendation of how many blocks should be flushed to establish a big enough margin of replaceable blocks near the end of the LRU list @@ -1576,23 +1794,24 @@ and in the free list. 
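buf_flush_LRU and buf_flush_list above both wrap buf_flush_batch in the same bracket: buf_flush_start claims the per-instance batch slot, buf_flush_end signals waiters on buf_pool->no_flush, and buf_flush_common aggregates the statistics. buf_flush_list additionally divides the caller's quota evenly over the instances, rounding up; a standalone sketch of just that arithmetic, with illustrative values:

#include <stdio.h>

/* Per-instance quota used by buf_flush_list(): spread min_n evenly
over the instances, rounding up so the quotas never sum to less
than the caller asked for. */
static unsigned long
per_instance_quota(unsigned long min_n, unsigned long n_instances)
{
	return((min_n + n_instances - 1) / n_instances);
}

int
main(void)
{
	/* A request for 100 pages over 8 instances becomes 13 pages
	per instance: 8 * 13 = 104 >= 100. */
	printf("%lu\n", per_instance_quota(100, 8));
	return(0);
}

Rounding up means the instances may together flush slightly more than min_n, never less, unless an instance declines because a batch of the same type is already running there.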
LRU list */ static ulint -buf_flush_LRU_recommendation(void) -/*==============================*/ +buf_flush_LRU_recommendation( +/*=========================*/ + buf_pool_t* buf_pool) /*!< in: Buffer pool instance */ { buf_page_t* bpage; ulint n_replaceable; ulint distance = 0; - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); n_replaceable = UT_LIST_GET_LEN(buf_pool->free); bpage = UT_LIST_GET_LAST(buf_pool->LRU); while ((bpage != NULL) - && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN - + BUF_FLUSH_EXTRA_MARGIN) - && (distance < BUF_LRU_FREE_SEARCH_LEN)) { + && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool) + + BUF_FLUSH_EXTRA_MARGIN(buf_pool)) + && (distance < BUF_LRU_FREE_SEARCH_LEN(buf_pool))) { mutex_t* block_mutex = buf_page_get_mutex(bpage); @@ -1609,14 +1828,15 @@ buf_flush_LRU_recommendation(void) bpage = UT_LIST_GET_PREV(LRU, bpage); } - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); - if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) { + if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)) { return(0); } - return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN + return(BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool) + + BUF_FLUSH_EXTRA_MARGIN(buf_pool) - n_replaceable); } @@ -1628,25 +1848,46 @@ flush only pages such that the s-lock required for flushing can be acquired immediately, without waiting. */ UNIV_INTERN void -buf_flush_free_margin(void) -/*=======================*/ +buf_flush_free_margin( +/*==================*/ + buf_pool_t* buf_pool) /*!< in: Buffer pool instance */ { ulint n_to_flush; - ulint n_flushed; - n_to_flush = buf_flush_LRU_recommendation(); + n_to_flush = buf_flush_LRU_recommendation(buf_pool); if (n_to_flush > 0) { - n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, 0); + ulint n_flushed; + + n_flushed = buf_flush_LRU(buf_pool, n_to_flush); + if (n_flushed == ULINT_UNDEFINED) { /* There was an LRU type flush batch already running; let us wait for it to end */ - buf_flush_wait_batch_end(BUF_FLUSH_LRU); + buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU); } } } +/*********************************************************************//** +Flushes pages from the end of all the LRU lists. */ +UNIV_INTERN +void +buf_flush_free_margins(void) +/*========================*/ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + buf_flush_free_margin(buf_pool); + } +} + /********************************************************************* Update the historical stats that we are collecting for flush rate heuristics at the end of each interval. @@ -1707,22 +1948,28 @@ ulint buf_flush_get_desired_flush_rate(void) /*==================================*/ { - ulint redo_avg; - ulint lru_flush_avg; - ulint n_dirty; - ulint n_flush_req; - lint rate; - ib_uint64_t lsn = log_get_lsn(); - ulint log_capacity = log_get_capacity(); + ulint i; + lint rate; + ulint redo_avg; + ulint n_dirty = 0; + ulint n_flush_req; + ulint lru_flush_avg; + ib_uint64_t lsn = log_get_lsn(); + ulint log_capacity = log_get_capacity(); /* log_capacity should never be zero after the initialization of log subsystem. */ ut_ad(log_capacity != 0); /* Get total number of dirty pages. It is OK to access - flush_list without holding any mtex as we are using this + flush_list without holding any mutex as we are using this only for heuristics. 
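buf_flush_LRU_recommendation above now computes its margins per instance: if the instance already holds BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool) replaceable blocks nothing is flushed, otherwise it asks for enough pages to restore the margin plus the extra slack. A standalone sketch of that shape; 500 and 64 are made-up stand-ins for the margin macros, whose real values depend on the instance size:

#include <stdio.h>

static unsigned long
lru_flush_recommendation(unsigned long n_replaceable,
			 unsigned long free_margin,
			 unsigned long extra_margin)
{
	if (n_replaceable >= free_margin) {
		return(0);	/* margin already established */
	}

	return(free_margin + extra_margin - n_replaceable);
}

int
main(void)
{
	printf("%lu\n", lru_flush_recommendation(300, 500, 64)); /* 264 */
	return(0);
}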
*/ - n_dirty = UT_LIST_GET_LEN(buf_pool->flush_list); + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + n_dirty += UT_LIST_GET_LEN(buf_pool->flush_list); + } /* An overflow can happen if we generate more than 2^32 bytes of redo in this interval i.e.: 4G of redo in 1 second. We can @@ -1764,13 +2011,14 @@ Validates the flush list. @return TRUE if ok */ static ibool -buf_flush_validate_low(void) -/*========================*/ +buf_flush_validate_low( +/*===================*/ + buf_pool_t* buf_pool) /*!< in: Buffer pool instance */ { buf_page_t* bpage; const ib_rbt_node_t* rnode = NULL; - ut_ad(buf_flush_list_mutex_own()); + ut_ad(buf_flush_list_mutex_own(buf_pool)); UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list, ut_ad(ut_list_node_313->in_flush_list)); @@ -1786,6 +2034,9 @@ buf_flush_validate_low(void) while (bpage != NULL) { const ib_uint64_t om = bpage->oldest_modification; + + ut_ad(buf_pool_from_bpage(bpage) == buf_pool); + ut_ad(bpage->in_flush_list); /* A page in flush_list can be in BUF_BLOCK_REMOVE_HASH @@ -1795,14 +2046,15 @@ buf_flush_validate_low(void) waiting to acquire the flush_list_mutex to complete the relocation. */ ut_a(buf_page_in_file(bpage) - || buf_page_get_state(bpage) - == BUF_BLOCK_REMOVE_HASH); + || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH); ut_a(om > 0); if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { + buf_page_t* rpage; + ut_a(rnode); - buf_page_t* rpage = *rbt_value(buf_page_t*, - rnode); + rpage = *rbt_value(buf_page_t*, rnode); + ut_a(rpage); ut_a(rpage == bpage); rnode = rbt_next(buf_pool->flush_rbt, rnode); @@ -1825,16 +2077,17 @@ Validates the flush list. @return TRUE if ok */ UNIV_INTERN ibool -buf_flush_validate(void) -/*====================*/ +buf_flush_validate( +/*===============*/ + buf_pool_t* buf_pool) /*!< buffer pool instance */ { ibool ret; - buf_flush_list_mutex_enter(); + buf_flush_list_mutex_enter(buf_pool); - ret = buf_flush_validate_low(); + ret = buf_flush_validate_low(buf_pool); - buf_flush_list_mutex_exit(); + buf_flush_list_mutex_exit(buf_pool); return(ret); } diff --git a/storage/innobase/buf/buf0lru.c b/storage/innobase/buf/buf0lru.c index c7feb3ae79b..6a4c18aa86e 100644 --- a/storage/innobase/buf/buf0lru.c +++ b/storage/innobase/buf/buf0lru.c @@ -50,7 +50,7 @@ Created 11/5/1995 Heikki Tuuri #include "srv0srv.h" /** The number of blocks from the LRU_old pointer onward, including -the block pointed to, must be buf_LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV +the block pointed to, must be buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the whole LRU list length, except that the tolerance defined below is allowed. Note that the tolerance must be small enough such that for even the BUF_LRU_OLD_MIN_LEN long LRU list, the LRU_old pointer is not @@ -96,8 +96,9 @@ with page_zip_decompress() operations. */ #define BUF_LRU_IO_TO_UNZIP_FACTOR 50 /** Sampled values buf_LRU_stat_cur. -Protected by buf_pool_mutex. Updated by buf_LRU_stat_update(). */ +Not protected by any mutex. Updated by buf_LRU_stat_update(). */ static buf_LRU_stat_t buf_LRU_stat_arr[BUF_LRU_STAT_N_INTERVAL]; + /** Cursor to buf_LRU_stat_arr[] that is updated in a round-robin fashion. */ static ulint buf_LRU_stat_arr_ind; @@ -106,15 +107,12 @@ by buf_LRU_stat_update(). */ UNIV_INTERN buf_LRU_stat_t buf_LRU_stat_cur; /** Running sum of past values of buf_LRU_stat_cur. -Updated by buf_LRU_stat_update(). Protected by buf_pool_mutex. */ +Updated by buf_LRU_stat_update(). Not Protected by any mutex. 
*/ UNIV_INTERN buf_LRU_stat_t buf_LRU_stat_sum; /* @} */ /** @name Heuristics for detecting index scan @{ */ -/** Reserve this much/BUF_LRU_OLD_RATIO_DIV of the buffer pool for -"old" blocks. Protected by buf_pool_mutex. */ -UNIV_INTERN uint buf_LRU_old_ratio; /** Move blocks to "new" LRU list only if the first access was at least this many milliseconds ago. Not protected by any mutex or latch. */ UNIV_INTERN uint buf_LRU_old_threshold_ms; @@ -123,7 +121,7 @@ UNIV_INTERN uint buf_LRU_old_threshold_ms; /******************************************************************//** Takes a block out of the LRU list and page hash table. If the block is compressed-only (BUF_BLOCK_ZIP_PAGE), -the object will be freed and buf_pool_zip_mutex will be released. +the object will be freed and buf_pool->zip_mutex will be released. If a compressed page or a compressed-only block descriptor is freed, other compressed pages or compressed-only block descriptors may be @@ -154,13 +152,14 @@ instead of the general LRU list. @return TRUE if should use unzip_LRU */ UNIV_INLINE ibool -buf_LRU_evict_from_unzip_LRU(void) -/*==============================*/ +buf_LRU_evict_from_unzip_LRU( +/*=========================*/ + buf_pool_t* buf_pool) { ulint io_avg; ulint unzip_avg; - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); /* If the unzip_LRU list is empty, we can only use the LRU. */ if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0) { @@ -228,7 +227,8 @@ static void buf_LRU_drop_page_hash_for_tablespace( /*==================================*/ - ulint id) /*!< in: space id */ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint id) /*!< in: space id */ { buf_page_t* bpage; ulint* page_arr; @@ -243,9 +243,10 @@ buf_LRU_drop_page_hash_for_tablespace( return; } - page_arr = ut_malloc(sizeof(ulint) - * BUF_LRU_DROP_SEARCH_HASH_SIZE); - buf_pool_mutex_enter(); + page_arr = ut_malloc( + sizeof(ulint) * BUF_LRU_DROP_SEARCH_HASH_SIZE); + + buf_pool_mutex_enter(buf_pool); scan_again: num_entries = 0; @@ -283,14 +284,17 @@ scan_again: if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) { goto next_page; } - /* Array full. We release the buf_pool_mutex to + + /* Array full. We release the buf_pool->mutex to obey the latching order. */ - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); + + buf_LRU_drop_page_hash_batch( + id, zip_size, page_arr, num_entries); - buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, - num_entries); num_entries = 0; - buf_pool_mutex_enter(); + + buf_pool_mutex_enter(buf_pool); } else { mutex_exit(block_mutex); } @@ -315,7 +319,7 @@ next_page: } } - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); /* Drop any remaining batch of search hashed pages. */ buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, num_entries); @@ -323,27 +327,21 @@ next_page: } /******************************************************************//** -Invalidates all pages belonging to a given tablespace when we are deleting -the data file(s) of that tablespace. */ -UNIV_INTERN +Invalidates all pages belonging to a given tablespace inside a specific +buffer pool instance when we are deleting the data file(s) of that +tablespace. 
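buf_LRU_drop_page_hash_for_tablespace above collects page numbers into page_arr and only drops the accumulated adaptive-hash entries once the array fills, releasing buf_pool->mutex around the drop to obey the latching order. A standalone sketch of that accumulate-then-flush batching; the batch size and the drop_batch sink are hypothetical stand-ins:

#include <stdio.h>

#define DROP_BATCH_SIZE	1024	/* stands in for BUF_LRU_DROP_SEARCH_HASH_SIZE */

/* Hypothetical sink for one batch; the real code calls
buf_LRU_drop_page_hash_batch() with buf_pool->mutex released. */
static void
drop_batch(const unsigned long* pages, unsigned long n)
{
	(void) pages;
	printf("dropping %lu adaptive hash entries\n", n);
}

static void
drop_for_tablespace(const unsigned long* lru_pages, unsigned long len)
{
	unsigned long	batch[DROP_BATCH_SIZE];
	unsigned long	n = 0;
	unsigned long	i;

	for (i = 0; i < len; i++) {
		batch[n++] = lru_pages[i];

		if (n == DROP_BATCH_SIZE) {
			/* Here the real code exits buf_pool->mutex to
			obey the latching order, then re-enters it. */
			drop_batch(batch, n);
			n = 0;
		}
	}

	if (n > 0) {
		drop_batch(batch, n);	/* remainder */
	}
}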
*/ +static void -buf_LRU_invalidate_tablespace( -/*==========================*/ - ulint id) /*!< in: space id */ +buf_LRU_invalidate_tablespace_buf_pool_instance( +/*============================================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint id) /*!< in: space id */ { buf_page_t* bpage; ibool all_freed; - /* Before we attempt to drop pages one by one we first - attempt to drop page hash index entries in batches to make - it more efficient. The batching attempt is a best effort - attempt and does not guarantee that all pages hash entries - will be dropped. We get rid of remaining page hash entries - one by one below. */ - buf_LRU_drop_page_hash_for_tablespace(id); - scan_again: - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); all_freed = TRUE; @@ -417,7 +415,7 @@ scan_again: buf_pool_zip_mutex, it is not necessary to acquire further mutexes. */ - ut_ad(&buf_pool_zip_mutex + ut_ad(&buf_pool->zip_mutex == block_mutex); ut_ad(mutex_own(block_mutex)); prev_bpage_buf_fix = TRUE; @@ -431,7 +429,7 @@ scan_again: ulint page_no; ulint zip_size; - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); zip_size = buf_page_get_zip_size(bpage); page_no = buf_page_get_page_no(bpage); @@ -461,7 +459,7 @@ scan_again: /* The block_mutex should have been released by buf_LRU_block_remove_hashed_page() when it returns BUF_BLOCK_ZIP_FREE. */ - ut_ad(block_mutex == &buf_pool_zip_mutex); + ut_ad(block_mutex == &buf_pool->zip_mutex); ut_ad(!mutex_own(block_mutex)); if (prev_bpage_buf_fix) { @@ -488,7 +486,7 @@ next_page_no_mutex: bpage = prev_bpage; } - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); if (!all_freed) { os_thread_sleep(20000); @@ -497,6 +495,32 @@ next_page_no_mutex: } } +/******************************************************************//** +Invalidates all pages belonging to a given tablespace when we are deleting +the data file(s) of that tablespace. */ +UNIV_INTERN +void +buf_LRU_invalidate_tablespace( +/*==========================*/ + ulint id) /*!< in: space id */ +{ + ulint i; + + /* Before we attempt to drop pages one by one we first + attempt to drop page hash index entries in batches to make + it more efficient. The batching attempt is a best effort + attempt and does not guarantee that all pages hash entries + will be dropped. We get rid of remaining page hash entries + one by one below. */ + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + buf_LRU_drop_page_hash_for_tablespace(buf_pool, id); + buf_LRU_invalidate_tablespace_buf_pool_instance(buf_pool, id); + } +} + /********************************************************************//** Insert a compressed block into buf_pool->zip_clean in the LRU order. 
*/ UNIV_INTERN @@ -506,8 +530,9 @@ buf_LRU_insert_zip_clean( buf_page_t* bpage) /*!< in: pointer to the block in question */ { buf_page_t* b; + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE); /* Find the first successor of bpage in the LRU list @@ -537,16 +562,19 @@ UNIV_INLINE ibool buf_LRU_free_from_unzip_LRU_list( /*=============================*/ - ulint n_iterations) /*!< in: how many times this has been called - repeatedly without result: a high value means - that we should search farther; we will search - n_iterations / 5 of the unzip_LRU list, - or nothing if n_iterations >= 5 */ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint n_iterations) /*!< in: how many times this has + been called repeatedly without + result: a high value means that + we should search farther; we will + search n_iterations / 5 of the + unzip_LRU list, or nothing if + n_iterations >= 5 */ { buf_block_t* block; ulint distance; - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); /* Theoratically it should be much easier to find a victim from unzip_LRU as we can choose even a dirty block (as we'll @@ -556,7 +584,7 @@ buf_LRU_free_from_unzip_LRU_list( if we have done five iterations so far. */ if (UNIV_UNLIKELY(n_iterations >= 5) - || !buf_LRU_evict_from_unzip_LRU()) { + || !buf_LRU_evict_from_unzip_LRU(buf_pool)) { return(FALSE); } @@ -608,7 +636,9 @@ UNIV_INLINE ibool buf_LRU_free_from_common_LRU_list( /*==============================*/ - ulint n_iterations) /*!< in: how many times this has been called + buf_pool_t* buf_pool, + ulint n_iterations) + /*!< in: how many times this has been called repeatedly without result: a high value means that we should search farther; if n_iterations < 10, then we search @@ -618,7 +648,7 @@ buf_LRU_free_from_common_LRU_list( buf_page_t* bpage; ulint distance; - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); distance = 100 + (n_iterations * buf_pool->curr_size) / 10; @@ -675,7 +705,10 @@ UNIV_INTERN ibool buf_LRU_search_and_free_block( /*==========================*/ - ulint n_iterations) /*!< in: how many times this has been called + buf_pool_t* buf_pool, + /*!< in: buffer pool instance */ + ulint n_iterations) + /*!< in: how many times this has been called repeatedly without result: a high value means that we should search farther; if n_iterations < 10, then we search @@ -686,12 +719,13 @@ buf_LRU_search_and_free_block( { ibool freed = FALSE; - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); - freed = buf_LRU_free_from_unzip_LRU_list(n_iterations); + freed = buf_LRU_free_from_unzip_LRU_list(buf_pool, n_iterations); if (!freed) { - freed = buf_LRU_free_from_common_LRU_list(n_iterations); + freed = buf_LRU_free_from_common_LRU_list( + buf_pool, n_iterations); } if (!freed) { @@ -700,7 +734,7 @@ buf_LRU_search_and_free_block( buf_pool->LRU_flush_ended--; } - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); return(freed); } @@ -715,44 +749,64 @@ operations need new buffer blocks, and the i/o work done in flushing would be wasted. 
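buf_LRU_search_and_free_block above tries the unzip_LRU list first and falls back to the common LRU list, where the scan depth grows with every failed attempt. A standalone worked example of the escalation formula used by buf_LRU_free_from_common_LRU_list:

#include <stdio.h>

/* Scan depth for the common LRU list: each failed attempt makes
the next scan reach deeper into the LRU tail. */
static unsigned long
lru_search_distance(unsigned long n_iterations, unsigned long curr_size)
{
	return(100 + (n_iterations * curr_size) / 10);
}

int
main(void)
{
	unsigned long	n;

	/* With an instance of 8192 pages: 100 blocks on the first
	try, already 3376 on the fifth (n_iterations == 4). */
	for (n = 0; n < 5; n++) {
		printf("iteration %lu: distance %lu\n",
		       n, lru_search_distance(n, 8192));
	}

	return(0);
}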
*/ UNIV_INTERN void -buf_LRU_try_free_flushed_blocks(void) -/*=================================*/ +buf_LRU_try_free_flushed_blocks( +/*============================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool instance */ { - buf_pool_mutex_enter(); - while (buf_pool->LRU_flush_ended > 0) { + if (buf_pool == NULL) { + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool = buf_pool_from_array(i); + buf_LRU_try_free_flushed_blocks(buf_pool); + } + } else { + buf_pool_mutex_enter(buf_pool); - buf_pool_mutex_exit(); + while (buf_pool->LRU_flush_ended > 0) { - buf_LRU_search_and_free_block(1); + buf_pool_mutex_exit(buf_pool); - buf_pool_mutex_enter(); - } + buf_LRU_search_and_free_block(buf_pool, 1); + + buf_pool_mutex_enter(buf_pool); + } - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); + } } /******************************************************************//** -Returns TRUE if less than 25 % of the buffer pool is available. This can be -used in heuristics to prevent huge transactions eating up the whole buffer -pool for their locks. +Returns TRUE if less than 25 % of the buffer pool in any instance is +available. This can be used in heuristics to prevent huge transactions +eating up the whole buffer pool for their locks. @return TRUE if less than 25 % of buffer pool left */ UNIV_INTERN ibool buf_LRU_buf_pool_running_out(void) /*==============================*/ { - ibool ret = FALSE; + ulint i; + ibool ret = FALSE; - buf_pool_mutex_enter(); + for (i = 0; i < srv_buf_pool_instances && !ret; i++) { + buf_pool_t* buf_pool; - if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) - + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 4) { + buf_pool = buf_pool_from_array(i); - ret = TRUE; - } + buf_pool_mutex_enter(buf_pool); + + if (!recv_recovery_on + && UT_LIST_GET_LEN(buf_pool->free) + + UT_LIST_GET_LEN(buf_pool->LRU) + < buf_pool->curr_size / 4) { - buf_pool_mutex_exit(); + ret = TRUE; + } + + buf_pool_mutex_exit(buf_pool); + } return(ret); } @@ -763,16 +817,18 @@ free list. If it is empty, returns NULL. 
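buf_LRU_buf_pool_running_out above applies its 25% heuristic per instance: if the free-list and LRU-list pages together fall below a quarter of curr_size in any one instance, large transactions should back off. The per-instance check reduces to this sketch:

/* Anything not on the free or LRU list is tied up elsewhere
(buffer-fixed, under I/O, and so on). */
static int
pool_running_out(unsigned long n_free, unsigned long n_lru,
		 unsigned long curr_size)
{
	return(n_free + n_lru < curr_size / 4);
}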
@return a free control block, or NULL if the buf_block->free list is empty */ UNIV_INTERN buf_block_t* -buf_LRU_get_free_only(void) -/*=======================*/ +buf_LRU_get_free_only( +/*==================*/ + buf_pool_t* buf_pool) { buf_block_t* block; - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); block = (buf_block_t*) UT_LIST_GET_FIRST(buf_pool->free); if (block) { + ut_ad(block->page.in_free_list); ut_d(block->page.in_free_list = FALSE); ut_ad(!block->page.in_flush_list); @@ -785,6 +841,8 @@ buf_LRU_get_free_only(void) buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE); UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE); + ut_ad(buf_pool_from_block(block) == buf_pool); + mutex_exit(&block->mutex); } @@ -800,8 +858,9 @@ UNIV_INTERN buf_block_t* buf_LRU_get_free_block( /*===================*/ - ulint zip_size) /*!< in: compressed page size in bytes, - or 0 if uncompressed tablespace */ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint zip_size) /*!< in: compressed page size in bytes, + or 0 if uncompressed tablespace */ { buf_block_t* block = NULL; ibool freed; @@ -809,7 +868,7 @@ buf_LRU_get_free_block( ibool mon_value_was = FALSE; ibool started_monitor = FALSE; loop: - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 20) { @@ -876,9 +935,11 @@ loop: } /* If there is a block in the free list, take it */ - block = buf_LRU_get_free_only(); + block = buf_LRU_get_free_only(buf_pool); if (block) { + ut_ad(buf_pool_from_block(block) == buf_pool); + #ifdef UNIV_DEBUG block->page.zip.m_start = #endif /* UNIV_DEBUG */ @@ -889,14 +950,17 @@ loop: if (UNIV_UNLIKELY(zip_size)) { ibool lru; page_zip_set_size(&block->page.zip, zip_size); - block->page.zip.data = buf_buddy_alloc(zip_size, &lru); + + block->page.zip.data = buf_buddy_alloc( + buf_pool, zip_size, &lru); + UNIV_MEM_DESC(block->page.zip.data, zip_size, block); } else { page_zip_set_size(&block->page.zip, 0); block->page.zip.data = NULL; } - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); if (started_monitor) { srv_print_innodb_monitor = mon_value_was; @@ -908,9 +972,9 @@ loop: /* If no block was in the free list, search from the end of the LRU list and try to free a block there */ - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); - freed = buf_LRU_search_and_free_block(n_iterations); + freed = buf_LRU_search_and_free_block(buf_pool, n_iterations); if (freed > 0) { goto loop; @@ -952,23 +1016,23 @@ loop: /* No free block was found: try to flush the LRU list */ - buf_flush_free_margin(); + buf_flush_free_margin(buf_pool); ++srv_buf_pool_wait_free; os_aio_simulated_wake_handler_threads(); - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); if (buf_pool->LRU_flush_ended > 0) { /* We have written pages in an LRU flush. To make the insert buffer more efficient, we try to move these pages to the free list. */ - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); - buf_LRU_try_free_flushed_blocks(); + buf_LRU_try_free_flushed_blocks(buf_pool); } else { - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); } if (n_iterations > 10) { @@ -986,16 +1050,17 @@ Moves the LRU_old pointer so that the length of the old blocks list is inside the allowed limits. 
*/ UNIV_INLINE void -buf_LRU_old_adjust_len(void) -/*========================*/ +buf_LRU_old_adjust_len( +/*===================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool instance */ { ulint old_len; ulint new_len; ut_a(buf_pool->LRU_old); - ut_ad(buf_pool_mutex_own()); - ut_ad(buf_LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN); - ut_ad(buf_LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX); + ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(buf_pool->LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN); + ut_ad(buf_pool->LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX); #if BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN <= BUF_LRU_OLD_RATIO_DIV * (BUF_LRU_OLD_TOLERANCE + 5) # error "BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN <= BUF_LRU_OLD_RATIO_DIV * (BUF_LRU_OLD_TOLERANCE + 5)" #endif @@ -1011,7 +1076,7 @@ buf_LRU_old_adjust_len(void) old_len = buf_pool->LRU_old_len; new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU) - * buf_LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV, + * buf_pool->LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV, UT_LIST_GET_LEN(buf_pool->LRU) - (BUF_LRU_OLD_TOLERANCE + BUF_LRU_NON_OLD_MIN_LEN)); @@ -1053,12 +1118,13 @@ Initializes the old blocks pointer in the LRU list. This function should be called when the LRU list grows to BUF_LRU_OLD_MIN_LEN length. */ static void -buf_LRU_old_init(void) -/*==================*/ +buf_LRU_old_init( +/*=============*/ + buf_pool_t* buf_pool) { buf_page_t* bpage; - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN); /* We first initialize all blocks in the LRU list as old and then use @@ -1077,7 +1143,7 @@ buf_LRU_old_init(void) buf_pool->LRU_old = UT_LIST_GET_FIRST(buf_pool->LRU); buf_pool->LRU_old_len = UT_LIST_GET_LEN(buf_pool->LRU); - buf_LRU_old_adjust_len(); + buf_LRU_old_adjust_len(buf_pool); } /******************************************************************//** @@ -1088,10 +1154,12 @@ buf_unzip_LRU_remove_block_if_needed( /*=================================*/ buf_page_t* bpage) /*!< in/out: control block */ { + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_pool); ut_ad(bpage); ut_ad(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); if (buf_page_belongs_to_unzip_LRU(bpage)) { buf_block_t* block = (buf_block_t*) bpage; @@ -1111,9 +1179,11 @@ buf_LRU_remove_block( /*=================*/ buf_page_t* bpage) /*!< in: control block */ { + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_pool); ut_ad(bpage); - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); ut_a(buf_page_in_file(bpage)); @@ -1127,7 +1197,7 @@ buf_LRU_remove_block( /* Below: the previous block is guaranteed to exist, because the LRU_old pointer is only allowed to differ by BUF_LRU_OLD_TOLERANCE from strict - buf_LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU + buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU list length. 
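buf_LRU_old_adjust_len above keeps the "old" sublist at buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the whole LRU length, capped so that at least the tolerance plus the minimum "new" length remains. A standalone worked example of that target computation; the tolerance and minimum values here are illustrative:

#include <stdio.h>

#define BUF_LRU_OLD_RATIO_DIV	1024	/* as in buf0lru.h */

static unsigned long
lru_old_target_len(unsigned long lru_len, unsigned long old_ratio,
		   unsigned long tolerance, unsigned long non_old_min)
{
	unsigned long	by_ratio = lru_len * old_ratio / BUF_LRU_OLD_RATIO_DIV;
	unsigned long	cap = lru_len - (tolerance + non_old_min);

	return(by_ratio < cap ? by_ratio : cap);
}

int
main(void)
{
	/* Roughly 37% of a 10000-page LRU list:
	10000 * 378 / 1024 = 3691 old blocks. */
	printf("%lu\n", lru_old_target_len(10000, 378, 20, 5));
	return(0);
}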
*/ buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU, bpage); @@ -1173,7 +1243,7 @@ buf_LRU_remove_block( } /* Adjust the length of the old block list if necessary */ - buf_LRU_old_adjust_len(); + buf_LRU_old_adjust_len(buf_pool); } /******************************************************************//** @@ -1186,9 +1256,11 @@ buf_unzip_LRU_add_block( ibool old) /*!< in: TRUE if should be put to the end of the list, else put to the start */ { + buf_pool_t* buf_pool = buf_pool_from_block(block); + ut_ad(buf_pool); ut_ad(block); - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); @@ -1210,9 +1282,11 @@ buf_LRU_add_block_to_end_low( /*=========================*/ buf_page_t* bpage) /*!< in: control block */ { + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_pool); ut_ad(bpage); - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); ut_a(buf_page_in_file(bpage)); @@ -1228,14 +1302,14 @@ buf_LRU_add_block_to_end_low( buf_page_set_old(bpage, TRUE); buf_pool->LRU_old_len++; - buf_LRU_old_adjust_len(); + buf_LRU_old_adjust_len(buf_pool); } else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) { /* The LRU list is now long enough for LRU_old to become defined: init it */ - buf_LRU_old_init(); + buf_LRU_old_init(buf_pool); } else { buf_page_set_old(bpage, buf_pool->LRU_old != NULL); } @@ -1259,9 +1333,11 @@ buf_LRU_add_block_low( LRU list is very short, the block is added to the start, regardless of this parameter */ { + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_pool); ut_ad(bpage); - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); ut_a(buf_page_in_file(bpage)); ut_ad(!bpage->in_LRU_list); @@ -1295,14 +1371,14 @@ buf_LRU_add_block_low( /* Adjust the length of the old block list if necessary */ buf_page_set_old(bpage, old); - buf_LRU_old_adjust_len(); + buf_LRU_old_adjust_len(buf_pool); } else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) { /* The LRU list is now long enough for LRU_old to become defined: init it */ - buf_LRU_old_init(); + buf_LRU_old_init(buf_pool); } else { buf_page_set_old(bpage, buf_pool->LRU_old != NULL); } @@ -1338,7 +1414,9 @@ buf_LRU_make_block_young( /*=====================*/ buf_page_t* bpage) /*!< in: control block */ { - ut_ad(buf_pool_mutex_own()); + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + ut_ad(buf_pool_mutex_own(buf_pool)); if (bpage->old) { buf_pool->stat.n_pages_made_young++; @@ -1365,10 +1443,10 @@ Try to free a block. If bpage is a descriptor of a compressed-only page, the descriptor object will be freed as well. NOTE: If this function returns BUF_LRU_FREED, it will not temporarily -release buf_pool_mutex. Furthermore, the page frame will no longer be +release buf_pool->mutex. Furthermore, the page frame will no longer be accessible via bpage. -The caller must hold buf_pool_mutex and buf_page_get_mutex(bpage) and +The caller must hold buf_pool->mutex and buf_page_get_mutex(bpage) and release these two mutexes after the call. No other buf_page_get_mutex() may be held when calling this function. 
@return BUF_LRU_FREED if freed, BUF_LRU_CANNOT_RELOCATE or @@ -1382,13 +1460,14 @@ buf_LRU_free_block( compressed page of an uncompressed page */ ibool* buf_pool_mutex_released) /*!< in: pointer to a variable that will - be assigned TRUE if buf_pool_mutex + be assigned TRUE if buf_pool->mutex was temporarily released, or NULL */ { buf_page_t* b = NULL; + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); mutex_t* block_mutex = buf_page_get_mutex(bpage); - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(mutex_own(block_mutex)); ut_ad(buf_page_in_file(bpage)); ut_ad(bpage->in_LRU_list); @@ -1427,9 +1506,9 @@ buf_LRU_free_block( If it cannot be allocated (without freeing a block from the LRU list), refuse to free bpage. */ alloc: - buf_pool_mutex_exit_forbid(); - b = buf_buddy_alloc(sizeof *b, NULL); - buf_pool_mutex_exit_allow(); + buf_pool_mutex_exit_forbid(buf_pool); + b = buf_buddy_alloc(buf_pool, sizeof *b, NULL); + buf_pool_mutex_exit_allow(buf_pool); if (UNIV_UNLIKELY(!b)) { return(BUF_LRU_CANNOT_RELOCATE); @@ -1451,11 +1530,14 @@ alloc: ut_a(bpage->buf_fix_count == 0); if (b) { + buf_page_t* hash_b; buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b); - const ulint fold = buf_page_address_fold( + + const ulint fold = buf_page_address_fold( bpage->space, bpage->offset); - buf_page_t* hash_b = buf_page_hash_get_low( - bpage->space, bpage->offset, fold); + + hash_b = buf_page_hash_get_low( + buf_pool, bpage->space, bpage->offset, fold); ut_a(!hash_b); @@ -1512,12 +1594,12 @@ alloc: ut_ad(buf_pool->LRU_old); /* Adjust the length of the old block list if necessary */ - buf_LRU_old_adjust_len(); + buf_LRU_old_adjust_len(buf_pool); } else if (lru_len == BUF_LRU_OLD_MIN_LEN) { /* The LRU list is now long enough for LRU_old to become defined: init it */ - buf_LRU_old_init(); + buf_LRU_old_init(buf_pool); } #ifdef UNIV_LRU_DEBUG /* Check that the "old" flag is consistent @@ -1541,7 +1623,7 @@ alloc: /* Prevent buf_page_get_gen() from decompressing the block while we release - buf_pool_mutex and block_mutex. */ + buf_pool->mutex and block_mutex. */ b->buf_fix_count++; b->io_fix = BUF_IO_READ; } @@ -1550,7 +1632,7 @@ alloc: *buf_pool_mutex_released = TRUE; } - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); mutex_exit(block_mutex); /* Remove possible adaptive hash index on the page. @@ -1582,14 +1664,14 @@ alloc: : BUF_NO_CHECKSUM_MAGIC); } - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); mutex_enter(block_mutex); if (b) { - mutex_enter(&buf_pool_zip_mutex); + mutex_enter(&buf_pool->zip_mutex); b->buf_fix_count--; buf_page_set_io_fix(b, BUF_IO_NONE); - mutex_exit(&buf_pool_zip_mutex); + mutex_exit(&buf_pool->zip_mutex); } buf_LRU_block_free_hashed_page((buf_block_t*) bpage); @@ -1597,7 +1679,7 @@ alloc: /* The block_mutex should have been released by buf_LRU_block_remove_hashed_page() when it returns BUF_BLOCK_ZIP_FREE. 
*/ - ut_ad(block_mutex == &buf_pool_zip_mutex); + ut_ad(block_mutex == &buf_pool->zip_mutex); mutex_enter(block_mutex); } @@ -1612,10 +1694,11 @@ buf_LRU_block_free_non_file_page( /*=============================*/ buf_block_t* block) /*!< in: block, must not contain a file page */ { - void* data; + void* data; + buf_pool_t* buf_pool = buf_pool_from_block(block); ut_ad(block); - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(mutex_own(&block->mutex)); switch (buf_block_get_state(block)) { @@ -1649,9 +1732,12 @@ buf_LRU_block_free_non_file_page( if (data) { block->page.zip.data = NULL; mutex_exit(&block->mutex); - buf_pool_mutex_exit_forbid(); - buf_buddy_free(data, page_zip_get_size(&block->page.zip)); - buf_pool_mutex_exit_allow(); + buf_pool_mutex_exit_forbid(buf_pool); + + buf_buddy_free( + buf_pool, data, page_zip_get_size(&block->page.zip)); + + buf_pool_mutex_exit_allow(buf_pool); mutex_enter(&block->mutex); page_zip_set_size(&block->page.zip, 0); } @@ -1665,7 +1751,7 @@ buf_LRU_block_free_non_file_page( /******************************************************************//** Takes a block out of the LRU list and page hash table. If the block is compressed-only (BUF_BLOCK_ZIP_PAGE), -the object will be freed and buf_pool_zip_mutex will be released. +the object will be freed and buf_pool->zip_mutex will be released. If a compressed page or a compressed-only block descriptor is freed, other compressed pages or compressed-only block descriptors may be @@ -1684,8 +1770,10 @@ buf_LRU_block_remove_hashed_page( { ulint fold; const buf_page_t* hashed_bpage; + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(bpage); - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE); @@ -1767,8 +1855,8 @@ buf_LRU_block_remove_hashed_page( } fold = buf_page_address_fold(bpage->space, bpage->offset); - hashed_bpage = buf_page_hash_get_low(bpage->space, bpage->offset, - fold); + hashed_bpage = buf_page_hash_get_low( + buf_pool, bpage->space, bpage->offset, fold); if (UNIV_UNLIKELY(bpage != hashed_bpage)) { fprintf(stderr, @@ -1788,7 +1876,7 @@ buf_LRU_block_remove_hashed_page( #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG mutex_exit(buf_page_get_mutex(bpage)); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); buf_print(); buf_LRU_print(); buf_validate(); @@ -1811,12 +1899,16 @@ buf_LRU_block_remove_hashed_page( UT_LIST_REMOVE(list, buf_pool->zip_clean, bpage); - mutex_exit(&buf_pool_zip_mutex); - buf_pool_mutex_exit_forbid(); - buf_buddy_free(bpage->zip.data, - page_zip_get_size(&bpage->zip)); - buf_buddy_free(bpage, sizeof(*bpage)); - buf_pool_mutex_exit_allow(); + mutex_exit(&buf_pool->zip_mutex); + buf_pool_mutex_exit_forbid(buf_pool); + + buf_buddy_free( + buf_pool, bpage->zip.data, + page_zip_get_size(&bpage->zip)); + + buf_buddy_free(buf_pool, bpage, sizeof(*bpage)); + buf_pool_mutex_exit_allow(buf_pool); + UNIV_MEM_UNDESC(bpage); return(BUF_BLOCK_ZIP_FREE); @@ -1838,9 +1930,13 @@ buf_LRU_block_remove_hashed_page( ut_ad(!bpage->in_flush_list); ut_ad(!bpage->in_LRU_list); mutex_exit(&((buf_block_t*) bpage)->mutex); - buf_pool_mutex_exit_forbid(); - buf_buddy_free(data, page_zip_get_size(&bpage->zip)); - buf_pool_mutex_exit_allow(); + buf_pool_mutex_exit_forbid(buf_pool); + + buf_buddy_free( + buf_pool, data, + page_zip_get_size(&bpage->zip)); + + buf_pool_mutex_exit_allow(buf_pool); mutex_enter(&((buf_block_t*) bpage)->mutex); 
page_zip_set_size(&bpage->zip, 0); } @@ -1869,7 +1965,10 @@ buf_LRU_block_free_hashed_page( buf_block_t* block) /*!< in: block, must contain a file page and be in a state where it can be freed */ { - ut_ad(buf_pool_mutex_own()); +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_block(block); + ut_ad(buf_pool_mutex_own(buf_pool)); +#endif ut_ad(mutex_own(&block->mutex)); buf_block_set_state(block, BUF_BLOCK_MEMORY); @@ -1878,17 +1977,18 @@ buf_LRU_block_free_hashed_page( } /**********************************************************************//** -Updates buf_LRU_old_ratio. +Updates buf_pool->LRU_old_ratio for one buffer pool instance. @return updated old_pct */ -UNIV_INTERN +static uint -buf_LRU_old_ratio_update( -/*=====================*/ - uint old_pct,/*!< in: Reserve this percentage of - the buffer pool for "old" blocks. */ - ibool adjust) /*!< in: TRUE=adjust the LRU list; - FALSE=just assign buf_LRU_old_ratio - during the initialization of InnoDB */ +buf_LRU_old_ratio_update_instance( +/*==============================*/ + buf_pool_t* buf_pool,/*!< in: buffer pool instance */ + uint old_pct,/*!< in: Reserve this percentage of + the buffer pool for "old" blocks. */ + ibool adjust) /*!< in: TRUE=adjust the LRU list; + FALSE=just assign buf_pool->LRU_old_ratio + during the initialization of InnoDB */ { uint ratio; @@ -1900,27 +2000,55 @@ buf_LRU_old_ratio_update( } if (adjust) { - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); - if (ratio != buf_LRU_old_ratio) { - buf_LRU_old_ratio = ratio; + if (ratio != buf_pool->LRU_old_ratio) { + buf_pool->LRU_old_ratio = ratio; if (UT_LIST_GET_LEN(buf_pool->LRU) - >= BUF_LRU_OLD_MIN_LEN) { - buf_LRU_old_adjust_len(); + >= BUF_LRU_OLD_MIN_LEN) { + + buf_LRU_old_adjust_len(buf_pool); } } - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); } else { - buf_LRU_old_ratio = ratio; + buf_pool->LRU_old_ratio = ratio; } - /* the reverse of ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100 */ return((uint) (ratio * 100 / (double) BUF_LRU_OLD_RATIO_DIV + 0.5)); } +/**********************************************************************//** +Updates buf_pool->LRU_old_ratio. +@return updated old_pct */ +UNIV_INTERN +ulint +buf_LRU_old_ratio_update( +/*=====================*/ + uint old_pct,/*!< in: Reserve this percentage of + the buffer pool for "old" blocks. */ + ibool adjust) /*!< in: TRUE=adjust the LRU list; + FALSE=just assign buf_pool->LRU_old_ratio + during the initialization of InnoDB */ +{ + ulint i; + ulint new_ratio = 0; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + new_ratio = buf_LRU_old_ratio_update_instance( + buf_pool, old_pct, adjust); + } + + return(new_ratio); +} + /********************************************************************//** Update the historical stats that we are collecting for LRU eviction policy at the end of each interval. */ @@ -1929,14 +2057,25 @@ void buf_LRU_stat_update(void) /*=====================*/ { + ulint i; buf_LRU_stat_t* item; + buf_pool_t* buf_pool; + ibool evict_started = FALSE; /* If we haven't started eviction yet then don't update stats. */ - if (buf_pool->freed_page_clock == 0) { - goto func_exit; + for (i = 0; i < srv_buf_pool_instances; i++) { + + buf_pool = buf_pool_from_array(i); + + if (buf_pool->freed_page_clock != 0) { + evict_started = TRUE; + break; + } } - buf_pool_mutex_enter(); + if (!evict_started) { + goto func_exit; + } /* Update the index. 
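buf_LRU_old_ratio_update_instance above stores the percentage as a fixed-point ratio out of BUF_LRU_OLD_RATIO_DIV and converts it back, rounded, for the value it reports. A standalone round trip of that conversion:

#include <stdio.h>

#define BUF_LRU_OLD_RATIO_DIV	1024	/* as in buf0lru.h */

int
main(void)
{
	unsigned	old_pct = 37;
	unsigned	ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100;
	unsigned	back = (unsigned) (ratio * 100
				/ (double) BUF_LRU_OLD_RATIO_DIV + 0.5);

	/* pct=37 ratio=378 back=37 */
	printf("pct=%u ratio=%u back=%u\n", old_pct, ratio, back);
	return(0);
}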
*/ item = &buf_LRU_stat_arr[buf_LRU_stat_arr_ind]; @@ -1950,8 +2089,6 @@ buf_LRU_stat_update(void) /* Put current entry in the array. */ memcpy(item, &buf_LRU_stat_cur, sizeof *item); - buf_pool_mutex_exit(); - func_exit: /* Clear the current entry. */ memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur); @@ -1959,12 +2096,12 @@ func_exit: #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /**********************************************************************//** -Validates the LRU list. -@return TRUE */ -UNIV_INTERN -ibool -buf_LRU_validate(void) -/*==================*/ +Validates the LRU list for one buffer pool instance. */ +static +void +buf_LRU_validate_instance( +/*======================*/ + buf_pool_t* buf_pool) { buf_page_t* bpage; buf_block_t* block; @@ -1972,14 +2109,15 @@ buf_LRU_validate(void) ulint new_len; ut_ad(buf_pool); - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { ut_a(buf_pool->LRU_old); old_len = buf_pool->LRU_old_len; new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU) - * buf_LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV, + * buf_pool->LRU_old_ratio + / BUF_LRU_OLD_RATIO_DIV, UT_LIST_GET_LEN(buf_pool->LRU) - (BUF_LRU_OLD_TOLERANCE + BUF_LRU_NON_OLD_MIN_LEN)); @@ -2055,23 +2193,43 @@ buf_LRU_validate(void) ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); } - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); +} + +/**********************************************************************//** +Validates the LRU list. +@return TRUE */ +UNIV_INTERN +ibool +buf_LRU_validate(void) +/*==================*/ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + buf_LRU_validate_instance(buf_pool); + } + return(TRUE); } #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ #if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /**********************************************************************//** -Prints the LRU list. */ +Prints the LRU list for one buffer pool instance. */ UNIV_INTERN void -buf_LRU_print(void) -/*===============*/ +buf_LRU_print_instance( +/*===================*/ + buf_pool_t* buf_pool) { const buf_page_t* bpage; ut_ad(buf_pool); - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); bpage = UT_LIST_GET_FIRST(buf_pool->LRU); @@ -2130,6 +2288,22 @@ buf_LRU_print(void) bpage = UT_LIST_GET_NEXT(LRU, bpage); } - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); +} + +/**********************************************************************//** +Prints the LRU list. */ +UNIV_INTERN +void +buf_LRU_print(void) +/*===============*/ +{ + ulint i; + buf_pool_t* buf_pool; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool = buf_pool_from_array(i); + buf_LRU_print_instance(buf_pool); + } } #endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ diff --git a/storage/innobase/buf/buf0rea.c b/storage/innobase/buf/buf0rea.c index 81f788baac2..99a56bf91bc 100644 --- a/storage/innobase/buf/buf0rea.c +++ b/storage/innobase/buf/buf0rea.c @@ -171,6 +171,7 @@ buf_read_page( ulint zip_size,/*!< in: compressed page size in bytes, or 0 */ ulint offset) /*!< in: page number */ { + buf_pool_t* buf_pool = buf_pool_get(space, offset); ib_int64_t tablespace_version; ulint count; ulint err; @@ -195,7 +196,7 @@ buf_read_page( } /* Flush pages from the end of the LRU list if necessary */ - buf_flush_free_margin(); + buf_flush_free_margin(buf_pool); /* Increment number of I/O operations used for LRU policy. 
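buf_LRU_stat_update above keeps a round-robin window of the most recent samples together with their running sum, so averaging never requires rescanning the array. A standalone sketch of one tick; the interval count and field names are stand-ins for the in-tree definitions:

#include <string.h>

#define N_INTERVAL	50	/* stands in for BUF_LRU_STAT_N_INTERVAL */

typedef struct {
	unsigned long	io;
	unsigned long	unzip;
} lru_stat_t;

static lru_stat_t	arr[N_INTERVAL];
static unsigned long	ind;
static lru_stat_t	sum;	/* running sum of arr[] */
static lru_stat_t	cur;	/* sample being collected */

/* One tick of the round-robin update: retire the slot being
overwritten from the running sum and store the fresh sample.
Unsigned wrap-around keeps the subtraction well defined. */
static void
lru_stat_tick(void)
{
	lru_stat_t*	item = &arr[ind];

	ind = (ind + 1) % N_INTERVAL;

	sum.io += cur.io - item->io;
	sum.unzip += cur.unzip - item->unzip;

	*item = cur;
	memset(&cur, 0, sizeof cur);
}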
*/ buf_LRU_stat_inc_io(); @@ -236,6 +237,7 @@ buf_read_ahead_linear( ulint offset) /*!< in: page number of a page; NOTE: the current thread must want access to this page (see NOTE 3 above) */ { + buf_pool_t* buf_pool = buf_pool_get(space, offset); ib_int64_t tablespace_version; buf_page_t* bpage; buf_frame_t* frame; @@ -251,7 +253,7 @@ buf_read_ahead_linear( ulint err; ulint i; const ulint buf_read_ahead_linear_area - = BUF_READ_AHEAD_LINEAR_AREA; + = BUF_READ_AHEAD_LINEAR_AREA(buf_pool); ulint threshold; if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) { @@ -286,10 +288,10 @@ buf_read_ahead_linear( tablespace_version = fil_space_get_version(space); - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); if (high > fil_space_get_size(space)) { - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); /* The area is not whole, return */ return(0); @@ -297,7 +299,7 @@ buf_read_ahead_linear( if (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); return(0); } @@ -315,14 +317,14 @@ buf_read_ahead_linear( /* How many out of order accessed pages can we ignore when working out the access pattern for linear readahead */ threshold = ut_min((64 - srv_read_ahead_threshold), - BUF_READ_AHEAD_AREA); + BUF_READ_AHEAD_AREA(buf_pool)); fail_count = 0; for (i = low; i < high; i++) { - bpage = buf_page_hash_get(space, i); + bpage = buf_page_hash_get(buf_pool, space, i); - if ((bpage == NULL) || !buf_page_is_accessed(bpage)) { + if (bpage == NULL || !buf_page_is_accessed(bpage)) { /* Not accessed */ fail_count++; @@ -346,7 +348,7 @@ buf_read_ahead_linear( if (fail_count > threshold) { /* Too many failures: return */ - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); return(0); } @@ -358,10 +360,10 @@ buf_read_ahead_linear( /* If we got this far, we know that enough pages in the area have been accessed in the right order: linear read-ahead can be sensible */ - bpage = buf_page_hash_get(space, offset); + bpage = buf_page_hash_get(buf_pool, space, offset); if (bpage == NULL) { - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); return(0); } @@ -387,7 +389,7 @@ buf_read_ahead_linear( pred_offset = fil_page_get_prev(frame); succ_offset = fil_page_get_next(frame); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); if ((offset == low) && (succ_offset == offset + 1)) { @@ -466,7 +468,7 @@ buf_read_ahead_linear( os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of the LRU list if necessary */ - buf_flush_free_margin(); + buf_flush_free_margin(buf_pool); #ifdef UNIV_DEBUG if (buf_debug_prints && (count > 0)) { @@ -518,14 +520,18 @@ buf_read_ibuf_merge_pages( #ifdef UNIV_IBUF_DEBUG ut_a(n_stored < UNIV_PAGE_SIZE); #endif - while (buf_pool->n_pend_reads - > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { - os_thread_sleep(500000); - } for (i = 0; i < n_stored; i++) { - ulint zip_size = fil_space_get_zip_size(space_ids[i]); - ulint err; + ulint err; + buf_pool_t* buf_pool; + ulint zip_size = fil_space_get_zip_size(space_ids[i]); + + buf_pool = buf_pool_get(space_ids[i], space_versions[i]); + + while (buf_pool->n_pend_reads + > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { + os_thread_sleep(500000); + } if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { @@ -550,8 +556,8 @@ tablespace_deleted: os_aio_simulated_wake_handler_threads(); - /* Flush pages from the end of the LRU list if necessary */ - buf_flush_free_margin(); + /* Flush pages from the end of all the LRU lists if 
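buf_read_ahead_linear above operates on the page's read-ahead area, whose size is now the per-instance BUF_READ_AHEAD_AREA(buf_pool). The window bounds are assumed here to be the usual align-down/align-up computation, sketched standalone:

#include <stdio.h>

static void
linear_area(unsigned long offset, unsigned long area,
	    unsigned long* low, unsigned long* high)
{
	*low = (offset / area) * area;
	*high = *low + area;
}

int
main(void)
{
	unsigned long	low;
	unsigned long	high;

	/* Page 1000 with a 64-page area lies in [960, 1024). */
	linear_area(1000, 64, &low, &high);
	printf("[%lu, %lu)\n", low, high);
	return(0);
}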
necessary */ + buf_flush_free_margins(); #ifdef UNIV_DEBUG if (buf_debug_prints) { @@ -600,11 +606,12 @@ buf_read_recv_pages( tablespace_version = fil_space_get_version(space); for (i = 0; i < n_stored; i++) { + buf_pool_t* buf_pool; count = 0; os_aio_print_debug = FALSE; - + buf_pool = buf_pool_get(space, page_nos[i]); while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) { os_aio_simulated_wake_handler_threads(); @@ -643,8 +650,8 @@ buf_read_recv_pages( os_aio_simulated_wake_handler_threads(); - /* Flush pages from the end of the LRU list if necessary */ - buf_flush_free_margin(); + /* Flush pages from the end of all the LRU lists if necessary */ + buf_flush_free_margins(); #ifdef UNIV_DEBUG if (buf_debug_prints) { diff --git a/storage/innobase/ha/ha0ha.c b/storage/innobase/ha/ha0ha.c index db85288298d..9d9d341ad39 100644 --- a/storage/innobase/ha/ha0ha.c +++ b/storage/innobase/ha/ha0ha.c @@ -403,8 +403,6 @@ ha_print_info( FILE* file, /*!< in: file where to print */ hash_table_t* table) /*!< in: hash table */ { - ut_ad(table); - ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); #ifdef UNIV_DEBUG /* Some of the code here is disabled for performance reasons in production builds, see http://bugs.mysql.com/36941 */ @@ -418,6 +416,8 @@ builds, see http://bugs.mysql.com/36941 */ #endif /* PRINT_USED_CELLS */ ulint n_bufs; + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); #ifdef PRINT_USED_CELLS for (i = 0; i < hash_get_n_cells(table); i++) { diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index fb3db95f33e..e526be7c55c 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -135,6 +135,7 @@ static long innobase_mirrored_log_groups, innobase_log_files_in_group, static ulong innobase_commit_concurrency = 0; static ulong innobase_read_io_threads; static ulong innobase_write_io_threads; +static long innobase_buffer_pool_instances = 1; static long long innobase_buffer_pool_size, innobase_log_file_size; @@ -241,7 +242,7 @@ static PSI_mutex_info all_innodb_mutexes[] = { {&file_format_max_mutex_key, "file_format_max_mutex", 0}, {&fil_system_mutex_key, "fil_system_mutex", 0}, {&flush_list_mutex_key, "flush_list_mutex", 0}, - {&flush_order_mutex_key, "flush_order_mutex", 0}, + {&log_flush_order_mutex_key, "log_flush_order_mutex", 0}, {&hash_table_mutex_key, "hash_table_mutex", 0}, {&ibuf_bitmap_mutex_key, "ibuf_bitmap_mutex", 0}, {&ibuf_mutex_key, "ibuf_mutex", 0}, @@ -2305,6 +2306,7 @@ innobase_change_buffering_inited_ok: srv_log_buffer_size = (ulint) innobase_log_buffer_size; srv_buf_pool_size = (ulint) innobase_buffer_pool_size; + srv_buf_pool_instances = (ulint) innobase_buffer_pool_instances; srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size; @@ -2348,9 +2350,6 @@ innobase_change_buffering_inited_ok: ut_a(0 == strcmp(my_charset_latin1.name, "latin1_swedish_ci")); srv_latin1_ordering = my_charset_latin1.sort_order; - innobase_old_blocks_pct = buf_LRU_old_ratio_update( - innobase_old_blocks_pct, FALSE); - innobase_commit_concurrency_init_default(); #ifdef HAVE_PSI_INTERFACE @@ -2404,6 +2403,9 @@ innobase_change_buffering_inited_ok: goto mem_free_and_error; } + innobase_old_blocks_pct = buf_LRU_old_ratio_update( + innobase_old_blocks_pct, TRUE); + innobase_open_tables = hash_create(200); mysql_mutex_init(innobase_share_mutex_key, &innobase_share_mutex, @@ -3337,6 +3339,8 @@ innobase_build_index_translation( DBUG_ENTER("innobase_build_index_translation"); + mutex_enter(&dict_sys->mutex); + 
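With innodb_buffer_pool_instances above, every page address must map to exactly one instance, and the patch routes lookups through buf_pool_get(space, offset). A standalone sketch of the presumed mapping; the fold mirrors buf_page_address_fold as used elsewhere in this patch, while the modulo step is an assumption about buf_pool_get's internals, not a quote of them:

#include <stdio.h>

static unsigned long
page_address_fold(unsigned long space, unsigned long offset)
{
	return((space << 20) + space + offset);
}

static unsigned long
pool_index(unsigned long space, unsigned long offset,
	   unsigned long n_instances)
{
	return(page_address_fold(space, offset) % n_instances);
}

int
main(void)
{
	/* Page (space 0, offset 12345) with 4 instances. */
	printf("instance %lu\n", pool_index(0, 12345, 4));
	return(0);
}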
mysql_num_index = table->s->keys; ib_num_index = UT_LIST_GET_LEN(ib_table->indexes); @@ -3367,6 +3371,13 @@ innobase_build_index_translation( MYF(MY_ALLOW_ZERO_PTR)); if (!index_mapping) { + /* Report an error if index_mapping continues to be + NULL and mysql_num_index is a non-zero value */ + sql_print_error("InnoDB: fail to allocate memory for " + "index translation table. Number of " + "Index:%lu, array size:%lu", + mysql_num_index, + share->idx_trans_tbl.array_size); ret = FALSE; goto func_exit; } @@ -3374,7 +3385,6 @@ innobase_build_index_translation( share->idx_trans_tbl.array_size = mysql_num_index; } - /* For each index in the mysql key_info array, fetch its corresponding InnoDB index pointer into index_mapping array. */ @@ -3420,6 +3430,8 @@ func_exit: share->idx_trans_tbl.index_mapping = index_mapping; + mutex_exit(&dict_sys->mutex); + DBUG_RETURN(ret); } @@ -10813,6 +10825,11 @@ static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size, "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.", NULL, NULL, 128*1024*1024L, 5*1024*1024L, LONGLONG_MAX, 1024*1024L); +static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of buffer pool instances, set to higher value on high-end machines to increase scalability", + NULL, NULL, 1L, 1L, MAX_BUFFER_POOLS, 1L); + static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency, PLUGIN_VAR_RQCMDARG, "Helps in performance tuning in heavily concurrent environments.", @@ -10948,6 +10965,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(additional_mem_pool_size), MYSQL_SYSVAR(autoextend_increment), MYSQL_SYSVAR(buffer_pool_size), + MYSQL_SYSVAR(buffer_pool_instances), MYSQL_SYSVAR(checksums), MYSQL_SYSVAR(commit_concurrency), MYSQL_SYSVAR(concurrency_tickets), diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc index 7d8b4a8dd40..ed429a9175f 100644 --- a/storage/innobase/handler/i_s.cc +++ b/storage/innobase/handler/i_s.cc @@ -1307,6 +1307,14 @@ static ST_FIELD_INFO i_s_cmpmem_fields_info[] = STRUCT_FLD(old_name, "Buddy Block Size"), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + {STRUCT_FLD(field_name, "buffer_pool_instance"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Buffer Pool Id"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + {STRUCT_FLD(field_name, "pages_used"), STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), STRUCT_FLD(field_type, MYSQL_TYPE_LONG), @@ -1356,8 +1364,8 @@ i_s_cmpmem_fill_low( COND* cond, /*!< in: condition (ignored) */ ibool reset) /*!< in: TRUE=reset cumulated counts */ { + int status = 0; TABLE* table = (TABLE *) tables->table; - int status = 0; DBUG_ENTER("i_s_cmpmem_fill_low"); @@ -1369,33 +1377,50 @@ i_s_cmpmem_fill_low( RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); - buf_pool_mutex_enter(); + for (ulint i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; - for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) { - buf_buddy_stat_t* buddy_stat = &buf_buddy_stat[x]; + status = 0; - table->field[0]->store(BUF_BUDDY_LOW << x); - table->field[1]->store(buddy_stat->used); - table->field[2]->store(UNIV_LIKELY(x < BUF_BUDDY_SIZES) - ? 
UT_LIST_GET_LEN(buf_pool->zip_free[x]) - : 0); - table->field[3]->store((longlong) buddy_stat->relocated, true); - table->field[4]->store( - (ulong) (buddy_stat->relocated_usec / 1000000)); + buf_pool = buf_pool_from_array(i); - if (reset) { - /* This is protected by buf_pool_mutex. */ - buddy_stat->relocated = 0; - buddy_stat->relocated_usec = 0; + buf_pool_mutex_enter(buf_pool); + + for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) { + buf_buddy_stat_t* buddy_stat; + + buddy_stat = &buf_pool->buddy_stat[x]; + + table->field[0]->store(BUF_BUDDY_LOW << x); + table->field[1]->store(i); + table->field[2]->store(buddy_stat->used); + table->field[3]->store(UNIV_LIKELY(x < BUF_BUDDY_SIZES) + ? UT_LIST_GET_LEN(buf_pool->zip_free[x]) + : 0); + table->field[4]->store((longlong) + buddy_stat->relocated, true); + table->field[5]->store( + (ulong) (buddy_stat->relocated_usec / 1000000)); + + if (reset) { + /* This is protected by buf_pool->mutex. */ + buddy_stat->relocated = 0; + buddy_stat->relocated_usec = 0; + } + + if (schema_table_store_record(thd, table)) { + status = 1; + break; + } } - if (schema_table_store_record(thd, table)) { - status = 1; + buf_pool_mutex_exit(buf_pool); + + if (status) { break; } } - buf_pool_mutex_exit(); DBUG_RETURN(status); } diff --git a/storage/innobase/ibuf/ibuf0ibuf.c b/storage/innobase/ibuf/ibuf0ibuf.c index d405d90fe25..0397af88ff4 100644 --- a/storage/innobase/ibuf/ibuf0ibuf.c +++ b/storage/innobase/ibuf/ibuf0ibuf.c @@ -2323,7 +2323,7 @@ ibuf_get_merge_page_nos( *n_stored = 0; - limit = ut_min(IBUF_MAX_N_PAGES_MERGED, buf_pool->curr_size / 4); + limit = ut_min(IBUF_MAX_N_PAGES_MERGED, buf_pool_get_curr_size() / 4); if (page_rec_is_supremum(rec)) { @@ -3139,9 +3139,9 @@ ibuf_set_entry_counter( ibool is_optimistic, /*!< in: is this an optimistic insert */ mtr_t* mtr) /*!< in: mtr */ { - ulint counter; dfield_t* field; byte* data; + ulint counter = 0; /* pcur points to either a user rec or to a page's infimum record. */ ut_ad(page_validate(btr_pcur_get_page(pcur), ibuf->index)); @@ -3682,10 +3682,11 @@ check_watch: { buf_page_t* bpage; ulint fold = buf_page_address_fold(space, page_no); + buf_pool_t* buf_pool = buf_pool_get(space, page_no); - buf_pool_mutex_enter(); - bpage = buf_page_hash_get_low(space, page_no, fold); - buf_pool_mutex_exit(); + buf_pool_mutex_enter(buf_pool); + bpage = buf_page_hash_get_low(buf_pool, space, page_no, fold); + buf_pool_mutex_exit(buf_pool); if (UNIV_LIKELY_NULL(bpage)) { /* A buffer pool watch has been set or the diff --git a/storage/innobase/include/buf0buddy.h b/storage/innobase/include/buf0buddy.h index 7648950d5d1..03588d18197 100644 --- a/storage/innobase/include/buf0buddy.h +++ b/storage/innobase/include/buf0buddy.h @@ -36,22 +36,24 @@ Created December 2006 by Marko Makela /**********************************************************************//** Allocate a block. The thread calling this function must hold -buf_pool_mutex and must not hold buf_pool_zip_mutex or any -block->mutex. The buf_pool_mutex may only be released and reacquired +buf_pool->mutex and must not hold buf_pool_zip_mutex or any +block->mutex. The buf_pool->mutex may only be released and reacquired if lru != NULL. This function should only be used for allocating compressed page frames or control blocks (buf_page_t). Allocated control blocks must be properly initialized immediately after buf_buddy_alloc() has returned the memory, before releasing -buf_pool_mutex. +buf_pool->mutex. 
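The buf_buddy_alloc contract above gains an explicit buf_pool argument but is otherwise unchanged: the caller holds buf_pool->mutex, may see it released and retaken only when lru != NULL, and must finish initializing the returned memory before giving the mutex up. An in-tree style sketch of a compliant caller; not compilable standalone, with buf_pool and zip_size assumed to be a valid instance and compressed page size:

ibool	lru = FALSE;
void*	frame;

buf_pool_mutex_enter(buf_pool);

frame = buf_buddy_alloc(buf_pool, zip_size, &lru);

if (frame != NULL) {
	/* Initialize the frame or control block here, before the
	mutex is released; if lru == TRUE, buf_pool->mutex was
	temporarily released inside the call. */
}

buf_pool_mutex_exit(buf_pool);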
@return allocated block, possibly NULL if lru == NULL */ UNIV_INLINE void* buf_buddy_alloc( /*============*/ + buf_pool_t* buf_pool, + /*!< buffer pool in which the block resides */ ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */ ibool* lru) /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list - and buf_pool_mutex was temporarily released, + and buf_pool->mutex was temporarily released, or NULL if the LRU list should not be used */ __attribute__((malloc)); @@ -61,28 +63,13 @@ UNIV_INLINE void buf_buddy_free( /*===========*/ + buf_pool_t* buf_pool, + /*!< buffer pool in which the block resides */ void* buf, /*!< in: block to be freed, must not be pointed to by the buffer pool */ ulint size) /*!< in: block size, up to UNIV_PAGE_SIZE */ __attribute__((nonnull)); -/** Statistics of buddy blocks of a given size. */ -struct buf_buddy_stat_struct { - /** Number of blocks allocated from the buddy system. */ - ulint used; - /** Number of blocks relocated by the buddy system. */ - ib_uint64_t relocated; - /** Total duration of block relocations, in microseconds. */ - ib_uint64_t relocated_usec; -}; - -/** Statistics of buddy blocks of a given size. */ -typedef struct buf_buddy_stat_struct buf_buddy_stat_t; - -/** Statistics of the buddy system, indexed by block size. -Protected by buf_pool_mutex. */ -extern buf_buddy_stat_t buf_buddy_stat[BUF_BUDDY_SIZES + 1]; - #ifndef UNIV_NONINL # include "buf0buddy.ic" #endif diff --git a/storage/innobase/include/buf0buddy.ic b/storage/innobase/include/buf0buddy.ic index c419a2374d9..387eacc754a 100644 --- a/storage/innobase/include/buf0buddy.ic +++ b/storage/innobase/include/buf0buddy.ic @@ -35,18 +35,20 @@ Created December 2006 by Marko Makela /**********************************************************************//** Allocate a block. The thread calling this function must hold -buf_pool_mutex and must not hold buf_pool_zip_mutex or any block->mutex. -The buf_pool_mutex may only be released and reacquired if lru != NULL. +buf_pool->mutex and must not hold buf_pool_zip_mutex or any block->mutex. +The buf_pool->mutex may only be released and reacquired if lru != NULL. @return allocated block, possibly NULL if lru==NULL */ UNIV_INTERN void* buf_buddy_alloc_low( /*================*/ + buf_pool_t* buf_pool, + /*!< in: buffer pool in which the page resides */ ulint i, /*!< in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ ibool* lru) /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list - and buf_pool_mutex was temporarily released, + and buf_pool->mutex was temporarily released, or NULL if the LRU list should not be used */ __attribute__((malloc)); @@ -56,10 +58,11 @@ UNIV_INTERN void buf_buddy_free_low( /*===============*/ - void* buf, /*!< in: block to be freed, must not be - pointed to by the buffer pool */ - ulint i) /*!< in: index of buf_pool->zip_free[], - or BUF_BUDDY_SIZES */ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + void* buf, /*!< in: block to be freed, must not be + pointed to by the buffer pool */ + ulint i) /*!< in: index of buf_pool->zip_free[], + or BUF_BUDDY_SIZES */ __attribute__((nonnull)); /**********************************************************************//** @@ -83,27 +86,32 @@ buf_buddy_get_slot( /**********************************************************************//** Allocate a block. The thread calling this function must hold -buf_pool_mutex and must not hold buf_pool_zip_mutex or any -block->mutex. 
The buf_pool_mutex may only be released and reacquired +buf_pool->mutex and must not hold buf_pool_zip_mutex or any +block->mutex. The buf_pool->mutex may only be released and reacquired if lru != NULL. This function should only be used for allocating compressed page frames or control blocks (buf_page_t). Allocated control blocks must be properly initialized immediately after buf_buddy_alloc() has returned the memory, before releasing -buf_pool_mutex. +buf_pool->mutex. @return allocated block, possibly NULL if lru == NULL */ UNIV_INLINE void* buf_buddy_alloc( /*============*/ - ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */ - ibool* lru) /*!< in: pointer to a variable that will be assigned - TRUE if storage was allocated from the LRU list - and buf_pool_mutex was temporarily released, - or NULL if the LRU list should not be used */ + buf_pool_t* buf_pool, /*!< in: buffer pool in which + the page resides */ + ulint size, /*!< in: block size, up to + UNIV_PAGE_SIZE */ + ibool* lru) /*!< in: pointer to a variable + that will be assigned TRUE if + storage was allocated from the + LRU list and buf_pool->mutex was + temporarily released, or NULL if + the LRU list should not be used */ { - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); - return(buf_buddy_alloc_low(buf_buddy_get_slot(size), lru)); + return(buf_buddy_alloc_low(buf_pool, buf_buddy_get_slot(size), lru)); } /**********************************************************************//** @@ -112,13 +120,15 @@ UNIV_INLINE void buf_buddy_free( /*===========*/ - void* buf, /*!< in: block to be freed, must not be - pointed to by the buffer pool */ - ulint size) /*!< in: block size, up to UNIV_PAGE_SIZE */ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + void* buf, /*!< in: block to be freed, must not be + pointed to by the buffer pool */ + ulint size) /*!< in: block size, up to + UNIV_PAGE_SIZE */ { - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); - buf_buddy_free_low(buf, buf_buddy_get_slot(size)); + buf_buddy_free_low(buf_pool, buf, buf_buddy_get_slot(size)); } #ifdef UNIV_MATERIALIZE diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 62e4f54559a..5326ca9c14f 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -63,7 +63,14 @@ Created 11/5/1995 Heikki Tuuri position of the block. 
*/ /* @} */ -extern buf_pool_t* buf_pool; /*!< The buffer pool of the database */ +#define MAX_BUFFER_POOLS 64 /*!< The maximum number of buffer + pools that can be defined */ + +#define BUF_POOL_WATCH_SIZE 1 /*!< Maximum number of concurrent + buffer pool watches */ + +extern buf_pool_t* buf_pool_ptr[MAX_BUFFER_POOLS]; /*!< The buffer pools + of the database */ #ifdef UNIV_DEBUG extern ibool buf_debug_prints;/*!< If this is set TRUE, the program prints info whenever read or flush
@@ -71,6 +78,8 @@ extern ibool buf_debug_prints;/*!< If this is set TRUE, the program #endif /* UNIV_DEBUG */ extern ulint srv_buf_pool_write_requests; /*!< variable to count write request issued */ +extern ulint srv_buf_pool_instances; +extern ulint srv_buf_pool_curr_size; #else /* !UNIV_HOTBACKUP */ extern buf_block_t* back_block1; /*!< first block, for --apply-log */ extern buf_block_t* back_block2; /*!< second block, for page reorganize */
@@ -109,19 +118,36 @@ enum buf_page_state { #ifndef UNIV_HOTBACKUP /********************************************************************//** +Acquire mutex on all buffer pool instances. */ +UNIV_INLINE +void +buf_pool_mutex_enter_all(void); +/*===========================*/ + +/********************************************************************//** +Release mutex on all buffer pool instances. */ +UNIV_INLINE +void +buf_pool_mutex_exit_all(void); +/*==========================*/ + +/********************************************************************//** Creates the buffer pool. @return own: buf_pool object, NULL if not enough memory or error */ UNIV_INTERN -buf_pool_t* -buf_pool_init(void); -/*===============*/ +ulint +buf_pool_init( +/*=========*/ + ulint size, /*!< in: Size of the total pool in bytes */ + ulint n_instances); /*!< in: Number of instances */ /********************************************************************//** Frees the buffer pool at shutdown. This must not be invoked before freeing all mutexes. */ UNIV_INTERN void -buf_pool_free(void); -/*===============*/ +buf_pool_free( +/*==========*/ + ulint n_instances); /*!< in: number of instances to free */
/********************************************************************//** Drops the adaptive hash index. To prevent a livelock, this function
@@ -158,23 +184,31 @@ UNIV_INLINE ulint buf_pool_get_curr_size(void); /*========================*/ +/*********************************************************************//** +Gets the current size of buffer buf_pool in pages. +@return size in pages */ +UNIV_INLINE +ulint +buf_pool_get_n_pages(void); +/*=======================*/ /********************************************************************//** Gets the smallest oldest_modification lsn for any page in the pool. Returns zero if all modified pages have been flushed to disk. @return oldest modification in pool, zero if none */ -UNIV_INLINE +UNIV_INTERN ib_uint64_t buf_pool_get_oldest_modification(void); /*==================================*/ /********************************************************************//** Allocates a buffer block.
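buf_pool_init() now takes the total size and an instance count instead of reading globals. Presumably each instance manages an equal share of the total; a small sketch under that assumption (PAGE_SIZE and the even split are illustrative, not taken from the patch):

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE	16384UL	/* stand-in for UNIV_PAGE_SIZE */
#define MAX_POOLS	64UL	/* mirrors MAX_BUFFER_POOLS above */

/* Pages per instance when the total size is divided evenly; the even
division is an assumption made for illustration only. */
static unsigned long
pages_per_instance(unsigned long total_bytes, unsigned long n_instances)
{
	assert(n_instances >= 1 && n_instances <= MAX_POOLS);

	return((total_bytes / n_instances) / PAGE_SIZE);
}

int
main(void)
{
	/* A 1 GiB pool split over 4 instances: 16384 pages each. */
	printf("%lu\n", pages_per_instance(1UL << 30, 4));
	return(0);
}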
@return own: the allocated block, in state BUF_BLOCK_MEMORY */ -UNIV_INLINE +UNIV_INTERN buf_block_t* buf_block_alloc( /*============*/ - ulint zip_size); /*!< in: compressed page size in bytes, - or 0 if uncompressed tablespace */ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint zip_size); /*!< in: compressed page size in bytes, + or 0 if uncompressed tablespace */ /********************************************************************//** Frees a buffer block which does not contain a file page. */ UNIV_INLINE @@ -454,7 +488,7 @@ buf_page_get_newest_modification( page frame */ /********************************************************************//** Increments the modify clock of a frame by 1. The caller must (1) own the -buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock +buf_pool->mutex and block bufferfix count has to be zero, (2) or own an x-lock on the block. */ UNIV_INLINE void @@ -536,7 +570,8 @@ UNIV_INTERN buf_block_t* buf_pool_contains_zip( /*==================*/ - const void* data); /*!< in: pointer to compressed page */ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + const void* data); /*!< in: pointer to compressed page */ #endif /* UNIV_DEBUG */ #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /*********************************************************************//** @@ -610,8 +645,15 @@ buf_get_modified_ratio_pct(void); Refreshes the statistics used to print per-second averages. */ UNIV_INTERN void -buf_refresh_io_stats(void); -/*======================*/ +buf_refresh_io_stats( +/*=================*/ + buf_pool_t* buf_pool); /*!< buffer pool instance */ +/**********************************************************************//** +Refreshes the statistics used to print per-second averages. */ +UNIV_INTERN +void +buf_refresh_io_stats_all(void); +/*=================*/ /*********************************************************************//** Asserts that all file pages in the buffer are in a replaceable state. @return TRUE */ @@ -992,15 +1034,51 @@ buf_page_address_fold( ulint offset) /*!< in: offset of the page within space */ __attribute__((const)); /******************************************************************//** +Returns the buffer pool instance given a page instance +@return buf_pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_bpage( +/*================*/ + const buf_page_t* bpage); /*!< in: buffer pool page */ +/******************************************************************//** +Returns the buffer pool instance given a block instance +@return buf_pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_block( +/*================*/ + const buf_block_t* block); /*!< in: block */ +/******************************************************************//** +Returns the buffer pool instance given space and offset of page +@return buffer pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_get( +/*==========*/ + ulint space, /*!< in: space id */ + ulint offset);/*!< in: offset of the page within space */ +/******************************************************************//** +Returns the buffer pool instance given its array index +@return buffer pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_array( +/*====================*/ + ulint index); /*!< in: array index to get buffer pool instance from */ +/******************************************************************//** Returns the control block of a file page, NULL if not found. 
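The buf_pool_from_bpage()/buf_pool_from_block() accessors declared here work because every page descriptor carries a pointer to its owning instance (the buf_pool field added to buf_page_struct further down). A self-contained model of that back-pointer scheme, with stand-in types:

#include <assert.h>
#include <stddef.h>

typedef struct pool_struct	pool_t;

struct pool_struct {
	int	id;
};

typedef struct {
	pool_t*	pool;	/* owning instance, set when the page joins a pool */
} page_t;

typedef struct {
	page_t	page;	/* a block embeds its page descriptor */
} block_t;

static pool_t*
pool_from_page(const page_t* page)
{
	assert(page->pool != NULL);	/* every page belongs to a pool */
	return(page->pool);
}

static pool_t*
pool_from_block(const block_t* block)
{
	return(pool_from_page(&block->page));
}

int
main(void)
{
	pool_t	p;
	block_t	b;

	p.id = 0;
	b.page.pool = &p;

	assert(pool_from_block(&b) == &p);
	return(0);
}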
@return block, NULL if not found */ UNIV_INLINE buf_page_t* buf_page_hash_get_low( /*==================*/ - ulint space, /*!< in: space id */ - ulint offset, /*!< in: offset of the page within space */ - ulint fold); /*!< in: buf_page_address_fold(space, offset) */ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: offset of the page + within space */ + ulint fold); /*!< in: buf_page_address_fold( + space, offset) */ /******************************************************************//** Returns the control block of a file page, NULL if not found. @return block, NULL if not found or not a real control block */ @@ -1008,8 +1086,10 @@ UNIV_INLINE buf_page_t* buf_page_hash_get( /*==============*/ - ulint space, /*!< in: space id */ - ulint offset);/*!< in: offset of the page within space */ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset); /*!< in: offset of the page + within space */ /******************************************************************//** Returns the control block of a file page, NULL if not found or an uncompressed page frame does not exist. @@ -1018,8 +1098,10 @@ UNIV_INLINE buf_block_t* buf_block_hash_get( /*===============*/ - ulint space, /*!< in: space id */ - ulint offset);/*!< in: offset of the page within space */ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset); /*!< in: offset of the page + within space */ /*********************************************************************//** Gets the current length of the free list of buffer blocks. @return length of the free list */ @@ -1033,9 +1115,10 @@ Determine if a block is a sentinel for a buffer pool watch. @return TRUE if a sentinel for a buffer pool watch, FALSE if not */ UNIV_INTERN ibool -buf_pool_watch_is( -/*==============*/ - const buf_page_t* bpage) /*!< in: block */ +buf_pool_watch_is_sentinel( +/*=======================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + const buf_page_t* bpage) /*!< in: block */ __attribute__((nonnull, warn_unused_result)); /****************************************************************//** Add watch for the given page to be read in. Caller must have the buffer pool @@ -1069,6 +1152,23 @@ buf_pool_watch_occurred( ulint space, /*!< in: space id */ ulint offset) /*!< in: page number */ __attribute__((warn_unused_result)); +/********************************************************************//** +Get total buffer pool statistics. */ +UNIV_INTERN +void +buf_get_total_list_len( +/*===================*/ + ulint* LRU_len, /*!< out: length of all LRU lists */ + ulint* free_len, /*!< out: length of all free lists */ + ulint* flush_list_len);/*!< out: length of all flush lists */ +/********************************************************************//** +Get total buffer pool statistics. */ +UNIV_INTERN +void +buf_get_total_stat( +/*===============*/ + buf_pool_stat_t*tot_stat); /*!< out: buffer pool stats */ + #endif /* !UNIV_HOTBACKUP */ /** The common buffer control block structure @@ -1078,18 +1178,18 @@ struct buf_page_struct{ /** @name General fields None of these bit-fields must be modified without holding buf_page_get_mutex() [buf_block_struct::mutex or - buf_pool_zip_mutex], since they can be stored in the same + buf_pool->zip_mutex], since they can be stored in the same machine word. Some of these fields are additionally protected - by buf_pool_mutex. */ + by buf_pool->mutex. 
*/ /* @{ */ unsigned space:32; /*!< tablespace id; also protected - by buf_pool_mutex. */ + by buf_pool->mutex. */ unsigned offset:32; /*!< page number; also protected - by buf_pool_mutex. */ + by buf_pool->mutex. */ unsigned state:3; /*!< state of the control block; also - protected by buf_pool_mutex. + protected by buf_pool->mutex. State transitions from BUF_BLOCK_READY_FOR_USE to BUF_BLOCK_MEMORY need not be @@ -1101,7 +1201,7 @@ struct buf_page_struct{ flush_type. @see enum buf_flush */ unsigned io_fix:2; /*!< type of pending I/O operation; - also protected by buf_pool_mutex + also protected by buf_pool->mutex @see enum buf_io_fix */ unsigned buf_fix_count:25;/*!< count of how manyfold this block is currently bufferfixed */ @@ -1190,8 +1290,8 @@ struct buf_page_struct{ any one of the two mutexes */ /* @} */ /** @name LRU replacement algorithm fields - These fields are protected by buf_pool_mutex only (not - buf_pool_zip_mutex or buf_block_struct::mutex). */ + These fields are protected by buf_pool->mutex only (not + buf_pool->zip_mutex or buf_block_struct::mutex). */ /* @{ */ UT_LIST_NODE_T(buf_page_t) LRU; @@ -1221,6 +1321,8 @@ struct buf_page_struct{ frees a page in buffer pool */ # endif /* UNIV_DEBUG_FILE_ACCESSES */ #endif /* !UNIV_HOTBACKUP */ + buf_pool_t* buf_pool; /*!< buffer pool instance this + page belongs to */ }; /** The buffer control block structure */ @@ -1260,7 +1362,7 @@ struct buf_block_struct{ unsigned lock_hash_val:32;/*!< hashed value of the page address in the record lock hash table; protected by buf_block_t::lock - (or buf_block_t::mutex, buf_pool_mutex + (or buf_block_t::mutex, buf_pool->mutex in buf_page_get_gen(), buf_page_init_for_read() and buf_page_create()) */ @@ -1389,6 +1491,16 @@ struct buf_pool_stat_struct{ buf_page_peek_if_too_old() */ }; +/** Statistics of buddy blocks of a given size. */ +struct buf_buddy_stat_struct { + /** Number of blocks allocated from the buddy system. */ + ulint used; + /** Number of blocks relocated by the buddy system. */ + ib_uint64_t relocated; + /** Total duration of block relocations, in microseconds. */ + ib_uint64_t relocated_usec; +}; + /** @brief The buffer pool structure. NOTE! 
The definition appears here only for other modules of this @@ -1398,7 +1510,25 @@ struct buf_pool_struct{ /** @name General fields */ /* @{ */ - + mutex_t mutex; /*!< Buffer pool mutex of this + instance */ + mutex_t zip_mutex; /*!< Zip mutex of this buffer + pool instance, protects compressed + only pages (of type buf_page_t, not + buf_block_t */ + ulint instance_no; /*!< Array index of this buffer + pool instance */ + ulint old_pool_size; /*!< Old pool size in bytes */ + ulint curr_pool_size; /*!< Current pool size in bytes */ + ulint LRU_old_ratio; /*!< Reserve this much of the buffer + pool for "old" blocks */ +#ifdef UNIV_DEBUG + ulint buddy_n_frames; /*!< Number of frames allocated from + the buffer pool to the buddy system */ +#endif +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ulint mutex_exit_forbidden; /*!< Forbid release mutex */ +#endif ulint n_chunks; /*!< number of buffer pool chunks */ buf_chunk_t* chunks; /*!< buffer pool chunks */ ulint curr_size; /*!< current pool size in pages */ @@ -1410,12 +1540,16 @@ struct buf_pool_struct{ whose frames are allocated to the zip buddy system, indexed by block->frame */ - ulint n_pend_reads; /*!< number of pending read operations */ + ulint n_pend_reads; /*!< number of pending read + operations */ ulint n_pend_unzip; /*!< number of pending decompressions */ time_t last_printout_time; /*!< when buf_print_io was last time called */ + buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES + 1]; + /*!< Statistics of buddy system, + indexed by block size */ buf_pool_stat_t stat; /*!< current statistics */ buf_pool_stat_t old_stat; /*!< old statistics */ @@ -1432,14 +1566,6 @@ struct buf_pool_struct{ the bpage is on flush_list. It also protects writes to bpage::oldest_modification */ - mutex_t flush_order_mutex;/*!< mutex to serialize access to - the flush list when we are putting - dirty blocks in the list. The idea - behind this mutex is to be able - to release log_sys->mutex during - mtr_commit and still ensure that - insertions in the flush_list happen - in the LSN order. */ UT_LIST_BASE_NODE_T(buf_page_t) flush_list; /*!< base node of the modified block list */ @@ -1519,6 +1645,12 @@ struct buf_pool_struct{ /*!< unmodified compressed pages */ UT_LIST_BASE_NODE_T(buf_page_t) zip_free[BUF_BUDDY_SIZES]; /*!< buddy free lists */ + + buf_page_t watch[BUF_POOL_WATCH_SIZE]; + /*!< Sentinel records for buffer + pool watches. Protected by + buf_pool->mutex. */ + #if BUF_BUDDY_HIGH != UNIV_PAGE_SIZE # error "BUF_BUDDY_HIGH != UNIV_PAGE_SIZE" #endif @@ -1540,65 +1672,51 @@ Use these instead of accessing buf_pool_mutex directly. */ /* @{ */ /** Test if buf_pool_mutex is owned. */ -#define buf_pool_mutex_own() mutex_own(&buf_pool_mutex) +#define buf_pool_mutex_own(b) mutex_own(&b->mutex) /** Acquire the buffer pool mutex. */ -#define buf_pool_mutex_enter() do { \ - ut_ad(!mutex_own(&buf_pool_zip_mutex)); \ - mutex_enter(&buf_pool_mutex); \ +#define buf_pool_mutex_enter(b) do { \ + ut_ad(!mutex_own(&b->zip_mutex)); \ + mutex_enter(&b->mutex); \ } while (0) /** Test if flush list mutex is owned. */ -#define buf_flush_list_mutex_own() mutex_own(&buf_pool->flush_list_mutex) +#define buf_flush_list_mutex_own(b) mutex_own(&b->flush_list_mutex) /** Acquire the flush list mutex. */ -#define buf_flush_list_mutex_enter() do { \ - mutex_enter(&buf_pool->flush_list_mutex); \ +#define buf_flush_list_mutex_enter(b) do { \ + mutex_enter(&b->flush_list_mutex); \ } while (0) /** Release the flush list mutex. 
*/ -# define buf_flush_list_mutex_exit() do { \ - mutex_exit(&buf_pool->flush_list_mutex); \ +# define buf_flush_list_mutex_exit(b) do { \ + mutex_exit(&b->flush_list_mutex); \ } while (0) -/** Test if flush order mutex is owned. */ -#define buf_flush_order_mutex_own() mutex_own(&buf_pool->flush_order_mutex) - -/** Acquire the flush order mutex. */ -#define buf_flush_order_mutex_enter() do { \ - mutex_enter(&buf_pool->flush_order_mutex); \ -} while (0) -/** Release the flush order mutex. */ -# define buf_flush_order_mutex_exit() do { \ - mutex_exit(&buf_pool->flush_order_mutex); \ -} while (0) #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG -/** Flag to forbid the release of the buffer pool mutex. -Protected by buf_pool_mutex. */ -extern ulint buf_pool_mutex_exit_forbidden; /** Forbid the release of the buffer pool mutex. */ -# define buf_pool_mutex_exit_forbid() do { \ - ut_ad(buf_pool_mutex_own()); \ - buf_pool_mutex_exit_forbidden++; \ +# define buf_pool_mutex_exit_forbid(b) do { \ + ut_ad(buf_pool_mutex_own(b)); \ + b->mutex_exit_forbidden++; \ } while (0) /** Allow the release of the buffer pool mutex. */ -# define buf_pool_mutex_exit_allow() do { \ - ut_ad(buf_pool_mutex_own()); \ - ut_a(buf_pool_mutex_exit_forbidden); \ - buf_pool_mutex_exit_forbidden--; \ +# define buf_pool_mutex_exit_allow(b) do { \ + ut_ad(buf_pool_mutex_own(b)); \ + ut_a(b->mutex_exit_forbidden); \ + b->mutex_exit_forbidden--; \ } while (0) /** Release the buffer pool mutex. */ -# define buf_pool_mutex_exit() do { \ - ut_a(!buf_pool_mutex_exit_forbidden); \ - mutex_exit(&buf_pool_mutex); \ +# define buf_pool_mutex_exit(b) do { \ + ut_a(!b->mutex_exit_forbidden); \ + mutex_exit(&b->mutex); \ } while (0) #else /** Forbid the release of the buffer pool mutex. */ -# define buf_pool_mutex_exit_forbid() ((void) 0) +# define buf_pool_mutex_exit_forbid(b) ((void) 0) /** Allow the release of the buffer pool mutex. */ -# define buf_pool_mutex_exit_allow() ((void) 0) +# define buf_pool_mutex_exit_allow(b) ((void) 0) /** Release the buffer pool mutex. */ -# define buf_pool_mutex_exit() mutex_exit(&buf_pool_mutex) +# define buf_pool_mutex_exit(b) mutex_exit(&b->mutex) #endif #endif /* !UNIV_HOTBACKUP */ /* @} */ diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic index b9a9662fdc5..c30be5b2635 100644 --- a/storage/innobase/include/buf0buf.ic +++ b/storage/innobase/include/buf0buf.ic @@ -31,11 +31,32 @@ Created 11/5/1995 Heikki Tuuri *******************************************************/ #include "mtr0mtr.h" -#ifndef UNIV_HOTBACKUP #include "buf0flu.h" #include "buf0lru.h" #include "buf0rea.h" +/*********************************************************************//** +Gets the current size of buffer buf_pool in bytes. +@return size in bytes */ +UNIV_INLINE +ulint +buf_pool_get_curr_size(void) +/*========================*/ +{ + return(srv_buf_pool_curr_size); +} + +/*********************************************************************//** +Gets the current size of buffer buf_pool in pages. +@return size in pages*/ +UNIV_INLINE +ulint +buf_pool_get_n_pages(void) +/*======================*/ +{ + return(buf_pool_get_curr_size() / UNIV_PAGE_SIZE); +} + /********************************************************************//** Reads the freed_page_clock of a buffer block. @return freed_page_clock */ @@ -45,7 +66,7 @@ buf_page_get_freed_page_clock( /*==========================*/ const buf_page_t* bpage) /*!< in: block */ { - /* This is sometimes read without holding buf_pool_mutex. 
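All of these macros now take the instance as an argument, and the old global buf_pool_mutex_exit_forbidden counter becomes the per-instance field mutex_exit_forbidden. A standalone model of the forbid/allow discipline, with a pthread mutex standing in for InnoDB's and illustrative names throughout:

#include <assert.h>
#include <pthread.h>

typedef struct {
	pthread_mutex_t	mutex;
	unsigned	exit_forbidden;	/* modified while holding mutex */
} pool_t;

#define pool_mutex_enter(b)	pthread_mutex_lock(&(b)->mutex)

/* While the counter is non-zero, releasing the mutex is a bug. */
#define pool_mutex_exit_forbid(b)	do {	\
	(b)->exit_forbidden++;			\
} while (0)

#define pool_mutex_exit_allow(b)	do {	\
	assert((b)->exit_forbidden > 0);	\
	(b)->exit_forbidden--;			\
} while (0)

#define pool_mutex_exit(b)	do {		\
	assert(!(b)->exit_forbidden);		\
	pthread_mutex_unlock(&(b)->mutex);	\
} while (0)

int
main(void)
{
	pool_t	p = { PTHREAD_MUTEX_INITIALIZER, 0 };

	pool_mutex_enter(&p);
	pool_mutex_exit_forbid(&p);	/* e.g. while relocating a block */
	pool_mutex_exit_allow(&p);
	pool_mutex_exit(&p);		/* release is legal again */
	return(0);
}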
*/ + /* This is sometimes read without holding buf_pool->mutex. */ return(bpage->freed_page_clock); } @@ -72,6 +93,8 @@ buf_page_peek_if_too_old( /*=====================*/ const buf_page_t* bpage) /*!< in: block to make younger */ { + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + if (UNIV_UNLIKELY(buf_pool->freed_page_clock == 0)) { /* If eviction has not started yet, do not update the statistics or move blocks in the LRU list. This is @@ -93,55 +116,12 @@ buf_page_peek_if_too_old( return((buf_pool->freed_page_clock & ((1UL << 31) - 1)) > ((ulint) bpage->freed_page_clock + (buf_pool->curr_size - * (BUF_LRU_OLD_RATIO_DIV - buf_LRU_old_ratio) + * (BUF_LRU_OLD_RATIO_DIV - buf_pool->LRU_old_ratio) / (BUF_LRU_OLD_RATIO_DIV * 4)))); } } /*********************************************************************//** -Gets the current size of buffer buf_pool in bytes. -@return size in bytes */ -UNIV_INLINE -ulint -buf_pool_get_curr_size(void) -/*========================*/ -{ - return(buf_pool->curr_size * UNIV_PAGE_SIZE); -} - -/********************************************************************//** -Gets the smallest oldest_modification lsn for any page in the pool. Returns -zero if all modified pages have been flushed to disk. -@return oldest modification in pool, zero if none */ -UNIV_INLINE -ib_uint64_t -buf_pool_get_oldest_modification(void) -/*==================================*/ -{ - buf_page_t* bpage; - ib_uint64_t lsn; - - buf_flush_list_mutex_enter(); - - bpage = UT_LIST_GET_LAST(buf_pool->flush_list); - - if (bpage == NULL) { - lsn = 0; - } else { - ut_ad(bpage->in_flush_list); - lsn = bpage->oldest_modification; - } - - buf_flush_list_mutex_exit(); - - /* The returned answer may be out of date: the flush_list can - change after the mutex has been released. */ - - return(lsn); -} -#endif /* !UNIV_HOTBACKUP */ - -/*********************************************************************//** Gets the state of a block. @return state */ UNIV_INLINE @@ -293,13 +273,15 @@ buf_page_get_mutex( /*===============*/ const buf_page_t* bpage) /*!< in: pointer to control block */ { + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + switch (buf_page_get_state(bpage)) { case BUF_BLOCK_ZIP_FREE: ut_error; return(NULL); case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_DIRTY: - return(&buf_pool_zip_mutex); + return(&buf_pool->zip_mutex); default: return(&((buf_block_t*) bpage)->mutex); } @@ -385,7 +367,7 @@ Gets the io_fix state of a block. 
UNIV_INLINE enum buf_io_fix buf_block_get_io_fix( -/*================*/ +/*=================*/ const buf_block_t* block) /*!< in: pointer to the control block */ { return(buf_page_get_io_fix(&block->page)); @@ -400,7 +382,10 @@ buf_page_set_io_fix( buf_page_t* bpage, /*!< in/out: control block */ enum buf_io_fix io_fix) /*!< in: io_fix state */ { - ut_ad(buf_pool_mutex_own()); +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_pool_mutex_own(buf_pool)); +#endif ut_ad(mutex_own(buf_page_get_mutex(bpage))); bpage->io_fix = io_fix; @@ -428,7 +413,10 @@ buf_page_can_relocate( /*==================*/ const buf_page_t* bpage) /*!< control block being relocated */ { - ut_ad(buf_pool_mutex_own()); +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_pool_mutex_own(buf_pool)); +#endif ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(buf_page_in_file(bpage)); ut_ad(bpage->in_LRU_list); @@ -446,8 +434,11 @@ buf_page_is_old( /*============*/ const buf_page_t* bpage) /*!< in: control block */ { +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_pool_mutex_own(buf_pool)); +#endif ut_ad(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own()); return(bpage->old); } @@ -461,8 +452,11 @@ buf_page_set_old( buf_page_t* bpage, /*!< in/out: control block */ ibool old) /*!< in: old */ { +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); +#endif /* UNIV_DEBUG */ ut_a(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(bpage->in_LRU_list); #ifdef UNIV_LRU_DEBUG @@ -508,8 +502,11 @@ buf_page_set_accessed( buf_page_t* bpage, /*!< in/out: control block */ ulint time_ms) /*!< in: ut_time_ms() */ { +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_pool_mutex_own(buf_pool)); +#endif ut_a(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own()); if (!bpage->access_time) { /* Make this the time of the first access. */ @@ -715,25 +712,6 @@ buf_block_get_lock_hash_val( } /********************************************************************//** -Allocates a buffer block. -@return own: the allocated block, in state BUF_BLOCK_MEMORY */ -UNIV_INLINE -buf_block_t* -buf_block_alloc( -/*============*/ - ulint zip_size) /*!< in: compressed page size in bytes, - or 0 if uncompressed tablespace */ -{ - buf_block_t* block; - - block = buf_LRU_get_free_block(zip_size); - - buf_block_set_state(block, BUF_BLOCK_MEMORY); - - return(block); -} - -/********************************************************************//** Frees a buffer block which does not contain a file page. 
*/ UNIV_INLINE void @@ -741,7 +719,9 @@ buf_block_free( /*===========*/ buf_block_t* block) /*!< in, own: block to be freed */ { - buf_pool_mutex_enter(); + buf_pool_t* buf_pool = buf_pool_from_bpage((buf_page_t*)block); + + buf_pool_mutex_enter(buf_pool); mutex_enter(&block->mutex); @@ -751,7 +731,7 @@ buf_block_free( mutex_exit(&block->mutex); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); } #endif /* !UNIV_HOTBACKUP */ @@ -825,7 +805,9 @@ buf_block_modify_clock_inc( buf_block_t* block) /*!< in: block */ { #ifdef UNIV_SYNC_DEBUG - ut_ad((buf_pool_mutex_own() + buf_pool_t* buf_pool = buf_pool_from_bpage((buf_page_t*)block); + + ut_ad((buf_pool_mutex_own(buf_pool) && (block->page.buf_fix_count == 0)) || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); #endif /* UNIV_SYNC_DEBUG */ @@ -904,20 +886,83 @@ buf_block_buf_fix_dec( } /******************************************************************//** +Returns the buffer pool instance given a page instance +@return buf_pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_bpage( +/*================*/ + const buf_page_t* bpage) /*!< in: buffer pool page */ +{ + /* Every page must be in some buffer pool. */ + ut_ad(bpage->buf_pool != NULL); + + return(bpage->buf_pool); +} + +/******************************************************************//** +Returns the buffer pool instance given a block instance +@return buf_pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_block( +/*================*/ + const buf_block_t* block) /*!< in: block */ +{ + return(buf_pool_from_bpage(&block->page)); +} + +/******************************************************************//** +Returns the buffer pool instance given space and offset of page +@return buffer pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_get( +/*==========*/ + ulint space, /*!< in: space id */ + ulint offset) /*!< in: offset of the page within space */ +{ + ulint fold; + ulint index; + ulint ignored_offset; + + ignored_offset = offset >> 6; /* 2log of BUF_READ_AHEAD_AREA (64)*/ + fold = buf_page_address_fold(space, ignored_offset); + index = fold % srv_buf_pool_instances; + return buf_pool_ptr[index]; +} + +/******************************************************************//** +Returns the buffer pool instance given its array index +@return buffer pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_array( +/*================*/ + ulint index) /*!< in: array index to get + buffer pool instance from */ +{ + return buf_pool_ptr[index]; +} + +/******************************************************************//** Returns the control block of a file page, NULL if not found. 
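buf_pool_get() above picks the instance from (space, offset): the low six bits of the page number are discarded first, so all 64 pages of one read-ahead area hash to the same instance. The arithmetic in isolation (the fold function below only mimics buf_page_address_fold() in spirit; the real hash differs):

#include <stdio.h>

/* Stand-in for buf_page_address_fold(); only the shape matters here. */
static unsigned long
fold(unsigned long space, unsigned long offset)
{
	return(space * 2654435761UL + offset);
}

static unsigned long
pool_index(unsigned long space, unsigned long offset,
	   unsigned long n_instances)
{
	/* Drop the low 6 bits: 2log of the 64-page read-ahead area. */
	unsigned long	ignored_offset = offset >> 6;

	return(fold(space, ignored_offset) % n_instances);
}

int
main(void)
{
	/* Pages 0 and 63 of the same area map to the same instance... */
	printf("%lu %lu ", pool_index(5, 0, 8), pool_index(5, 63, 8));
	/* ...while page 64 may map to a different one. */
	printf("%lu\n", pool_index(5, 64, 8));
	return(0);
}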
@return block, NULL if not found */ UNIV_INLINE buf_page_t* buf_page_hash_get_low( /*==================*/ - ulint space, /*!< in: space id */ - ulint offset, /*!< in: offset of the page within space */ - ulint fold) /*!< in: buf_page_address_fold(space, offset) */ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: offset of the page + within space */ + ulint fold) /*!< in: buf_page_address_fold( + space, offset) */ { buf_page_t* bpage; ut_ad(buf_pool); - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(fold == buf_page_address_fold(space, offset)); /* Look for the page in the hash table */ @@ -943,13 +988,17 @@ UNIV_INLINE buf_page_t* buf_page_hash_get( /*==============*/ - ulint space, /*!< in: space id */ - ulint offset) /*!< in: offset of the page within space */ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset) /*!< in: offset of the page + within space */ { + buf_page_t* bpage; ulint fold = buf_page_address_fold(space, offset); - buf_page_t* bpage = buf_page_hash_get_low(space, offset, fold); - if (bpage && UNIV_UNLIKELY(buf_pool_watch_is(bpage))) { + bpage = buf_page_hash_get_low(buf_pool, space, offset, fold); + + if (bpage && buf_pool_watch_is_sentinel(buf_pool, bpage)) { bpage = NULL; } @@ -964,12 +1013,14 @@ UNIV_INLINE buf_block_t* buf_block_hash_get( /*===============*/ - ulint space, /*!< in: space id */ - ulint offset) /*!< in: offset of the page within space */ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset) /*!< in: offset of the page + within space */ { buf_block_t* block; - block = buf_page_get_block(buf_page_hash_get(space, offset)); + block = buf_page_get_block(buf_page_hash_get(buf_pool, space, offset)); return(block); } @@ -989,12 +1040,13 @@ buf_page_peek( ulint offset) /*!< in: page number */ { const buf_page_t* bpage; + buf_pool_t* buf_pool = buf_pool_get(space, offset); - buf_pool_mutex_enter(); + buf_pool_mutex_enter(buf_pool); - bpage = buf_page_hash_get(space, offset); + bpage = buf_page_hash_get(buf_pool, space, offset); - buf_pool_mutex_exit(); + buf_pool_mutex_exit(buf_pool); return(bpage != NULL); } @@ -1008,6 +1060,7 @@ buf_page_release_zip( buf_page_t* bpage) /*!< in: buffer block */ { buf_block_t* block; + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); ut_ad(bpage); ut_a(bpage->buf_fix_count > 0); @@ -1015,9 +1068,9 @@ buf_page_release_zip( switch (buf_page_get_state(bpage)) { case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_DIRTY: - mutex_enter(&buf_pool_zip_mutex); + mutex_enter(&buf_pool->zip_mutex); bpage->buf_fix_count--; - mutex_exit(&buf_pool_zip_mutex); + mutex_exit(&buf_pool->zip_mutex); return; case BUF_BLOCK_FILE_PAGE: block = (buf_block_t*) bpage; @@ -1036,6 +1089,7 @@ buf_page_release_zip( break; } + ut_error; } @@ -1087,4 +1141,37 @@ buf_block_dbg_add_level( sync_thread_add_level(&block->lock, level); } #endif /* UNIV_SYNC_DEBUG */ +/********************************************************************//** +Acquire mutex on all buffer pool instances. */ +UNIV_INLINE +void +buf_pool_mutex_enter_all(void) +/*==========================*/ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + buf_pool_mutex_enter(buf_pool); + } +} + +/********************************************************************//** +Release mutex on all buffer pool instances. 
*/ +UNIV_INLINE +void +buf_pool_mutex_exit_all(void) +/*=========================*/ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + buf_pool_mutex_exit(buf_pool); + } +} #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index c76fcace46e..55814b6bf86 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h
@@ -31,6 +31,7 @@ Created 11/5/1995 Heikki Tuuri #ifndef UNIV_HOTBACKUP #include "mtr0types.h" #include "buf0types.h" +#include "log0log.h" /********************************************************************//** Remove a block from the flush list of modified blocks. */
@@ -58,11 +59,19 @@ buf_flush_write_complete( buf_page_t* bpage); /*!< in: pointer to the block in question */ /*********************************************************************//** Flushes pages from the end of the LRU list if there is too small -a margin of replaceable pages there. */ +a margin of replaceable pages there. If buf_pool is NULL, the free +margin is flushed on all buffer pool instances. */ UNIV_INTERN void -buf_flush_free_margin(void); -/*=======================*/ +buf_flush_free_margin( +/*==================*/ + buf_pool_t* buf_pool); +/*********************************************************************//** +Flushes pages from the end of all the LRU lists. */ +UNIV_INTERN +void +buf_flush_free_margins(void); +/*=========================*/ #endif /* !UNIV_HOTBACKUP */ /********************************************************************//** Initializes a page for writing to the tablespace. */
@@ -76,21 +85,30 @@ buf_flush_init_for_writing( to the page */ #ifndef UNIV_HOTBACKUP /*******************************************************************//** -This utility flushes dirty blocks from the end of the LRU list or flush_list. -NOTE 1: in the case of an LRU flush the calling thread may own latches to -pages: to avoid deadlocks, this function must be written so that it cannot -end up waiting for these latches! NOTE 2: in the case of a flush list flush, -the calling thread is not allowed to own any latches on pages! +This utility flushes dirty blocks from the end of the LRU list. +NOTE: The calling thread may own latches to pages: to avoid deadlocks, +this function must be written so that it cannot end up waiting for these +latches! +@return number of blocks for which the write request was queued; +ULINT_UNDEFINED if there was a flush of the same type already running */ +UNIV_INTERN +ulint +buf_flush_LRU( +/*==========*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint min_n); /*!< in: wished minimum number of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ +/*******************************************************************//** +This utility flushes dirty blocks from the end of the flush_list of +all buffer pool instances. +NOTE: The calling thread is not allowed to own any latches on pages!
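buf_pool_mutex_enter_all() and buf_pool_mutex_exit_all(), shown above, always walk the instance array in ascending order; since every caller acquires in the same order, two concurrent enter-all callers cannot deadlock against each other. A standalone model of that ordering discipline (names are stand-ins):

#include <pthread.h>

#define N_POOLS	4	/* stands in for srv_buf_pool_instances */

static pthread_mutex_t	pool_mutex[N_POOLS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

static void
pool_mutex_enter_all(void)
{
	int	i;

	/* Fixed ascending order: no lock-order cycles between callers. */
	for (i = 0; i < N_POOLS; i++) {
		pthread_mutex_lock(&pool_mutex[i]);
	}
}

static void
pool_mutex_exit_all(void)
{
	int	i;

	for (i = 0; i < N_POOLS; i++) {
		pthread_mutex_unlock(&pool_mutex[i]);
	}
}

int
main(void)
{
	pool_mutex_enter_all();
	/* ...inspect state spanning all instances, as the adaptive hash
	validation in btr_search_validate() does... */
	pool_mutex_exit_all();
	return(0);
}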
@return number of blocks for which the write request was queued; ULINT_UNDEFINED if there was a flush of the same type already running */ UNIV_INTERN ulint -buf_flush_batch( +buf_flush_list( /*============*/ - enum buf_flush flush_type, /*!< in: BUF_FLUSH_LRU or - BUF_FLUSH_LIST; if BUF_FLUSH_LIST, - then the caller must not own any - latches on pages */ ulint min_n, /*!< in: wished minimum number of blocks flushed (it is not guaranteed that the actual number is that big, though) */
@@ -105,7 +123,9 @@ UNIV_INTERN void buf_flush_wait_batch_end( /*=====================*/ - enum buf_flush type); /*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + enum buf_flush type); /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ /********************************************************************//** This function should be called at a mini-transaction commit, if a page was modified in it. Puts the block to the list of modified blocks, if it is not
@@ -181,8 +201,9 @@ Validates the flush list. @return TRUE if ok */ UNIV_INTERN ibool -buf_flush_validate(void); -/*====================*/ +buf_flush_validate( +/*===============*/ + buf_pool_t* buf_pool); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ /********************************************************************//**
@@ -205,9 +226,10 @@ buf_flush_free_flush_rbt(void); available to replacement in the free list and at the end of the LRU list (to make sure that a read-ahead batch can be read efficiently in a single sweep). */ -#define BUF_FLUSH_FREE_BLOCK_MARGIN (5 + BUF_READ_AHEAD_AREA) +#define BUF_FLUSH_FREE_BLOCK_MARGIN(b) (5 + BUF_READ_AHEAD_AREA(b)) /** Extra margin to apply above BUF_FLUSH_FREE_BLOCK_MARGIN */ -#define BUF_FLUSH_EXTRA_MARGIN (BUF_FLUSH_FREE_BLOCK_MARGIN / 4 + 100) +#define BUF_FLUSH_EXTRA_MARGIN(b) (BUF_FLUSH_FREE_BLOCK_MARGIN(b) / 4 \ + + 100) #endif /* !UNIV_HOTBACKUP */ #ifndef UNIV_NONINL
diff --git a/storage/innobase/include/buf0flu.ic b/storage/innobase/include/buf0flu.ic index fb71932e453..30e2cc8efe8 100644 --- a/storage/innobase/include/buf0flu.ic +++ b/storage/innobase/include/buf0flu.ic
@@ -33,8 +33,9 @@ UNIV_INTERN void buf_flush_insert_into_flush_list( /*=============================*/ - buf_block_t* block, /*!< in/out: block which is modified */ - ib_uint64_t lsn); /*!< in: oldest modification */ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_block_t* block, /*!< in/out: block which is modified */ + ib_uint64_t lsn); /*!< in: oldest modification */ /********************************************************************//** Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not @@ -43,8 +44,9 @@ UNIV_INTERN void buf_flush_insert_sorted_into_flush_list( /*====================================*/ - buf_block_t* block, /*!< in/out: block which is modified */ - ib_uint64_t lsn); /*!< in: oldest modification */ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_block_t* block, /*!< in/out: block which is modified */ + ib_uint64_t lsn); /*!< in: oldest modification */ /********************************************************************//** This function should be called at a mini-transaction commit, if a page was @@ -57,6 +59,8 @@ buf_flush_note_modification( buf_block_t* block, /*!< in: block which is modified */ mtr_t* mtr) /*!< in: mtr */ { + buf_pool_t* buf_pool = buf_pool_from_block(block); + ut_ad(block); ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(block->page.buf_fix_count > 0); @@ -64,9 +68,9 @@ buf_flush_note_modification( ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - ut_ad(!buf_pool_mutex_own()); - ut_ad(!buf_flush_list_mutex_own()); - ut_ad(buf_flush_order_mutex_own()); + ut_ad(!buf_pool_mutex_own(buf_pool)); + ut_ad(!buf_flush_list_mutex_own(buf_pool)); + ut_ad(log_flush_order_mutex_own()); ut_ad(mtr->start_lsn != 0); ut_ad(mtr->modifications); @@ -77,7 +81,8 @@ buf_flush_note_modification( block->page.newest_modification = mtr->end_lsn; if (!block->page.oldest_modification) { - buf_flush_insert_into_flush_list(block, mtr->start_lsn); + buf_flush_insert_into_flush_list( + buf_pool, block, mtr->start_lsn); } else { ut_ad(block->page.oldest_modification <= mtr->start_lsn); } @@ -99,6 +104,8 @@ buf_flush_recv_note_modification( ib_uint64_t end_lsn) /*!< in: end lsn of the last mtr in the set of mtr's */ { + buf_pool_t* buf_pool = buf_pool_from_block(block); + ut_ad(block); ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(block->page.buf_fix_count > 0); @@ -106,9 +113,9 @@ buf_flush_recv_note_modification( ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - ut_ad(!buf_pool_mutex_own()); - ut_ad(!buf_flush_list_mutex_own()); - ut_ad(buf_flush_order_mutex_own()); + ut_ad(!buf_pool_mutex_own(buf_pool)); + ut_ad(!buf_flush_list_mutex_own(buf_pool)); + ut_ad(log_flush_order_mutex_own()); ut_ad(start_lsn != 0); ut_ad(block->page.newest_modification <= end_lsn); @@ -117,7 +124,8 @@ buf_flush_recv_note_modification( block->page.newest_modification = end_lsn; if (!block->page.oldest_modification) { - buf_flush_insert_sorted_into_flush_list(block, start_lsn); + buf_flush_insert_sorted_into_flush_list( + buf_pool, block, start_lsn); } else { ut_ad(block->page.oldest_modification <= start_lsn); } diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h index 009430af35b..4fda88ef90c 100644 --- a/storage/innobase/include/buf0lru.h +++ b/storage/innobase/include/buf0lru.h @@ -52,8 +52,9 @@ operations need new buffer blocks, and the i/o work done in flushing would be wasted. */ UNIV_INTERN void -buf_LRU_try_free_flushed_blocks(void); -/*==================================*/ +buf_LRU_try_free_flushed_blocks( +/*============================*/ + buf_pool_t* buf_pool); /*!< in: buffer pool instance */ /******************************************************************//** Returns TRUE if less than 25 % of the buffer pool is available. 
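buf_flush_note_modification() above now asserts log_flush_order_mutex_own() instead of a buffer-pool-level flush order mutex: insertions are serialized under one log-level mutex, so each flush list stays ordered by oldest_modification even with many pool instances. An illustrative single-list model of that invariant (simplified: the real code keeps one flush list per instance):

#include <assert.h>
#include <pthread.h>

typedef unsigned long long	lsn_t;

typedef struct flush_node_struct {
	lsn_t				oldest_lsn;
	struct flush_node_struct*	next;
} flush_node_t;

static pthread_mutex_t	flush_order_mutex = PTHREAD_MUTEX_INITIALIZER;
static flush_node_t*	flush_list;		/* head has the highest LSN */
static lsn_t		last_inserted_lsn;

static void
note_modification(flush_node_t* node, lsn_t start_lsn)
{
	pthread_mutex_lock(&flush_order_mutex);

	/* Because insertions are serialized here, they arrive in
	non-decreasing LSN order, and a plain prepend keeps the list
	sorted by oldest modification. */
	assert(start_lsn >= last_inserted_lsn);
	last_inserted_lsn = start_lsn;

	node->oldest_lsn = start_lsn;
	node->next = flush_list;
	flush_list = node;

	pthread_mutex_unlock(&flush_order_mutex);
}

int
main(void)
{
	flush_node_t	a;
	flush_node_t	b;

	note_modification(&a, 100);
	note_modification(&b, 120);

	assert(flush_list == &b && b.next == &a);
	return(0);
}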
This can be used in heuristics to prevent huge transactions eating up the whole buffer @@ -72,7 +73,7 @@ These are low-level functions #define BUF_LRU_OLD_MIN_LEN 512 /* 8 megabytes of 16k pages */ /** Maximum LRU list search length in buf_flush_LRU_recommendation() */ -#define BUF_LRU_FREE_SEARCH_LEN (5 + 2 * BUF_READ_AHEAD_AREA) +#define BUF_LRU_FREE_SEARCH_LEN(b) (5 + 2 * BUF_READ_AHEAD_AREA(b)) /******************************************************************//** Invalidates all pages belonging to a given tablespace when we are deleting @@ -97,10 +98,10 @@ Try to free a block. If bpage is a descriptor of a compressed-only page, the descriptor object will be freed as well. NOTE: If this function returns BUF_LRU_FREED, it will not temporarily -release buf_pool_mutex. Furthermore, the page frame will no longer be +release buf_pool->mutex. Furthermore, the page frame will no longer be accessible via bpage. -The caller must hold buf_pool_mutex and buf_page_get_mutex(bpage) and +The caller must hold buf_pool->mutex and buf_page_get_mutex(bpage) and release these two mutexes after the call. No other buf_page_get_mutex() may be held when calling this function. @return BUF_LRU_FREED if freed, BUF_LRU_CANNOT_RELOCATE or @@ -114,7 +115,7 @@ buf_LRU_free_block( compressed page of an uncompressed page */ ibool* buf_pool_mutex_released); /*!< in: pointer to a variable that will - be assigned TRUE if buf_pool_mutex + be assigned TRUE if buf_pool->mutex was temporarily released, or NULL */ /******************************************************************//** Try to free a replaceable block. @@ -123,22 +124,26 @@ UNIV_INTERN ibool buf_LRU_search_and_free_block( /*==========================*/ - ulint n_iterations); /*!< in: how many times this has been called - repeatedly without result: a high value means - that we should search farther; if - n_iterations < 10, then we search - n_iterations / 10 * buf_pool->curr_size - pages from the end of the LRU list; if - n_iterations < 5, then we will also search - n_iterations / 5 of the unzip_LRU list. */ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint n_iterations); /*!< in: how many times this has + been called repeatedly without + result: a high value means that + we should search farther; if + n_iterations < 10, then we search + n_iterations / 10 * buf_pool->curr_size + pages from the end of the LRU list; if + n_iterations < 5, then we will + also search n_iterations / 5 + of the unzip_LRU list. */ /******************************************************************//** Returns a free block from the buf_pool. The block is taken off the free list. If it is empty, returns NULL. @return a free control block, or NULL if the buf_block->free list is empty */ UNIV_INTERN buf_block_t* -buf_LRU_get_free_only(void); -/*=======================*/ +buf_LRU_get_free_only( +/*==================*/ + buf_pool_t* buf_pool); /*!< buffer pool instance */ /******************************************************************//** Returns a free block from the buf_pool. The block is taken off the free list. 
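The reworded buf_LRU_search_and_free_block() comment above encodes a retry-scaled search depth. Reading just that comment, the depth might be computed as below; the whole-list fallback for n_iterations >= 10 is an assumption, not quoted from the function body:

#include <stdio.h>

static unsigned long
lru_search_depth(unsigned long n_iterations, unsigned long curr_size)
{
	if (n_iterations < 10) {
		/* "n_iterations / 10 * buf_pool->curr_size pages
		from the end of the LRU list" */
		return(n_iterations * curr_size / 10);
	}

	return(curr_size);	/* assumed: scan everything on late retries */
}

int
main(void)
{
	printf("%lu %lu\n",
	       lru_search_depth(1, 8192),	/* shallow first passes */
	       lru_search_depth(9, 8192));	/* nearly the whole list */
	return(0);
}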
If it is empty, blocks are moved from the end of the @@ -148,8 +153,9 @@ UNIV_INTERN buf_block_t* buf_LRU_get_free_block( /*===================*/ - ulint zip_size); /*!< in: compressed page size in bytes, - or 0 if uncompressed tablespace */ + buf_pool_t* buf_pool, /*!< in: preferred buffer pool */ + ulint zip_size); /*!< in: compressed page size in bytes, + or 0 if uncompressed tablespace */ /******************************************************************//** Puts a block back to the free list. */ @@ -196,7 +202,7 @@ buf_LRU_make_block_old( Updates buf_LRU_old_ratio. @return updated old_pct */ UNIV_INTERN -uint +ulint buf_LRU_old_ratio_update( /*=====================*/ uint old_pct,/*!< in: Reserve this percentage of @@ -232,7 +238,7 @@ buf_LRU_print(void); /** @name Heuristics for detecting index scan @{ */ /** Reserve this much/BUF_LRU_OLD_RATIO_DIV of the buffer pool for -"old" blocks. Protected by buf_pool_mutex. */ +"old" blocks. Protected by buf_pool->mutex. */ extern uint buf_LRU_old_ratio; /** The denominator of buf_LRU_old_ratio. */ #define BUF_LRU_OLD_RATIO_DIV 1024 @@ -278,7 +284,7 @@ Cleared by buf_LRU_stat_update(). */ extern buf_LRU_stat_t buf_LRU_stat_cur; /** Running sum of past values of buf_LRU_stat_cur. -Updated by buf_LRU_stat_update(). Protected by buf_pool_mutex. */ +Updated by buf_LRU_stat_update(). Protected by buf_pool->mutex. */ extern buf_LRU_stat_t buf_LRU_stat_sum; /********************************************************************//** diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h index 093750623d6..4a52f9dcd8d 100644 --- a/storage/innobase/include/buf0rea.h +++ b/storage/innobase/include/buf0rea.h @@ -124,8 +124,8 @@ buf_read_recv_pages( /** The size in pages of the area which the read-ahead algorithms read if invoked */ -#define BUF_READ_AHEAD_AREA \ - ut_min(64, ut_2_power_up(buf_pool->curr_size / 32)) +#define BUF_READ_AHEAD_AREA(b) \ + ut_min(64, ut_2_power_up((b)->curr_size / 32)) /** @name Modes used in read-ahead @{ */ /** read only pages belonging to the insert buffer tree */ diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h index bfae6477135..a2175098704 100644 --- a/storage/innobase/include/buf0types.h +++ b/storage/innobase/include/buf0types.h @@ -36,6 +36,8 @@ typedef struct buf_chunk_struct buf_chunk_t; typedef struct buf_pool_struct buf_pool_t; /** Buffer pool statistics struct */ typedef struct buf_pool_stat_struct buf_pool_stat_t; +/** Buffer pool buddy statistics struct */ +typedef struct buf_buddy_stat_struct buf_buddy_stat_t; /** A buffer frame. @see page_t */ typedef byte buf_frame_t; diff --git a/storage/innobase/include/ibuf0ibuf.ic b/storage/innobase/include/ibuf0ibuf.ic index 84c7a004be2..aee27cf9739 100644 --- a/storage/innobase/include/ibuf0ibuf.ic +++ b/storage/innobase/include/ibuf0ibuf.ic @@ -110,7 +110,7 @@ ibuf_should_try( if (ibuf_flush_count % 4 == 0) { - buf_LRU_try_free_flushed_blocks(); + buf_LRU_try_free_flushed_blocks(NULL); } return(TRUE); diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 8fce4ef96bc..1ae94a332e5 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -763,6 +763,15 @@ struct log_struct{ #ifndef UNIV_HOTBACKUP mutex_t mutex; /*!< mutex protecting the log */ #endif /* !UNIV_HOTBACKUP */ + + mutex_t log_flush_order_mutex;/*!< mutex to serialize access to + the flush list when we are putting + dirty blocks in the list. 
The idea + behind this mutex is to be able + to release log_sys->mutex during + mtr_commit and still ensure that + insertions in the flush_list happen + in the LSN order. */ byte* buf_ptr; /* unaligned log buffer */ byte* buf; /*!< log buffer */ ulint buf_size; /*!< log buffer size in bytes */ @@ -952,6 +961,19 @@ struct log_struct{ #endif /* UNIV_LOG_ARCHIVE */ }; +/** Test if flush order mutex is owned. */ +#define log_flush_order_mutex_own() \ + mutex_own(&log_sys->log_flush_order_mutex) + +/** Acquire the flush order mutex. */ +#define log_flush_order_mutex_enter() do { \ + mutex_enter(&log_sys->log_flush_order_mutex); \ +} while (0) +/** Release the flush order mutex. */ +# define log_flush_order_mutex_exit() do { \ + mutex_exit(&log_sys->log_flush_order_mutex); \ +} while (0) + #ifdef UNIV_LOG_ARCHIVE /** Archiving state @{ */ #define LOG_ARCH_ON 71 diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 74c604124f5..2cec4b919fb 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -142,6 +142,7 @@ extern my_bool srv_use_sys_malloc; extern ibool srv_use_sys_malloc; #endif /* UNIV_HOTBACKUP */ extern ulint srv_buf_pool_size; /*!< requested size in bytes */ +extern ulint srv_buf_pool_instances; /*!< requested number of buffer pool instances */ extern ulint srv_buf_pool_old_size; /*!< previously requested size */ extern ulint srv_buf_pool_curr_size; /*!< current size in bytes */ extern ulint srv_mem_pool_size; diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h index 69c0382d5b9..4e73bee9108 100644 --- a/storage/innobase/include/sync0sync.h +++ b/storage/innobase/include/sync0sync.h @@ -81,13 +81,13 @@ extern mysql_pfs_key_t dict_sys_mutex_key; extern mysql_pfs_key_t file_format_max_mutex_key; extern mysql_pfs_key_t fil_system_mutex_key; extern mysql_pfs_key_t flush_list_mutex_key; -extern mysql_pfs_key_t flush_order_mutex_key; extern mysql_pfs_key_t hash_table_mutex_key; extern mysql_pfs_key_t ibuf_bitmap_mutex_key; extern mysql_pfs_key_t ibuf_mutex_key; extern mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key; extern mysql_pfs_key_t ios_mutex_key; extern mysql_pfs_key_t log_sys_mutex_key; +extern mysql_pfs_key_t log_flush_order_mutex_key; extern mysql_pfs_key_t kernel_mutex_key; # ifdef UNIV_MEM_DEBUG extern mysql_pfs_key_t mem_hash_mutex_key; @@ -661,6 +661,7 @@ or row lock! */ #define SYNC_TRX_LOCK_HEAP 298 #define SYNC_TRX_SYS_HEADER 290 #define SYNC_LOG 170 +#define SYNC_LOG_FLUSH_ORDER 147 #define SYNC_RECV 168 #define SYNC_WORK_QUEUE 162 #define SYNC_SEARCH_SYS_CONF 161 /* for assigning btr_search_enabled */ @@ -671,7 +672,6 @@ or row lock! */ can call routines there! Otherwise the level is SYNC_MEM_HASH. */ #define SYNC_BUF_POOL 150 /* Buffer pool mutex */ -#define SYNC_BUF_FLUSH_ORDER 147 #define SYNC_BUF_BLOCK 146 /* Block mutex */ #define SYNC_BUF_FLUSH_LIST 145 /* Buffer flush list mutex */ #define SYNC_DOUBLEWRITE 140 diff --git a/storage/innobase/include/ut0mem.h b/storage/innobase/include/ut0mem.h index cf41cba4643..57dfb08f41c 100644 --- a/storage/innobase/include/ut0mem.h +++ b/storage/innobase/include/ut0mem.h @@ -113,12 +113,13 @@ ut_test_malloc( ulint n); /*!< in: try to allocate this many bytes */ #endif /* !UNIV_HOTBACKUP */ /**********************************************************************//** -Frees a memory block allocated with ut_malloc. */ +Frees a memory block allocated with ut_malloc. Freeing a NULL pointer is +a nop. 
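The clarified ut_free() contract, freeing NULL is a no-op, lets call sites drop their NULL checks. A trivial illustration with a stand-in wrapper over free(), which in standard C already accepts NULL:

#include <stdlib.h>

/* Stand-in for ut_free(): freeing a NULL pointer is explicitly a nop. */
static void
xfree(void* ptr)
{
	if (ptr == NULL) {

		return;
	}

	free(ptr);
}

int
main(void)
{
	char*	p = malloc(16);

	xfree(p);
	xfree(NULL);	/* safe: no NULL check needed at the call site */
	return(0);
}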
*/ UNIV_INTERN void ut_free( /*====*/ - void* ptr); /*!< in, own: memory block */ + void* ptr); /*!< in, own: memory block, can be NULL */ #ifndef UNIV_HOTBACKUP /**********************************************************************//** Implements realloc. This is needed by /pars/lexyy.c. Otherwise, you should not diff --git a/storage/innobase/include/ut0rbt.h b/storage/innobase/include/ut0rbt.h index a35807be442..7902dc91f09 100644 --- a/storage/innobase/include/ut0rbt.h +++ b/storage/innobase/include/ut0rbt.h @@ -1,6 +1,29 @@ -/****************************************************** -Red-Black tree implementation. -(c) 2007 Oracle/Innobase Oy +/***************************************************************************//** + +Copyright (c) 2007, 2010, Innobase Oy. All Rights Reserved. + +Portions of this file contain modifications contributed and copyrighted by +Sun Microsystems, Inc. Those modifications are gratefully acknowledged and +are described briefly in the InnoDB documentation. The contributions by +Sun Microsystems are incorporated with their permission, and subject to the +conditions contained in the file COPYING.Sun_Microsystems. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ +/******************************************************************//** +@file include/ut0rbt.h +Various utilities Created 2007-03-20 Sunny Bains *******************************************************/ @@ -35,7 +58,7 @@ typedef struct ib_rbt_bound_struct ib_rbt_bound_t; typedef void (*ib_rbt_print_node)(const ib_rbt_node_t* node); typedef int (*ib_rbt_compare)(const void* p1, const void* p2); -/* Red black tree color types */ +/** Red black tree color types */ enum ib_rbt_color_enum { IB_RBT_RED, IB_RBT_BLACK @@ -43,7 +66,7 @@ enum ib_rbt_color_enum { typedef enum ib_rbt_color_enum ib_rbt_color_t; -/* Red black tree node */ +/** Red black tree node */ struct ib_rbt_node_struct { ib_rbt_color_t color; /* color of this node */ @@ -54,7 +77,7 @@ struct ib_rbt_node_struct { char value[1]; /* Data value */ }; -/* Red black tree instance.*/ +/** Red black tree instance.*/ struct ib_rbt_struct { ib_rbt_node_t* nil; /* Black colored node that is used as a sentinel. 
This is @@ -70,7 +93,7 @@ struct ib_rbt_struct { ulint sizeof_value; /* Sizeof the item in bytes */ }; -/* The result of searching for a key in the tree, this is useful for +/** The result of searching for a key in the tree, this is useful for a speedy lookup and insert if key doesn't exist.*/ struct ib_rbt_bound_struct { const ib_rbt_node_t* @@ -93,14 +116,14 @@ struct ib_rbt_bound_struct { /* Compare a key with the node value (t is tree, k is key, n is node)*/ #define rbt_compare(t, k, n) (t->compare(k, n->value)) -/************************************************************************ +/**********************************************************************//** Free an instance of a red black tree */ UNIV_INTERN void rbt_free( /*=====*/ ib_rbt_t* tree); /*!< in: rb tree to free */ -/************************************************************************ +/**********************************************************************//** Create an instance of a red black tree @return rb tree instance */ UNIV_INTERN @@ -109,7 +132,7 @@ rbt_create( /*=======*/ size_t sizeof_value, /*!< in: size in bytes */ ib_rbt_compare compare); /*!< in: comparator */ -/************************************************************************ +/**********************************************************************//** Delete a node from the red black tree, identified by key */ UNIV_INTERN ibool @@ -118,7 +141,7 @@ rbt_delete( /* in: TRUE on success */ ib_rbt_t* tree, /* in: rb tree */ const void* key); /* in: key to delete */ -/************************************************************************ +/**********************************************************************//** Remove a node from the red black tree, NOTE: This function will not delete the node instance, THAT IS THE CALLERS RESPONSIBILITY. @return the deleted node with the const. */ @@ -132,7 +155,7 @@ rbt_remove_node( is a fudge and declared const because the caller has access only to const nodes.*/ -/************************************************************************ +/**********************************************************************//** Return a node from the red black tree, identified by key, NULL if not found @return node if found else return NULL */ @@ -142,7 +165,7 @@ rbt_lookup( /*=======*/ const ib_rbt_t* tree, /*!< in: rb tree to search */ const void* key); /*!< in: key to lookup */ -/************************************************************************ +/**********************************************************************//** Add data to the red black tree, identified by key (no dups yet!) @return inserted node */ UNIV_INTERN @@ -153,7 +176,7 @@ rbt_insert( const void* key, /*!< in: key for ordering */ const void* value); /*!< in: data that will be copied to the node.*/ -/************************************************************************ +/**********************************************************************//** Add a new node to the tree, useful for data that is pre-sorted. 
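rbt_add_node(), declared just below, pairs with rbt_search(): the bound filled in by the search tells the tree where to attach the new node without a second descent. A sketch of the load pattern for pre-sorted input (assuming the InnoDB build environment and an int-keyed tree):

	/* Sketch: insert values that are already sorted ascending. */
	static void
	load_sorted(ib_rbt_t* tree, const int* vals, ulint n)
	{
		ulint	i;

		for (i = 0; i < n; i++) {
			ib_rbt_bound_t	parent;

			/* Locate the attachment point; a non-zero result
			means the key is not in the tree yet. */
			if (rbt_search(tree, &parent, &vals[i]) != 0) {
				rbt_add_node(tree, &parent, &vals[i]);
			}
		}
	}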
@return appended node */ UNIV_INTERN const ib_rbt_node_t* rbt_add_node( /*============*/ ib_rbt_t* tree, /*!< in: rb tree */ ib_rbt_bound_t* parent, /*!< in: parent */ const void* value); /*!< in: this value is copied to the node */ -/************************************************************************ +/**********************************************************************//** Return the left most data node in the tree @return left most node */ UNIV_INTERN const ib_rbt_node_t* rbt_first( /*======*/ const ib_rbt_t* tree); /*!< in: rb tree */ -/************************************************************************ +/**********************************************************************//** Return the right most data node in the tree @return right most node */ UNIV_INTERN const ib_rbt_node_t* rbt_last( /*=====*/ const ib_rbt_t* tree); /*!< in: rb tree */ -/************************************************************************ +/**********************************************************************//** Return the next node from current. @return successor node to current that is passed in. */ UNIV_INTERN const ib_rbt_node_t* rbt_next( /*=====*/ const ib_rbt_t* tree, /*!< in: rb tree */ const ib_rbt_node_t* /* in: current node */ current); -/************************************************************************ +/**********************************************************************//** Return the prev node from current. @return predecessor node to current that is passed in */ UNIV_INTERN const ib_rbt_node_t* rbt_prev( /*=====*/ const ib_rbt_t* tree, /*!< in: rb tree */ const ib_rbt_node_t* /* in: current node */ current); -/************************************************************************ +/**********************************************************************//** Find the node that has the lowest key that is >= key. @return node that satisfies the lower bound constraint or NULL */ UNIV_INTERN const ib_rbt_node_t* rbt_lower_bound( /*============*/ const ib_rbt_t* tree, /*!< in: rb tree */ const void* key); /*!< in: key to search */ -/************************************************************************ +/**********************************************************************//** Find the node that has the greatest key that is <= key. @return node that satisfies the upper bound constraint or NULL */ UNIV_INTERN const ib_rbt_node_t* rbt_upper_bound( /*============*/ const ib_rbt_t* tree, /*!< in: rb tree */ const void* key); /*!< in: key to search */ -/************************************************************************ +/**********************************************************************//** Search for the key, a node will be returned in parent.last, whether it was found or not. If not found then parent.last will contain the parent node for the possibly new key otherwise the matching node. @return result of last comparison */ UNIV_INTERN int rbt_search( /*=======*/ const ib_rbt_t* tree, /*!< in: rb tree */ ib_rbt_bound_t* parent, /*!< in: search bounds */ const void* key); /*!< in: key to search */ -/************************************************************************ +/**********************************************************************//** Search for the key, a node will be returned in parent.last, whether it was found or not. If not found then parent.last will contain the parent node for the possibly new key otherwise the matching node.
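rbt_search_cmp(), whose declaration continues below, takes a caller-supplied comparator, which allows probing with a partial key; a sketch (pair_t, both helper names and the comparator are hypothetical):

	typedef struct { int a; int b; } pair_t;

	/* Partial comparator: order by the leading field only, so any
	node whose value has a matching .a counts as "found". */
	static int
	pair_cmp_by_a(const void* p1, const void* p2)
	{
		const pair_t*	k = (const pair_t*) p1;
		const pair_t*	n = (const pair_t*) p2;

		return(k->a < n->a ? -1 : k->a > n->a ? 1 : 0);
	}

	/* Find any node whose value has .a == a, ignoring .b. */
	static const ib_rbt_node_t*
	find_by_a(const ib_rbt_t* tree, int a)
	{
		ib_rbt_bound_t	parent;
		pair_t		probe = { a, 0 };

		return(rbt_search_cmp(tree, &parent, &probe, pair_cmp_by_a) == 0
		       ? parent.last : NULL);
	}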
@@ -243,14 +266,14 @@ rbt_search_cmp( ib_rbt_bound_t* parent, /*!< in: search bounds */ const void* key, /*!< in: key to search */ ib_rbt_compare compare); /*!< in: comparator */ -/************************************************************************ +/**********************************************************************//** Clear the tree, deletes (and frees) all the nodes. */ UNIV_INTERN void rbt_clear( /*======*/ ib_rbt_t* tree); /*!< in: rb tree */ -/************************************************************************ +/**********************************************************************//** Merge the nodes from src into dst. Return the number of nodes merged. @return no. of recs merged */ UNIV_INTERN ulint rbt_merge_uniq( /*===========*/ ib_rbt_t* dst, /*!< in: dst rb tree */ const ib_rbt_t* src); /*!< in: src rb tree */ -/************************************************************************ +/**********************************************************************//** Merge the nodes from src into dst. Return the number of nodes merged. Delete the nodes from src after copying node to dst. As a side effect the duplicates will be left untouched in the src, since we don't support @@ -272,7 +295,7 @@ rbt_merge_uniq_destructive( /*=======================*/ ib_rbt_t* dst, /*!< in: dst rb tree */ ib_rbt_t* src); /*!< in: src rb tree */ -/************************************************************************ +/**********************************************************************//** Verify the integrity of the RB tree. For debugging. 0 on failure, else the height of the tree (in count of black nodes). @return TRUE if OK, FALSE if tree invalid. */ UNIV_INTERN ibool rbt_validate( /*=========*/ const ib_rbt_t* tree); /*!< in: tree to validate */ -/************************************************************************ +/**********************************************************************//** Iterate over the tree in depth first order.
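Taken together, the public API above supports this minimal end-to-end pattern (a sketch, assuming the InnoDB build environment; the demo function name is invented):

	#include <stdio.h>
	#include "ut0rbt.h"

	static int
	int_cmp(const void* p1, const void* p2)
	{
		int	k1 = *(const int*) p1;
		int	k2 = *(const int*) p2;

		return(k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
	}

	static void
	rbt_demo(void)
	{
		static const int	vals[] = { 3, 1, 2 };
		ib_rbt_t*		tree;
		const ib_rbt_node_t*	node;
		ulint			i;

		tree = rbt_create(sizeof(int), int_cmp);

		for (i = 0; i < 3; i++) {
			/* The key lives inside the value: rbt_compare()
			hands node->value to the comparator. */
			rbt_insert(tree, &vals[i], &vals[i]);
		}

		/* In-order traversal prints 1 2 3. */
		for (node = rbt_first(tree); node != NULL;
		     node = rbt_next(tree, node)) {
			printf("%d\n", *(const int*) node->value);
		}

		rbt_free(tree);
	}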
*/ UNIV_INTERN void diff --git a/storage/innobase/log/log0log.c b/storage/innobase/log/log0log.c index 04ced18bc69..e450307d773 100644 --- a/storage/innobase/log/log0log.c +++ b/storage/innobase/log/log0log.c @@ -91,6 +91,7 @@ UNIV_INTERN mysql_pfs_key_t archive_lock_key; #ifdef UNIV_PFS_MUTEX UNIV_INTERN mysql_pfs_key_t log_sys_mutex_key; +UNIV_INTERN mysql_pfs_key_t log_flush_order_mutex_key; #endif /* UNIV_PFS_MUTEX */ #ifdef UNIV_DEBUG @@ -769,6 +770,10 @@ log_init(void) mutex_create(log_sys_mutex_key, &log_sys->mutex, SYNC_LOG); + mutex_create(log_flush_order_mutex_key, + &log_sys->log_flush_order_mutex, + SYNC_LOG_FLUSH_ORDER); + mutex_enter(&(log_sys->mutex)); /* Start the lsn from one log block from zero: this way every @@ -1650,10 +1655,10 @@ log_preflush_pool_modified_pages( recv_apply_hashed_log_recs(TRUE); } - n_pages = buf_flush_batch(BUF_FLUSH_LIST, ULINT_MAX, new_oldest); + n_pages = buf_flush_list(ULINT_MAX, new_oldest); if (sync) { - buf_flush_wait_batch_end(BUF_FLUSH_LIST); + buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); } if (n_pages == ULINT_UNDEFINED) { @@ -3285,9 +3290,9 @@ log_check_log_recs( ut_memcpy(scan_buf, start, end - start); - recv_scan_log_recs((buf_pool->curr_size - - recv_n_pool_free_frames) * UNIV_PAGE_SIZE, - FALSE, scan_buf, end - start, + recv_scan_log_recs((buf_pool_get_n_pages() + - (recv_n_pool_free_frames * srv_buf_pool_instances)) + * UNIV_PAGE_SIZE, FALSE, scan_buf, end - start, ut_uint64_align_down(buf_start_lsn, OS_FILE_LOG_BLOCK_SIZE), &contiguous_lsn, &scanned_lsn); diff --git a/storage/innobase/log/log0recv.c b/storage/innobase/log/log0recv.c index 0e96dbbb960..04c06f62df5 100644 --- a/storage/innobase/log/log0recv.c +++ b/storage/innobase/log/log0recv.c @@ -1659,11 +1659,15 @@ recv_recover_page_func( #ifndef UNIV_HOTBACKUP if (modification_to_page) { + buf_pool_t* buf_pool; + ut_a(block); - buf_flush_order_mutex_enter(); + buf_pool = buf_pool_from_block(block); + + log_flush_order_mutex_enter(); buf_flush_recv_note_modification(block, start_lsn, end_lsn); - buf_flush_order_mutex_exit(); + log_flush_order_mutex_exit(); } #endif /* !UNIV_HOTBACKUP */ @@ -1848,11 +1852,10 @@ loop: mutex_exit(&(recv_sys->mutex)); mutex_exit(&(log_sys->mutex)); - n_pages = buf_flush_batch(BUF_FLUSH_LIST, ULINT_MAX, - IB_ULONGLONG_MAX); - ut_a(n_pages != ULINT_UNDEFINED); - - buf_flush_wait_batch_end(BUF_FLUSH_LIST); + n_pages = buf_flush_list(ULINT_MAX, IB_ULONGLONG_MAX); + ut_a(n_pages != ULINT_UNDEFINED); + + buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); buf_pool_invalidate(); @@ -2762,8 +2765,8 @@ recv_scan_log_recs( recv_parse_log_recs(store_to_hash); #ifndef UNIV_HOTBACKUP - if (store_to_hash && mem_heap_get_size(recv_sys->heap) - > available_memory) { + if (store_to_hash + && mem_heap_get_size(recv_sys->heap) > available_memory) { /* Hash table of log records has grown too big: empty it; FALSE means no ibuf operations @@ -2815,8 +2818,10 @@ recv_group_scan_log_recs( group, start_lsn, end_lsn); finished = recv_scan_log_recs( - (buf_pool->curr_size - recv_n_pool_free_frames) - * UNIV_PAGE_SIZE, TRUE, log_sys->buf, RECV_SCAN_SIZE, + (buf_pool_get_n_pages() + - (recv_n_pool_free_frames * srv_buf_pool_instances)) + * UNIV_PAGE_SIZE, + TRUE, log_sys->buf, RECV_SCAN_SIZE, start_lsn, contiguous_lsn, group_scanned_lsn); start_lsn = end_lsn; } @@ -3497,6 +3502,7 @@ recv_reset_log_files_for_backup( #endif /* UNIV_HOTBACKUP */ #ifdef UNIV_LOG_ARCHIVE +/* Dead code */ /******************************************************//** Reads from the archive of a log group 
and performs recovery. @return TRUE if no more complete consistent archive files */ @@ -3662,7 +3668,8 @@ ask_again: read_offset % UNIV_PAGE_SIZE, len, buf, NULL); ret = recv_scan_log_recs( - (buf_pool->n_frames - recv_n_pool_free_frames) + (buf_pool_get_n_pages() + - (recv_n_pool_free_frames * srv_buf_pool_instances)) * UNIV_PAGE_SIZE, TRUE, buf, len, start_lsn, &dummy_lsn, &scanned_lsn); diff --git a/storage/innobase/mem/mem0mem.c b/storage/innobase/mem/mem0mem.c index c0ce8a3e1ac..3e0e31c0891 100644 --- a/storage/innobase/mem/mem0mem.c +++ b/storage/innobase/mem/mem0mem.c @@ -347,7 +347,7 @@ mem_heap_create_block( return(NULL); } } else { - buf_block = buf_block_alloc(0); + buf_block = buf_block_alloc(NULL, 0); } block = (mem_block_t*) buf_block->frame; diff --git a/storage/innobase/mtr/mtr0mtr.c b/storage/innobase/mtr/mtr0mtr.c index 78618564ef1..b01462f6b9b 100644 --- a/storage/innobase/mtr/mtr0mtr.c +++ b/storage/innobase/mtr/mtr0mtr.c @@ -120,10 +120,14 @@ mtr_memo_slot_note_modification( ut_ad(mtr); ut_ad(mtr->magic_n == MTR_MAGIC_N); ut_ad(mtr->modifications); - ut_ad(buf_flush_order_mutex_own()); if (slot->object != NULL && slot->type == MTR_MEMO_PAGE_X_FIX) { - buf_flush_note_modification((buf_block_t*) slot->object, mtr); + buf_block_t* block = (buf_block_t*) slot->object; + +#ifdef UNIV_DEBUG + ut_ad(log_flush_order_mutex_own()); +#endif /* UNIV_DEBUG */ + buf_flush_note_modification(block, mtr); } } @@ -221,16 +225,18 @@ mtr_log_reserve_and_write( mtr->end_lsn = log_close(); func_exit: - buf_flush_order_mutex_enter(); + log_flush_order_mutex_enter(); /* It is now safe to release the log mutex because the flush_order mutex will ensure that we are the first one to insert into the flush list. */ log_release(); + if (mtr->modifications) { mtr_memo_note_modifications(mtr); } - buf_flush_order_mutex_exit(); + + log_flush_order_mutex_exit(); } #endif /* !UNIV_HOTBACKUP */ @@ -324,7 +330,7 @@ mtr_memo_release( offset = dyn_array_get_data_size(memo); - buf_flush_order_mutex_enter(); + log_flush_order_mutex_enter(); while (offset > 0) { offset -= sizeof(mtr_memo_slot_t); @@ -340,7 +346,7 @@ mtr_memo_release( break; } } - buf_flush_order_mutex_exit(); + log_flush_order_mutex_exit(); } #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/page/page0zip.c b/storage/innobase/page/page0zip.c index aa5e39ff04a..cc7ec2f404c 100644 --- a/storage/innobase/page/page0zip.c +++ b/storage/innobase/page/page0zip.c @@ -4416,6 +4416,7 @@ page_zip_reorganize( dict_index_t* index, /*!< in: index of the B-tree node */ mtr_t* mtr) /*!< in: mini-transaction */ { + buf_pool_t* buf_pool = buf_pool_from_block(block); page_zip_des_t* page_zip = buf_block_get_page_zip(block); page_t* page = buf_block_get_frame(block); buf_block_t* temp_block; @@ -4433,7 +4434,7 @@ page_zip_reorganize( log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); #ifndef UNIV_HOTBACKUP - temp_block = buf_block_alloc(0); + temp_block = buf_block_alloc(buf_pool, 0); btr_search_drop_page_hash_index(block); block->check_index_page_at_flush = TRUE; #else /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c index 838df292bfc..560dafa6138 100644 --- a/storage/innobase/srv/srv0srv.c +++ b/storage/innobase/srv/srv0srv.c @@ -195,6 +195,8 @@ UNIV_INTERN const byte* srv_latin1_ordering; UNIV_INTERN my_bool srv_use_sys_malloc = TRUE; /* requested size in kilobytes */ UNIV_INTERN ulint srv_buf_pool_size = ULINT_MAX; +/* requested number of buffer pool instances */ +UNIV_INTERN ulint srv_buf_pool_instances 
= 1; /* previously requested size */ UNIV_INTERN ulint srv_buf_pool_old_size; /* current size in kilobytes */ @@ -1700,7 +1702,7 @@ srv_refresh_innodb_monitor_stats(void) log_refresh_stats(); - buf_refresh_io_stats(); + buf_refresh_io_stats_all(); srv_n_rows_inserted_old = srv_n_rows_inserted; srv_n_rows_updated_old = srv_n_rows_updated; @@ -1911,6 +1913,14 @@ void srv_export_innodb_status(void) /*==========================*/ { + buf_pool_stat_t stat; + ulint LRU_len; + ulint free_len; + ulint flush_list_len; + + buf_get_total_stat(&stat); + buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len); + mutex_enter(&srv_innodb_monitor_mutex); export_vars.innodb_data_pending_reads @@ -1925,31 +1935,26 @@ srv_export_innodb_status(void) export_vars.innodb_data_reads = os_n_file_reads; export_vars.innodb_data_writes = os_n_file_writes; export_vars.innodb_data_written = srv_data_written; - export_vars.innodb_buffer_pool_read_requests = buf_pool->stat.n_page_gets; + export_vars.innodb_buffer_pool_read_requests = stat.n_page_gets; export_vars.innodb_buffer_pool_write_requests = srv_buf_pool_write_requests; export_vars.innodb_buffer_pool_wait_free = srv_buf_pool_wait_free; export_vars.innodb_buffer_pool_pages_flushed = srv_buf_pool_flushed; export_vars.innodb_buffer_pool_reads = srv_buf_pool_reads; export_vars.innodb_buffer_pool_read_ahead - = buf_pool->stat.n_ra_pages_read; + = stat.n_ra_pages_read; export_vars.innodb_buffer_pool_read_ahead_evicted - = buf_pool->stat.n_ra_pages_evicted; - export_vars.innodb_buffer_pool_pages_data - = UT_LIST_GET_LEN(buf_pool->LRU); - export_vars.innodb_buffer_pool_pages_dirty - = UT_LIST_GET_LEN(buf_pool->flush_list); - export_vars.innodb_buffer_pool_pages_free - = UT_LIST_GET_LEN(buf_pool->free); + = stat.n_ra_pages_evicted; + export_vars.innodb_buffer_pool_pages_data = LRU_len; + export_vars.innodb_buffer_pool_pages_dirty = flush_list_len; + export_vars.innodb_buffer_pool_pages_free = free_len; #ifdef UNIV_DEBUG export_vars.innodb_buffer_pool_pages_latched = buf_get_latched_pages_number(); #endif /* UNIV_DEBUG */ - export_vars.innodb_buffer_pool_pages_total = buf_pool->curr_size; + export_vars.innodb_buffer_pool_pages_total = buf_pool_get_curr_size(); - export_vars.innodb_buffer_pool_pages_misc = buf_pool->curr_size - - UT_LIST_GET_LEN(buf_pool->LRU) - - UT_LIST_GET_LEN(buf_pool->free); + export_vars.innodb_buffer_pool_pages_misc = buf_pool_get_curr_size(); #ifdef HAVE_ATOMIC_BUILTINS export_vars.innodb_have_atomic_builtins = 1; #else @@ -1965,9 +1970,9 @@ srv_export_innodb_status(void) export_vars.innodb_log_writes = srv_log_writes; export_vars.innodb_dblwr_pages_written = srv_dblwr_pages_written; export_vars.innodb_dblwr_writes = srv_dblwr_writes; - export_vars.innodb_pages_created = buf_pool->stat.n_pages_created; - export_vars.innodb_pages_read = buf_pool->stat.n_pages_read; - export_vars.innodb_pages_written = buf_pool->stat.n_pages_written; + export_vars.innodb_pages_created = stat.n_pages_created; + export_vars.innodb_pages_read = stat.n_pages_read; + export_vars.innodb_pages_written = stat.n_pages_written; export_vars.innodb_row_lock_waits = srv_n_lock_wait_count; export_vars.innodb_row_lock_current_waits = srv_n_lock_wait_current_count; @@ -2279,7 +2284,7 @@ srv_error_monitor_thread( #endif #ifdef UNIV_PFS_THREAD - pfs_register_thread(srv_error_monitor_thread_key); + pfs_register_thread(srv_error_monitor_thread_key); #endif loop: @@ -2503,6 +2508,7 @@ srv_master_thread( /*!< in: a dummy parameter required by os_thread_create */ { + buf_pool_stat_t 
buf_stat; os_event_t event; ulint old_activity_count; ulint n_pages_purged = 0; @@ -2544,8 +2550,9 @@ loop: srv_main_thread_op_info = "reserving kernel mutex"; - n_ios_very_old = log_sys->n_log_ios + buf_pool->stat.n_pages_read - + buf_pool->stat.n_pages_written; + buf_get_total_stat(&buf_stat); + n_ios_very_old = log_sys->n_log_ios + buf_stat.n_pages_read + + buf_stat.n_pages_written; mutex_enter(&kernel_mutex); /* Store the user activity counter at the start of this loop */ @@ -2566,8 +2573,12 @@ loop: for (i = 0; i < 10; i++) { ulint cur_time = ut_time_ms(); - n_ios_old = log_sys->n_log_ios + buf_pool->stat.n_pages_read - + buf_pool->stat.n_pages_written; + + buf_get_total_stat(&buf_stat); + + n_ios_old = log_sys->n_log_ios + buf_stat.n_pages_read + + buf_stat.n_pages_written; + srv_main_thread_op_info = "sleeping"; srv_main_1_second_loops++; @@ -2607,13 +2618,14 @@ loop: log_free_check(); /* If i/os during one second sleep were less than 5% of - capacity, we assume that there is free disk i/o capacity - available, and it makes sense to do an insert buffer merge. */ + capacity, we assume that there is free disk i/o capacity + available, and it makes sense to do an insert buffer merge. */ + buf_get_total_stat(&buf_stat); n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes; - n_ios = log_sys->n_log_ios + buf_pool->stat.n_pages_read - + buf_pool->stat.n_pages_written; + n_ios = log_sys->n_log_ios + buf_stat.n_pages_read + + buf_stat.n_pages_written; if (n_pend_ios < SRV_PEND_IO_THRESHOLD && (n_ios - n_ios_old < SRV_RECENT_IO_ACTIVITY)) { srv_main_thread_op_info = "doing insert buffer merge"; @@ -2631,9 +2643,8 @@ loop: srv_main_thread_op_info = "flushing buffer pool pages"; - n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, - PCT_IO(100), - IB_ULONGLONG_MAX); + n_pages_flushed = buf_flush_list( + PCT_IO(100), IB_ULONGLONG_MAX); } else if (srv_adaptive_flushing) { @@ -2647,8 +2658,7 @@ loop: "flushing buffer pool pages"; n_flush = ut_min(PCT_IO(100), n_flush); n_pages_flushed = - buf_flush_batch( - BUF_FLUSH_LIST, + buf_flush_list( n_flush, IB_ULONGLONG_MAX); } @@ -2680,17 +2690,17 @@ loop: loop above requests writes for that case. The writes done here are not required, and may be disabled. 
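Each of these heuristics now begins with buf_get_total_stat(), since there is no longer a single global buf_pool to read from. A plausible sketch of the aggregation it performs (buf_pool_from_array() and the exact field list are assumptions based on the calls visible in this patch):

	/* Sketch: sum the per-instance statistics into one snapshot. */
	void
	buf_get_total_stat(
	/*===============*/
		buf_pool_stat_t*	tot_stat)	/*!< out: totals */
	{
		ulint	i;

		memset(tot_stat, 0, sizeof(*tot_stat));

		for (i = 0; i < srv_buf_pool_instances; i++) {
			const buf_pool_stat_t*	stat;

			/* buf_pool_from_array() is assumed: fetch the
			i'th buffer pool instance. */
			stat = &buf_pool_from_array(i)->stat;

			tot_stat->n_page_gets += stat->n_page_gets;
			tot_stat->n_pages_read += stat->n_pages_read;
			tot_stat->n_pages_written += stat->n_pages_written;
			tot_stat->n_pages_created += stat->n_pages_created;
			tot_stat->n_ra_pages_read += stat->n_ra_pages_read;
			tot_stat->n_ra_pages_evicted += stat->n_ra_pages_evicted;
		}
	}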
*/ + buf_get_total_stat(&buf_stat); n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes; - n_ios = log_sys->n_log_ios + buf_pool->stat.n_pages_read - + buf_pool->stat.n_pages_written; + n_ios = log_sys->n_log_ios + buf_stat.n_pages_read + + buf_stat.n_pages_written; srv_main_10_second_loops++; if (n_pend_ios < SRV_PEND_IO_THRESHOLD && (n_ios - n_ios_very_old < SRV_PAST_IO_ACTIVITY)) { srv_main_thread_op_info = "flushing buffer pool pages"; - buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), - IB_ULONGLONG_MAX); + buf_flush_list(PCT_IO(100), IB_ULONGLONG_MAX); /* Flush logs if needed */ srv_sync_log_buffer_in_background(); @@ -2705,8 +2715,6 @@ loop: /* Flush logs if needed */ srv_sync_log_buffer_in_background(); - /* We run a full purge every 10 seconds, even if the server - were active */ if (srv_n_purge_threads == 0) { srv_main_thread_op_info = "master purging"; @@ -2728,17 +2736,15 @@ loop: (> 70 %), we assume we can afford reserving the disk(s) for the time it requires to flush 100 pages */ - n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, - PCT_IO(100), - IB_ULONGLONG_MAX); + n_pages_flushed = buf_flush_list( + PCT_IO(100), IB_ULONGLONG_MAX); } else { /* Otherwise, we only flush a small number of pages so that we do not unnecessarily use much disk i/o capacity from other work */ - n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, - PCT_IO(10), - IB_ULONGLONG_MAX); + n_pages_flushed = buf_flush_list( + PCT_IO(10), IB_ULONGLONG_MAX); } srv_main_thread_op_info = "making checkpoint"; @@ -2807,7 +2813,7 @@ background_loop: } else { /* This should do an amount of IO similar to the number of dirty pages that will be flushed in the call to - buf_flush_batch below. Otherwise, the system favors + buf_flush_list below. Otherwise, the system favors clean pages over cleanup throughput. */ n_bytes_merged = ibuf_contract_for_n_pages(FALSE, PCT_IO(100)); @@ -2826,9 +2832,8 @@ flush_loop: srv_main_thread_op_info = "flushing buffer pool pages"; srv_main_flush_loops++; if (srv_fast_shutdown < 2) { - n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, - PCT_IO(100), - IB_ULONGLONG_MAX); + n_pages_flushed = buf_flush_list( + PCT_IO(100), IB_ULONGLONG_MAX); } else { /* In the fastest shutdown we do not flush the buffer pool to data files: we set n_pages_flushed to 0 artificially. */ @@ -2846,7 +2851,7 @@ flush_loop: mutex_exit(&kernel_mutex); srv_main_thread_op_info = "waiting for buffer pool flush to end"; - buf_flush_wait_batch_end(BUF_FLUSH_LIST); + buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); /* Flush logs if needed */ srv_sync_log_buffer_in_background(); diff --git a/storage/innobase/srv/srv0start.c b/storage/innobase/srv/srv0start.c index a257fd32aab..4c9851e953b 100644 --- a/storage/innobase/srv/srv0start.c +++ b/storage/innobase/srv/srv0start.c @@ -1016,7 +1016,6 @@ int innobase_start_or_create_for_mysql(void) /*====================================*/ { - buf_pool_t* ret; ibool create_new_db; ibool log_file_created; ibool log_created = FALSE; @@ -1241,13 +1240,16 @@ innobase_start_or_create_for_mysql(void) #else if (srv_buf_pool_size >= 1000 * 1024 * 1024) { /* If buffer pool is less than 1000 MB, - assume fewer threads. */ + assume fewer threads. 
Also use only one + buffer pool instance */ srv_max_n_threads = 50000; } else if (srv_buf_pool_size >= 8 * 1024 * 1024) { + srv_buf_pool_instances = 1; srv_max_n_threads = 10000; } else { + srv_buf_pool_instances = 1; srv_max_n_threads = 1000; /* saves several MB of memory, especially in 64-bit computers */ @@ -1331,9 +1333,9 @@ innobase_start_or_create_for_mysql(void) fil_init(srv_file_per_table ? 50000 : 5000, srv_max_n_open_files); - ret = buf_pool_init(); + err = buf_pool_init(srv_buf_pool_size, srv_buf_pool_instances); - if (ret == NULL) { + if (err != DB_SUCCESS) { fprintf(stderr, "InnoDB: Fatal error: cannot allocate the memory" " for the buffer pool\n"); @@ -2089,7 +2091,7 @@ innobase_shutdown_for_mysql(void) pars_lexer_close(); log_mem_free(); - buf_pool_free(); + buf_pool_free(srv_buf_pool_instances); ut_free_all_mem(); mem_close(); diff --git a/storage/innobase/sync/sync0sync.c b/storage/innobase/sync/sync0sync.c index b9b83adba00..235f733382d 100644 --- a/storage/innobase/sync/sync0sync.c +++ b/storage/innobase/sync/sync0sync.c @@ -1157,14 +1157,12 @@ sync_thread_add_level( case SYNC_RECV: case SYNC_WORK_QUEUE: case SYNC_LOG: + case SYNC_LOG_FLUSH_ORDER: case SYNC_THR_LOCAL: case SYNC_ANY_LATCH: case SYNC_TRX_SYS_HEADER: case SYNC_FILE_FORMAT_TAG: case SYNC_DOUBLEWRITE: - case SYNC_BUF_FLUSH_LIST: - case SYNC_BUF_FLUSH_ORDER: - case SYNC_BUF_POOL: case SYNC_SEARCH_SYS: case SYNC_SEARCH_SYS_CONF: case SYNC_TRX_LOCK_HEAP: @@ -1186,6 +1184,18 @@ sync_thread_add_level( ut_error; } break; + case SYNC_BUF_FLUSH_LIST: + case SYNC_BUF_POOL: + /* We can have multiple mutexes of this type therefore we + can only check whether the greater than condition holds. */ + if (!sync_thread_levels_g(array, level-1, TRUE)) { + fprintf(stderr, + "InnoDB: sync_thread_levels_g(array, %lu)" + " does not hold!\n", level-1); + ut_error; + } + break; + case SYNC_BUF_BLOCK: /* Either the thread must own the buffer pool mutex (buf_pool_mutex), or it is allowed to latch only ONE diff --git a/storage/innobase/trx/trx0trx.c b/storage/innobase/trx/trx0trx.c index a47fc28c199..cf7fe85c3d1 100644 --- a/storage/innobase/trx/trx0trx.c +++ b/storage/innobase/trx/trx0trx.c @@ -764,7 +764,6 @@ trx_commit_off_kernel( if (undo) { mutex_enter(&kernel_mutex); trx->no = trx_sys_get_new_trx_no(); - mutex_exit(&kernel_mutex); /* It is not necessary to obtain trx->undo_mutex here diff --git a/storage/innobase/trx/trx0undo.c b/storage/innobase/trx/trx0undo.c index 3bb1b1cdf6c..eb5112c4d31 100644 --- a/storage/innobase/trx/trx0undo.c +++ b/storage/innobase/trx/trx0undo.c @@ -1938,7 +1938,8 @@ trx_undo_update_cleanup( UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_cached, undo); } else { - ut_ad(undo->state == TRX_UNDO_TO_PURGE); + ut_ad(undo->state == TRX_UNDO_TO_PURGE + || undo->state == TRX_UNDO_TO_FREE); trx_undo_mem_free(undo); } diff --git a/storage/innobase/ut/ut0mem.c b/storage/innobase/ut/ut0mem.c index 35a325b9ccd..f2baab67f09 100644 --- a/storage/innobase/ut/ut0mem.c +++ b/storage/innobase/ut/ut0mem.c @@ -290,17 +290,20 @@ ut_test_malloc( #endif /* !UNIV_HOTBACKUP */ /**********************************************************************//** -Frees a memory block allocated with ut_malloc. */ +Frees a memory block allocated with ut_malloc. Freeing a NULL pointer is +a nop. 
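For context on the per-instance mutexes checked above: with multiple pools, a page's instance is derived from its address, so every call site can agree without extra state. A plausible sketch of the mapping (the helper name and the pool array are assumptions based on the buf_pool_from_* and buf_pool_get calls visible in this patch; buf_page_address_fold() is the pre-existing address hash):

	/* Sketch: map a (space, page_no) pair to its buffer pool instance. */
	buf_pool_t*
	buf_pool_get(
	/*=========*/
		ulint	space,	/*!< in: tablespace id */
		ulint	offset)	/*!< in: page number */
	{
		ulint	fold;

		fold = buf_page_address_fold(space, offset);

		return(&buf_pool_ptr[fold % srv_buf_pool_instances]);
	}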
*/ UNIV_INTERN void ut_free( /*====*/ - void* ptr) /*!< in, own: memory block */ + void* ptr) /*!< in, own: memory block, can be NULL */ { #ifndef UNIV_HOTBACKUP ut_mem_block_t* block; - if (UNIV_LIKELY(srv_use_sys_malloc)) { + if (ptr == NULL) { + return; + } else if (UNIV_LIKELY(srv_use_sys_malloc)) { free(ptr); return; } diff --git a/storage/innobase/ut/ut0rbt.c b/storage/innobase/ut/ut0rbt.c index 3279307308f..3d7cfa7636f 100644 --- a/storage/innobase/ut/ut0rbt.c +++ b/storage/innobase/ut/ut0rbt.c @@ -1,4 +1,27 @@ -/********************************************************************** +/***************************************************************************//** + +Copyright (c) 2007, 2010, Innobase Oy. All Rights Reserved. + +Portions of this file contain modifications contributed and copyrighted by +Sun Microsystems, Inc. Those modifications are gratefully acknowledged and +are described briefly in the InnoDB documentation. The contributions by +Sun Microsystems are incorporated with their permission, and subject to the +conditions contained in the file COPYING.Sun_Microsystems. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ +/********************************************************************//** Red-Black tree implementation (c) 2007 Oracle/Innobase Oy @@ -8,7 +31,7 @@ Created 2007-03-20 Sunny Bains #include "ut0rbt.h" -/************************************************************************ +/**********************************************************************//** Definition of a red-black tree ============================== @@ -34,7 +57,7 @@ red-black properties: #define ROOT(t) (t->root->left) #define SIZEOF_NODE(t) ((sizeof(ib_rbt_node_t) + t->sizeof_value) - 1) -/************************************************************************ +/**********************************************************************//** Print out the sub-tree recursively. */ static void rbt_print_subtree( @@ -52,7 +75,7 @@ rbt_print_subtree( } } -/************************************************************************ +/**********************************************************************//** Verify that the keys are in order. @return TRUE if OK, FALSE if not ordered */ static @@ -77,7 +100,7 @@ rbt_check_ordering( return(TRUE); } -/************************************************************************ +/**********************************************************************//** Check that every path from the root to the leaves has the same count. Count is expressed in the number of black nodes. @return 0 on failure else black height of the subtree */ @@ -125,7 +148,7 @@ rbt_count_black_nodes( return(result); } -/************************************************************************ +/**********************************************************************//** Turn the node's right child's left sub-tree into node's right sub-tree.
This will also make node's right child its parent. */ static @@ -161,7 +184,7 @@ rbt_rotate_left( node->parent = right; } -/************************************************************************ +/**********************************************************************//** Turn the node's left child's right sub-tree into node's left sub-tree. This also makes node's left child its parent. */ static @@ -197,7 +220,7 @@ rbt_rotate_right( node->parent = left; } -/************************************************************************ +/**********************************************************************//** Append a node to the tree. */ static ib_rbt_node_t* @@ -224,7 +247,7 @@ rbt_tree_add_child( return(node); } -/************************************************************************ +/**********************************************************************//** Generic binary tree insert */ static ib_rbt_node_t* @@ -260,7 +283,7 @@ rbt_tree_insert( return(node); } -/************************************************************************ +/**********************************************************************//** Balance a tree after inserting a node. */ static void @@ -350,7 +373,7 @@ rbt_balance_tree( ROOT(tree)->color = IB_RBT_BLACK; } -/************************************************************************ +/**********************************************************************//** Find the given node's successor. @return successor node or NULL if no successor */ static @@ -390,7 +413,7 @@ rbt_find_successor( return(next); } -/************************************************************************ +/**********************************************************************//** Find the given node's predecessor. @return predecessor node or NULL if no predecessor */ static @@ -430,7 +453,7 @@ rbt_find_predecessor( return(prev); } -/************************************************************************ +/**********************************************************************//** Replace node with child. After applying transformations eject becomes an orphan. */ static @@ -454,7 +477,7 @@ rbt_eject_node( node->parent = eject->parent; } -/************************************************************************ +/**********************************************************************//** Replace a node with another node. */ static void @@ -481,7 +504,7 @@ rbt_replace_node( replace->color = color; } -/************************************************************************ +/**********************************************************************//** Detach node from the tree replacing it with one of its children. @return the child node that now occupies the position of the detached node */ static @@ -524,7 +547,7 @@ rbt_detach_node( return(child); } -/************************************************************************ +/**********************************************************************//** Rebalance the right sub-tree after deletion. @return node to rebalance if more rebalancing required else NULL */ static @@ -584,7 +607,7 @@ rbt_balance_right( return(node); } -/************************************************************************ +/**********************************************************************//** Rebalance the left sub-tree after deletion.
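Both balance routines lean on the two rotations described above; as a reminder, a toy standalone left rotation (plain C with a root pointer, not the InnoDB code, which routes the root through its sentinel instead):

	typedef struct node_struct node_t;

	struct node_struct {
		node_t*	left;
		node_t*	right;
		node_t*	parent;
	};

	/* Rotate x left: its right child y takes its place. */
	static void
	rotate_left(node_t** root, node_t* x)
	{
		node_t*	y = x->right;

		x->right = y->left;		/* y's left subtree -> x */
		if (y->left != NULL) {
			y->left->parent = x;
		}

		y->parent = x->parent;		/* splice y into x's slot */
		if (x->parent == NULL) {
			*root = y;
		} else if (x == x->parent->left) {
			x->parent->left = y;
		} else {
			x->parent->right = y;
		}

		y->left = x;			/* x becomes y's left child */
		x->parent = y;
	}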
@return node to rebalance if more rebalancing required else NULL */ static @@ -644,7 +667,7 @@ rbt_balance_left( return(node); } -/************************************************************************ +/**********************************************************************//** Delete the node and rebalance the tree if necessary */ static void @@ -696,7 +719,7 @@ rbt_remove_node_and_rebalance( --tree->n_nodes; } -/************************************************************************ +/**********************************************************************//** Recursively free the nodes. */ static void @@ -713,7 +736,7 @@ rbt_free_node( } } -/************************************************************************ +/**********************************************************************//** Free all the nodes and free the tree. */ UNIV_INTERN void @@ -726,7 +749,7 @@ rbt_free( ut_free(tree); } -/************************************************************************ +/**********************************************************************//** Create an instance of a red black tree. @return an empty rb tree */ UNIV_INTERN @@ -764,7 +787,7 @@ rbt_create( return(tree); } -/************************************************************************ +/**********************************************************************//** Generic insert of a value in the rb tree. @return inserted node */ UNIV_INTERN @@ -793,7 +816,7 @@ rbt_insert( return(node); } -/************************************************************************ +/**********************************************************************//** Add a new node to the tree, useful for data that is pre-sorted. @return appended node */ UNIV_INTERN @@ -831,7 +854,7 @@ rbt_add_node( return(node); } -/************************************************************************ +/**********************************************************************//** Find a matching node in the rb tree. @return NULL if not found else the node where key was found */ UNIV_INTERN @@ -859,7 +882,7 @@ rbt_lookup( return(current != tree->nil ? current : NULL); } -/************************************************************************ +/**********************************************************************//** Delete a node identified by key. @return TRUE if success, FALSE if not found */ UNIV_INTERN @@ -882,7 +905,7 @@ rbt_delete( return(deleted); } -/************************************************************************ +/**********************************************************************//** Remove a node from the rb tree, the node is not freed; that is the caller's responsibility. @return deleted node but without the const */ @@ -906,7 +929,7 @@ rbt_remove_node( return((ib_rbt_node_t*) const_node); } -/************************************************************************ +/**********************************************************************//** Find the node that has the lowest key that is >= key. @return node satisfying the lower bound constraint or NULL */ UNIV_INTERN @@ -940,7 +963,7 @@ rbt_lower_bound( return(lb_node); } -/************************************************************************ +/**********************************************************************//** Find the node that has the greatest key that is <= key.
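The two bound lookups compose directly into range scans; a sketch over an int-keyed tree (comparator and setup as in the traversal example above):

	#include <stdio.h>
	#include "ut0rbt.h"

	/* Sketch: visit every value in [low, high]. */
	static void
	scan_range(const ib_rbt_t* tree, int low, int high)
	{
		const ib_rbt_node_t*	node;

		/* Start at the smallest key >= low. */
		for (node = rbt_lower_bound(tree, &low);
		     node != NULL && *(const int*) node->value <= high;
		     node = rbt_next(tree, node)) {

			printf("%d\n", *(const int*) node->value);
		}
	}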
@return node satisfying the upper bound constraint or NULL */ UNIV_INTERN @@ -974,7 +997,7 @@ rbt_upper_bound( return(ub_node); } -/************************************************************************ +/**********************************************************************//** Search for the key; a node will be returned in parent.last, whether it was found or not. @return value of result */ UNIV_INTERN @@ -1008,7 +1031,7 @@ rbt_search( return(parent->result); } -/************************************************************************ +/**********************************************************************//** Search for the key using the supplied comparison function; a node will be returned in parent.last, whether it was found or not. @return value of result */ @@ -1044,7 +1067,7 @@ rbt_search_cmp( return(parent->result); } -/************************************************************************ +/**********************************************************************//** Return the left most node in the tree. */ UNIV_INTERN const ib_rbt_node_t* @@ -1064,7 +1087,7 @@ rbt_first( return(first); } -/************************************************************************ +/**********************************************************************//** Return the right most node in the tree. @return the rightmost node or NULL */ UNIV_INTERN @@ -1084,7 +1107,7 @@ rbt_last( return(last); } -/************************************************************************ +/**********************************************************************//** Return the next node. @return node next from current */ UNIV_INTERN @@ -1097,7 +1120,7 @@ rbt_next( return(current ? rbt_find_successor(tree, current) : NULL); } -/************************************************************************ +/**********************************************************************//** Return the previous node. @return node prev from current */ UNIV_INTERN @@ -1110,7 +1133,7 @@ rbt_prev( return(current ? rbt_find_predecessor(tree, current) : NULL); } -/************************************************************************ +/**********************************************************************//** Reset the tree. Delete all the nodes. */ UNIV_INTERN void @@ -1124,7 +1147,7 @@ rbt_clear( tree->root->left = tree->root->right = tree->nil; } -/************************************************************************ +/**********************************************************************//** Merge the nodes from src into dst. Return the number of nodes merged. @return no. of recs merged */ UNIV_INTERN @@ -1153,7 +1176,7 @@ rbt_merge_uniq( return(n_merged); } -/************************************************************************ +/**********************************************************************//** Merge the nodes from src into dst. Return the number of nodes merged. Delete the nodes from src after copying node to dst. As a side effect the duplicates will be left untouched in the src. @@ -1201,7 +1224,7 @@ rbt_merge_uniq_destructive( return(rbt_size(dst) - old_size); } -/************************************************************************ +/**********************************************************************//** Check that every path from the root to the leaves has the same count and the tree nodes are in order.
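rbt_merge_uniq() and rbt_validate() combine naturally in debug paths; a sketch (merge_trees is a hypothetical wrapper, assuming the usual InnoDB headers for ut_a()):

	#include "ut0rbt.h"

	/* Sketch: merge src into dst, skipping keys dst already holds,
	then re-check the invariants of both trees. */
	static ulint
	merge_trees(ib_rbt_t* dst, const ib_rbt_t* src)
	{
		ulint	n_merged;

		n_merged = rbt_merge_uniq(dst, src);

		ut_a(rbt_validate(dst));
		ut_a(rbt_validate(src));

		return(n_merged);
	}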
@return TRUE if OK, FALSE otherwise */ @@ -1218,7 +1241,7 @@ rbt_validate( return(FALSE); } -/************************************************************************ +/**********************************************************************//** Iterate over the tree in depth first order. */ UNIV_INTERN void
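The final declaration is cut off in this view; its callback type, though, is the ib_rbt_print_node typedef from ut0rbt.h, so a matching node printer for the depth-first walk would look like this (sketch, int-keyed tree):

	#include <stdio.h>
	#include "ut0rbt.h"

	/* Matches: typedef void (*ib_rbt_print_node)(const ib_rbt_node_t*). */
	static void
	print_int_node(const ib_rbt_node_t* node)
	{
		fprintf(stderr, "%d\n", *(const int*) node->value);
	}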