diff options
author | Oleksandr Byelkin <sanja@mariadb.com> | 2020-06-10 21:14:13 +0300 |
---|---|---|
committer | Monty <monty@mariadb.org> | 2020-06-14 19:39:43 +0300 |
commit | d7a9cdc627e59b3dbb91c8878ab74d62e0fdb3ce (patch) | |
tree | 2475063d16dcdf591792dcbe9791afa23dc09a9d /storage | |
parent | b3179b7e3294b8e6869283b69d1ab811b7a86944 (diff) | |
download | mariadb-git-d7a9cdc627e59b3dbb91c8878ab74d62e0fdb3ce.tar.gz |
Fixed hang in Aria page cache with concurrent SELECT
MDEV-20302 Server hangs upon concurrent SELECT from partitioned S3
Diffstat (limited to 'storage')
-rw-r--r-- | storage/maria/ma_pagecache.c | 124 |
1 files changed, 90 insertions, 34 deletions
diff --git a/storage/maria/ma_pagecache.c b/storage/maria/ma_pagecache.c index b3bbee4cc22..a1796f52819 100644 --- a/storage/maria/ma_pagecache.c +++ b/storage/maria/ma_pagecache.c @@ -130,7 +130,7 @@ my_bool my_disable_flush_pagecache_blocks= 0; #define COND_FOR_REQUESTED 0 /* queue of thread waiting for read operation */ #define COND_FOR_SAVED 1 /* queue of thread waiting for flush */ #define COND_FOR_WRLOCK 2 /* queue of write lock */ -#define COND_FOR_BIG_BLOCK 3 /* queue of waiting fo big block read */ +#define COND_FOR_BIG_BLOCK 3 /* queue of waiting for big block read */ #define COND_SIZE 4 /* number of COND_* queues */ typedef mysql_cond_t KEYCACHE_CONDVAR; @@ -178,7 +178,9 @@ struct st_pagecache_hash_link #define PCBLOCK_CHANGED 32 /* block buffer contains a dirty page */ #define PCBLOCK_DIRECT_W 64 /* possible direct write to the block */ #define PCBLOCK_DEL_WRITE 128 /* should be written on delete */ -#define PCBLOCK_BIG_READ 256 /* the first block of the big read in progress */ +#define PCBLOCK_BIG_READ 256 /* the first block of the big read in progress + or not first block which other thread wait + to be read in big read operation */ /* page status, returned by find_block */ #define PAGE_READ 0 @@ -2770,7 +2772,7 @@ retry: */ #ifdef WITH_S3_STORAGE_ENGINE -static my_bool read_big_block(PAGECACHE *pagecache, +static void read_big_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block) { int page_st; @@ -2809,25 +2811,26 @@ static my_bool read_big_block(PAGECACHE *pagecache, if (block_to_read->status & PCBLOCK_ERROR) { /* We get first block with an error so all operation failed */ - block->status|= PCBLOCK_ERROR; - block->error= block_to_read->error; - DBUG_RETURN(FALSE); // no retry + goto error; } - // only primary request here, PAGE_WAIT_TO_BE_READ is impossible - DBUG_ASSERT(page_st != PAGE_WAIT_TO_BE_READ); if (block_to_read->status & PCBLOCK_BIG_READ) { + /* + Other thread is reading the big block so we will wait when it will + have read our block for us + */ struct st_my_thread_var *thread; + DBUG_ASSERT(page_st == PAGE_WAIT_TO_BE_READ); DBUG_ASSERT(page_st != PAGE_TO_BE_READ); + block->status|= PCBLOCK_BIG_READ; // will be read by other thread /* Block read failed because somebody else is reading the first block (and all other blocks part of this one). Wait until block is available. */ - unreg_request(pagecache, block, 1); thread= my_thread_var; /* Put the request into a queue and wait until it can be processed */ - wqueue_add_to_queue(&block->wqueue[COND_FOR_BIG_BLOCK], thread); + wqueue_add_to_queue(&block_to_read->wqueue[COND_FOR_BIG_BLOCK], thread); do { DBUG_PRINT("wait", @@ -2837,7 +2840,21 @@ static my_bool read_big_block(PAGECACHE *pagecache, &pagecache->cache_lock); } while (thread->next); - DBUG_RETURN(TRUE); + // page shoud be read by other thread + DBUG_ASSERT(block->status & PCBLOCK_READ || + block->status & PCBLOCK_ERROR); + DBUG_ASSERT(block->status & PCBLOCK_BIG_READ); + block->status&= ~PCBLOCK_BIG_READ; + // all is read => lets finish nice + DBUG_ASSERT(block_to_read != block); + remove_reader(block_to_read); + unreg_request(pagecache, block_to_read, 1); + DBUG_VOID_RETURN; + } + else + { + // only primary request here, PAGE_WAIT_TO_BE_READ is impossible + DBUG_ASSERT(page_st != PAGE_WAIT_TO_BE_READ); } } else @@ -2863,18 +2880,9 @@ static my_bool read_big_block(PAGECACHE *pagecache, { pagecache_pthread_mutex_lock(&pagecache->cache_lock); block_to_read->status|= PCBLOCK_ERROR; - block->status|= PCBLOCK_ERROR; - block_to_read->error= block->error= (int16) my_errno; + block_to_read->error= (int16) my_errno; pagecache->big_block_free(&data); - if (block_to_read != block) - { - remove_reader(block_to_read); - unreg_request(pagecache, block_to_read, 1); - } - /* Signal that all pending requests for this page now can be processed */ - if (block->wqueue[COND_FOR_REQUESTED].last_thread) - wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]); - DBUG_RETURN(FALSE); // no retry + goto error; } /* @@ -2914,12 +2922,18 @@ static my_bool read_big_block(PAGECACHE *pagecache, TRUE /*register*/, TRUE /*fast*/, &page_st); if (!bl) { - // we run out of easy avaliable pages in the cache - break; + /* + We can not get this page easy. + Maybe we will be lucky with other pages, + also among other pages can be page which wated by other thread + */ + continue; } DBUG_ASSERT(bl == bl->hash_link->block); if ((bl->status & PCBLOCK_ERROR) == 0 && - page_st == PAGE_TO_BE_READ) + (page_st == PAGE_TO_BE_READ || // page shoud be read + (page_st == PAGE_WAIT_TO_BE_READ && + (bl->status & PCBLOCK_BIG_READ)))) // or page waited by other thread { memcpy(bl->buffer, data.str + offset, pagecache->block_size); bl->status|= PCBLOCK_READ; @@ -2940,6 +2954,8 @@ static my_bool read_big_block(PAGECACHE *pagecache, pagecache->big_block_free(&data); block_to_read->status&= ~PCBLOCK_BIG_READ; + +end: if (block_to_read != block) { remove_reader(block_to_read); @@ -2947,10 +2963,56 @@ static my_bool read_big_block(PAGECACHE *pagecache, } if (block->wqueue[COND_FOR_BIG_BLOCK].last_thread) wqueue_release_queue(&block->wqueue[COND_FOR_BIG_BLOCK]); + /* Signal that all pending requests for this page now can be processed */ if (block->wqueue[COND_FOR_REQUESTED].last_thread) wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]); + DBUG_VOID_RETURN; - DBUG_RETURN(FALSE); +error: + /* + Read failed. Mark all readers waiting for the a block covered by the + big block that the read failed + */ + for (offset= pagecache->block_size, page= page_to_read + 1; + offset < data.length; + offset+= pagecache->block_size, page++) + { + DBUG_ASSERT(offset + pagecache->block_size <= data.length); + if (page == our_page) + { + DBUG_ASSERT(!(block->status & PCBLOCK_READ)); + block->status|= PCBLOCK_ERROR; + block->error= (int16) my_errno; + } + else + { + PAGECACHE_BLOCK_LINK *bl; + bl= find_block(pagecache, &block->hash_link->file, page, 1, + FALSE, TRUE /* copy under protection (?)*/, + TRUE /*register*/, TRUE /*fast*/, &page_st); + if (!bl) + { + /* + We can not get this page easy. + Maybe we will be lucky with other pages, + also among other pages can be page which wated by other thread + */ + continue; + } + DBUG_ASSERT(bl == bl->hash_link->block); + if ((bl->status & PCBLOCK_ERROR) == 0 && + (page_st == PAGE_TO_BE_READ || // page shoud be read + (page_st == PAGE_WAIT_TO_BE_READ && + (bl->status & PCBLOCK_BIG_READ)))) // or page waited by other thread + { + bl->status|= PCBLOCK_ERROR; + bl->error= (int16) my_errno; + } + remove_reader(bl); + unreg_request(pagecache, bl, 1); + } + } + goto end; } #endif /* WITH_S3_STORAGE_ENGINE */ @@ -3706,14 +3768,8 @@ restart: /* It is big read and this thread should read */ DBUG_ASSERT(page_st == PAGE_TO_BE_READ); - if (read_big_block(pagecache, block)) - { - /* block is unregistered in read_big_block */ - pagecache_pthread_mutex_unlock(&pagecache->cache_lock); - dec_counter_for_resize_op(pagecache); - DBUG_PRINT("restart", ("big block fail, restarting...")); - goto restart; - } + read_big_block(pagecache, block); + if (!((new_pin == PAGECACHE_PIN_LEFT_UNPINNED) || (new_pin == PAGECACHE_PIN))) { |