summaryrefslogtreecommitdiff
path: root/storage
diff options
context:
space:
mode:
authorThirunarayanan Balathandayuthapani <thiru@mariadb.com>2020-06-11 22:52:47 +0530
committerThirunarayanan Balathandayuthapani <thiru@mariadb.com>2020-06-12 09:17:51 +0530
commitc92f7e287fc0e21dc1b181284b1f8e2139d1c331 (patch)
tree259aa446838140f83ec4d5f8fd8d6aa9301d2652 /storage
parent07d1c8567cbfe94398a9857c47fb9919cad42651 (diff)
downloadmariadb-git-c92f7e287fc0e21dc1b181284b1f8e2139d1c331.tar.gz
MDEV-8139 Fix Scrubbing
fil_space_t::freed_ranges: Store ranges of freed page numbers. fil_space_t::last_freed_lsn: Store the most recent LSN of freeing a page. fil_space_t::freed_range_mutex: Protects freed_ranges, last_freed_lsn. fil_space_create(): Initialize the freed_range_mutex. fil_space_free_low(): Free the freed_range_mutex. range_set: Ranges of page numbers. buf_page_create(): Remove the page from freed_ranges when the page is being reused. btr_free_root(): Remove the PAGE_INDEX_ID invalidation. Because btr_free_root() and dict_drop_index_tree() are executed in the same atomic mini-transaction, there is no need to invalidate the root page. buf_release_freed_page(): Split from buf_flush_freed_page(). Skip any I/O. buf_flush_freed_pages(): Get the freed ranges from the tablespace and punch holes or write zeroes for the freed ranges. buf_flush_try_neighbors(): Handle the flushing of freed ranges. mtr_t::m_freed_ranges: Variable to store the ranges of freed pages. mtr_t::add_freed_offset(): Add a freed page. mtr_t::clear_freed_ranges(): Clear the freed pages. mtr_t::m_freed_in_system_tablespace: Variable to indicate whether a page has been freed in the system tablespace. mtr_t::m_trim_pages: Variable to indicate whether the space has been trimmed. mtr_t::commit(): Add the freed pages and update the last freed LSN in the tablespace, and clear the freed ranges of the tablespace if the space is trimmed. file_name_t::freed_ranges: Store the freed pages during recovery. file_name_t::add_freed_page(), file_name_t::remove_freed_page(): Add and remove a freed page during recovery. store_freed_or_init_rec(): Store or remove the freed pages when encountering a FREE_PAGE or INIT_PAGE redo log record. recv_init_crash_recovery_spaces(): Add the freed pages encountered during recovery to the respective tablespace.
Diffstat (limited to 'storage')
-rw-r--r--storage/innobase/btr/btr0btr.cc27
-rw-r--r--storage/innobase/buf/buf0buf.cc14
-rw-r--r--storage/innobase/buf/buf0flu.cc93
-rw-r--r--storage/innobase/fil/fil0fil.cc5
-rw-r--r--storage/innobase/fsp/fsp0fsp.cc23
-rw-r--r--storage/innobase/include/buf0buf.h6
-rw-r--r--storage/innobase/include/fil0fil.h232
-rw-r--r--storage/innobase/include/mtr0log.h15
-rw-r--r--storage/innobase/include/mtr0mtr.h52
-rw-r--r--storage/innobase/log/log0recv.cc50
-rw-r--r--storage/innobase/mtr/mtr0mtr.cc28
-rw-r--r--storage/innobase/trx/trx0purge.cc4
12 files changed, 473 insertions, 76 deletions
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc
index ce72bdd7ef5..03eb1e076e4 100644
--- a/storage/innobase/btr/btr0btr.cc
+++ b/storage/innobase/btr/btr0btr.cc
@@ -749,11 +749,6 @@ void btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
should remain exclusively latched until mtr_t::commit() or until it
is explicitly freed from the mini-transaction. */
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
-
- /* MDEV-15528 FIXME: Zero out the page after the redo log for
- this mini-transaction has been durably written.
- This must be done unconditionally if
- srv_immediate_scrub_data_uncompressed is set. */
}
/** Set the child page number in a node pointer record.
@@ -959,9 +954,8 @@ have been called.
In a persistent tablespace, the caller must invoke fsp_init_file_page()
before mtr.commit().
@param[in,out] block index root page
-@param[in,out] mtr mini-transaction
-@param[in] invalidate whether to invalidate PAGE_INDEX_ID */
-static void btr_free_root(buf_block_t *block, mtr_t *mtr, bool invalidate)
+@param[in,out] mtr mini-transaction */
+static void btr_free_root(buf_block_t *block, mtr_t *mtr)
{
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX |
MTR_MEMO_PAGE_SX_FIX));
@@ -973,16 +967,6 @@ static void btr_free_root(buf_block_t *block, mtr_t *mtr, bool invalidate)
ut_a(btr_root_fseg_validate(PAGE_HEADER + PAGE_BTR_SEG_TOP + block->frame,
block->page.id().space()));
#endif /* UNIV_BTR_DEBUG */
- if (invalidate)
- {
- constexpr uint16_t field= PAGE_HEADER + PAGE_INDEX_ID;
-
- byte *page_index_id= my_assume_aligned<2>(field + block->frame);
- if (mtr->write<8,mtr_t::MAYBE_NOP>(*block, page_index_id,
- BTR_FREED_INDEX_ID) &&
- UNIV_LIKELY_NULL(block->page.zip.data))
- memcpy_aligned<2>(&block->page.zip.data[field], page_index_id, 8);
- }
/* Free the entire segment in small steps. */
while (!fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_TOP + block->frame, mtr));
@@ -1099,8 +1083,7 @@ btr_create(
PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr)) {
/* Not enough space for new segment, free root
segment before return. */
- btr_free_root(block, mtr,
- !index || !index->table->is_temporary());
+ btr_free_root(block, mtr);
return(FIL_NULL);
}
@@ -1250,7 +1233,7 @@ btr_free_if_exists(
btr_free_but_not_root(root, mtr->get_log_mode());
mtr->set_named_space_id(page_id.space());
- btr_free_root(root, mtr, true);
+ btr_free_root(root, mtr);
}
/** Free an index tree in a temporary tablespace.
@@ -1265,7 +1248,7 @@ void btr_free(const page_id_t page_id)
if (block) {
btr_free_but_not_root(block, MTR_LOG_NO_REDO);
- btr_free_root(block, &mtr, false);
+ btr_free_root(block, &mtr);
}
mtr.commit();
}
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index 47b4eac0ed2..f3b819056d0 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -2579,12 +2579,13 @@ void buf_page_free(const page_id_t page_id,
buf_block_t *block= reinterpret_cast<buf_block_t*>
(buf_pool.page_hash_get_low(page_id));
+ if (srv_immediate_scrub_data_uncompressed || mtr->is_page_compressed())
+ mtr->add_freed_offset(page_id);
+
if (!block || block->page.state() != BUF_BLOCK_FILE_PAGE)
{
/* FIXME: if block!=NULL, convert to BUF_BLOCK_FILE_PAGE,
but avoid buf_zip_decompress() */
- /* FIXME: If block==NULL, introduce a separate data structure
- to cover freed page ranges to augment buf_flush_freed_page() */
rw_lock_s_unlock(hash_lock);
return;
}
@@ -3793,16 +3794,20 @@ void buf_block_t::initialise(const page_id_t page_id, ulint zip_size,
from a file even if it cannot be found in the buffer buf_pool. This is one
of the functions which perform to a block a state transition NOT_USED =>
FILE_PAGE (the other is buf_page_get_gen).
-@param[in] page_id page id
+@param[in,out] space space object
+@param[in] offset page number within the tablespace
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in,out] mtr mini-transaction
@return pointer to the block, page bufferfixed */
buf_block_t*
-buf_page_create(const page_id_t page_id, ulint zip_size, mtr_t *mtr)
+buf_page_create(fil_space_t *space, uint32_t offset,
+ ulint zip_size, mtr_t *mtr)
{
+ page_id_t page_id(space->id, offset);
ut_ad(mtr->is_active());
ut_ad(page_id.space() != 0 || !zip_size);
+ space->free_page(offset, false);
buf_block_t *free_block= buf_LRU_get_free_block(false);
free_block->initialise(page_id, zip_size, 1);
@@ -3831,7 +3836,6 @@ buf_page_create(const page_id_t page_id, ulint zip_size, mtr_t *mtr)
return buf_page_get_gen(page_id, zip_size, RW_NO_LATCH,
block, BUF_GET_POSSIBLY_FREED,
__FILE__, __LINE__, mtr);
-
mutex_exit(&recv_sys.mutex);
block= buf_page_get_with_no_latch(page_id, zip_size, mtr);
mutex_enter(&recv_sys.mutex);
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
index 8306f698289..22d94762757 100644
--- a/storage/innobase/buf/buf0flu.cc
+++ b/storage/innobase/buf/buf0flu.cc
@@ -978,43 +978,22 @@ not_compressed:
This function also resets the IO_FIX to IO_NONE and making the
page status as NORMAL. It initiates the write to the file only after
releasing the page from flush list and its associated mutex.
-@param[in,out] bpage freed buffer page
-@param[in] space tablespace object of the freed page */
-static void buf_flush_freed_page(buf_page_t *bpage, const fil_space_t &space)
+@param[in,out] bpage freed buffer page */
+static void buf_release_freed_page(buf_page_t *bpage)
{
ut_ad(bpage->in_file());
const bool uncompressed= bpage->state() == BUF_BLOCK_FILE_PAGE;
- const page_id_t page_id(bpage->id());
- const auto zip_size= bpage->zip_size();
mutex_enter(&buf_pool.mutex);
bpage->set_io_fix(BUF_IO_NONE);
bpage->status= buf_page_t::NORMAL;
buf_flush_remove(bpage);
- buf_pool.stat.n_pages_written++;
- mutex_exit(&buf_pool.mutex);
if (uncompressed)
rw_lock_sx_unlock_gen(&reinterpret_cast<buf_block_t*>(bpage)->lock,
BUF_IO_WRITE);
- const bool punch_hole=
-#if defined(HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE) || defined(_WIN32)
- space.is_compressed() ||
-#endif
- false;
-
- ut_ad(space.id == page_id.space());
- ut_ad(space.zip_size() == zip_size);
-
- if (punch_hole || srv_immediate_scrub_data_uncompressed)
- {
- fil_io_t fio= fil_io(IORequestWrite, punch_hole, page_id, zip_size, 0,
- zip_size ? zip_size : srv_page_size,
- const_cast<byte*>(field_ref_zero), nullptr, false,
- punch_hole);
- if (punch_hole && fio.node)
- fio.node->space->release_for_io();
- }
+ buf_LRU_free_page(bpage, true);
+ mutex_exit(&buf_pool.mutex);
}
/** Write a flushable page from buf_pool to a file.
@@ -1192,7 +1171,7 @@ bool buf_flush_page(buf_page_t *bpage, IORequest::flush_t flush_type,
switch (status) {
default:
ut_ad(status == buf_page_t::FREED);
- buf_flush_freed_page(bpage, *space);
+ buf_release_freed_page(bpage);
goto done;
case buf_page_t::NORMAL:
use_doublewrite= space->use_doublewrite();
@@ -1322,7 +1301,64 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
return i;
}
-/** Flushes to disk all flushable pages within the flush area.
+/** Punch holes for, or write zeroes to, the page ranges that have been
+freed in a tablespace and recorded in fil_space_t::freed_ranges.
+Nothing is done unless innodb_immediate_scrub_data_uncompressed is set
+or space->is_compressed() holds, and nothing is done while
+log_sys.get_flushed_lsn() is still below the LSN of the most recent
+page free in the tablespace.
+@param[in,out]	space	tablespace whose freed ranges are consumed */
+static void buf_flush_freed_pages(fil_space_t *space)
+{
+  ut_ad(space != NULL);
+  if (!srv_immediate_scrub_data_uncompressed && !space->is_compressed())
+    return;
+  lsn_t flush_to_disk_lsn= log_sys.get_flushed_lsn();
+
+  std::unique_lock<std::mutex> freed_lock(space->freed_range_mutex);
+  if (space->freed_ranges.empty()
+      || flush_to_disk_lsn < space->get_last_freed_lsn())
+    return; /* the unique_lock releases the mutex on scope exit */
+
+  /* Take over the pending ranges so that the I/O below runs without
+  holding the mutex. A moved-from container is only guaranteed to be
+  in a valid state, so empty it explicitly. */
+  range_set freed_ranges= std::move(space->freed_ranges);
+  space->freed_ranges.clear();
+  freed_lock.unlock();
+
+  const bool punch_hole=
+#if defined(HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE) || defined(_WIN32)
+    space->is_compressed() ||
+#endif
+    false;
+
+  /* The physical page size is the same for every range. */
+  const ulint zip_size= space->zip_size();
+  const ulint page_size= zip_size ? zip_size : srv_page_size;
+
+  for (const auto &range : freed_ranges)
+  {
+    const auto n_pages= range.last - range.first + 1;
+
+    if (punch_hole)
+    {
+      /* One request covers the whole range. */
+      const auto len= n_pages * page_size;
+      const page_id_t page_id(space->id, range.first);
+      fil_io_t fio= fil_io(IORequestWrite, true, page_id, zip_size,
+                           0, len, nullptr, nullptr, false, true);
+      if (fio.node)
+        fio.node->space->release_for_io();
+    }
+    else if (srv_immediate_scrub_data_uncompressed)
+    {
+      /* Overwrite the range with zeroes, one page at a time. */
+      for (auto i= range.first; i <= range.last; i++)
+      {
+        const page_id_t page_id(space->id, i);
+        fil_io(IORequestWrite, false, page_id, zip_size, 0,
+               page_size,
+               const_cast<byte*>(field_ref_zero), nullptr, false, false);
+      }
+    }
+    buf_pool.stat.n_pages_written+= n_pages;
+  }
+}
+
+/** Flushes to disk all flushable pages within the flush area
+and also write zeroes or punch the hole for the freed ranges of pages.
@param[in] page_id page id
@param[in] flush LRU or FLUSH_LIST
@param[in] n_flushed number of pages flushed so far in this batch
@@ -1344,6 +1380,9 @@ buf_flush_try_neighbors(
return 0;
}
+ /* Flush the freed ranges while flushing the neighbors */
+ buf_flush_freed_pages(space);
+
page_id_t id = page_id;
page_id_t high = (srv_flush_neighbors != 1
|| UT_LIST_GET_LEN(buf_pool.LRU)
diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
index 5ba5b0f703e..e3fdc393564 100644
--- a/storage/innobase/fil/fil0fil.cc
+++ b/storage/innobase/fil/fil0fil.cc
@@ -1065,6 +1065,7 @@ fil_space_free_low(
rw_lock_free(&space->latch);
fil_space_destroy_crypt_data(&space->crypt_data);
+ space->~fil_space_t();
ut_free(space->name);
ut_free(space);
}
@@ -1157,7 +1158,9 @@ fil_space_create(
return(NULL);
}
- space = static_cast<fil_space_t*>(ut_zalloc_nokey(sizeof(*space)));
+ /* FIXME: if calloc() is defined as an inline function that calls
+ memset() or bzero(), then GCC 6 -flifetime-dse can optimize it away */
+ space= new (ut_zalloc_nokey(sizeof(*space))) fil_space_t;
space->id = id;
space->name = mem_strdup(name);
diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc
index 2d3f6cbc2e9..94a11778beb 100644
--- a/storage/innobase/fsp/fsp0fsp.cc
+++ b/storage/innobase/fsp/fsp0fsp.cc
@@ -554,7 +554,7 @@ void fsp_header_init(fil_space_t* space, ulint size, mtr_t* mtr)
mtr_x_lock_space(space, mtr);
const auto savepoint = mtr->get_savepoint();
- buf_block_t* block = buf_page_create(page_id, zip_size, mtr);
+ buf_block_t* block = buf_page_create(space, 0, zip_size, mtr);
mtr->sx_latch_at_savepoint(savepoint, block);
buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
@@ -875,8 +875,9 @@ fsp_fill_free_list(
if (i > 0) {
const auto savepoint = mtr->get_savepoint();
- block= buf_page_create(page_id_t(space->id, i),
- zip_size, mtr);
+ block= buf_page_create(
+ space, static_cast<uint32_t>(i),
+ zip_size, mtr);
mtr->sx_latch_at_savepoint(savepoint, block);
buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
@@ -898,8 +899,9 @@ fsp_fill_free_list(
ibuf_mtr.set_named_space(space);
block = buf_page_create(
- page_id_t(space->id,
- i + FSP_IBUF_BITMAP_OFFSET),
+ space,
+ static_cast<uint32_t>(
+ i + FSP_IBUF_BITMAP_OFFSET),
zip_size, &ibuf_mtr);
ibuf_mtr.sx_latch_at_savepoint(0, block);
buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
@@ -1059,8 +1061,9 @@ fsp_page_create(
rw_lock_type_t rw_latch,
mtr_t* mtr)
{
- buf_block_t* block = buf_page_create(page_id_t(space->id, offset),
- space->zip_size(), mtr);
+ buf_block_t* block = buf_page_create(
+ space, static_cast<uint32_t>(offset),
+ space->zip_size(), mtr);
/* The latch may already have been acquired, so we cannot invoke
mtr_t::x_latch_at_savepoint() or mtr_t::sx_latch_at_savepoint(). */
@@ -1251,7 +1254,7 @@ static void fsp_free_page(fil_space_t* space, page_no_t offset, mtr_t* mtr)
return;
}
- mtr->free(page_id_t(space->id, offset));
+ mtr->free(*space, static_cast<uint32_t>(offset));
const ulint bit = offset % FSP_EXTENT_SIZE;
@@ -2557,7 +2560,7 @@ fseg_free_page_low(
fsp_free_extent(space, offset, mtr);
}
- mtr->free(page_id_t(space->id, offset));
+ mtr->free(*space, static_cast<uint32_t>(offset));
}
/** Free a page in a file segment.
@@ -2674,7 +2677,7 @@ fseg_free_extent(
for (ulint i = 0; i < FSP_EXTENT_SIZE; i++) {
if (!xdes_is_free(descr, i)) {
buf_page_free(
- page_id_t(space->id, first_page_in_extent + 1),
+ page_id_t(space->id, first_page_in_extent + i),
mtr, __FILE__, __LINE__);
}
}
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
index 825eb7631fe..73c153cf6d7 100644
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -340,12 +340,14 @@ buf_page_get_low(
from a file even if it cannot be found in the buffer buf_pool. This is one
of the functions which perform to a block a state transition NOT_USED =>
FILE_PAGE (the other is buf_page_get_gen).
-@param[in] page_id page id
+@param[in,out] space space object
+@param[in] offset page number within the tablespace
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in,out] mtr mini-transaction
@return pointer to the block, page bufferfixed */
buf_block_t*
-buf_page_create(const page_id_t page_id, ulint zip_size, mtr_t *mtr);
+buf_page_create(fil_space_t *space, uint32_t offset,
+ ulint zip_size, mtr_t *mtr);
/********************************************************************//**
Releases a compressed-only page acquired with buf_page_get_zip(). */
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index 62228db822f..09496a2c5ca 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -37,9 +37,8 @@ Created 10/25/1995 Heikki Tuuri
#include "log0recv.h"
#include "dict0types.h"
#include "ilist.h"
-#ifdef UNIV_LINUX
-# include <set>
-#endif
+#include <set>
+#include <mutex>
struct unflushed_spaces_tag_t;
struct rotation_list_tag_t;
@@ -111,6 +110,175 @@ enum fil_type_t {
struct fil_node_t;
+/** Closed range [first, last] of 32-bit integers (page numbers) */
+struct range_t
+{
+  /** first value that belongs to the range */
+  uint32_t first;
+  /** last value that belongs to the range */
+  uint32_t last;
+};
+
+/** Order ranges by their first value */
+struct range_compare
+{
+  bool operator() (const range_t &lhs, const range_t &rhs) const
+  {
+    return lhs.first < rhs.first;
+  }
+};
+
+using range_set_t= std::set<range_t, range_compare>;
+/** Set of integers stored as ranges ordered by range_t::first.
+add_range(), add_value() and remove_value() keep the stored ranges
+disjoint and non-adjacent (neighbouring ranges are merged).
+The object performs no locking of its own. */
+class range_set
+{
+private:
+  range_set_t ranges;
+public:
+  /** Merge a range with the preceding range if the two are adjacent
+  (range->first == prev_range->last + 1); otherwise do nothing.
+  Both iterators are invalidated when a merge takes place.
+  @param[in]	range		range to be merged
+  @param[in]	prev_range	range that precedes range */
+  void merge_range(range_set_t::iterator range,
+                   range_set_t::iterator prev_range)
+  {
+    if (range->first != prev_range->last + 1)
+      return;
+
+    const range_t merged{prev_range->first, range->last};
+    ranges.erase(prev_range);
+    ranges.erase(range);
+    ranges.emplace(merged);
+  }
+
+  /** Remove a value from the interior of a range, replacing the range
+  with the two ranges on either side of the value.
+  The caller must ensure range->first < value < range->last.
+  @param[in]	range	range to be split
+  @param[in]	value	value to be removed from the range */
+  void split_range(range_set_t::iterator range, uint32_t value)
+  {
+    const range_t lower{range->first, value - 1};
+    const range_t upper{value + 1, range->last};
+
+    ranges.erase(range);
+    ranges.emplace(lower);
+    ranges.emplace(upper);
+  }
+
+  /** Remove a value from the given range. The range is erased, shrunk
+  at one end, or split in two; it is left alone if it does not contain
+  the value.
+  @param[in]	range	range to be changed
+  @param[in]	value	value to be removed */
+  void remove_within_range(range_set_t::iterator range, uint32_t value)
+  {
+    if (value < range->first || value > range->last)
+      return;
+
+    if (range->first == range->last)
+    {
+      /* The range consisted of this value only. */
+      ranges.erase(range);
+      return;
+    }
+
+    if (value != range->first && value != range->last)
+      return split_range(range, value);
+
+    range_t shrunk{range->first, range->last};
+    if (value == range->first)
+      shrunk.first++;
+    else
+      shrunk.last--;
+    ranges.erase(range);
+    ranges.emplace(shrunk);
+  }
+
+  /** Remove a value from the set; do nothing if it is not present.
+  @param[in]	value	value to be removed */
+  void remove_value(uint32_t value)
+  {
+    /* Only the last range that starts at or before the value can
+    contain it, that is, the predecessor of the first range that
+    starts after the value. */
+    const auto next= ranges.upper_bound(range_t{value, value});
+    if (next != ranges.begin())
+      remove_within_range(std::prev(next), value);
+  }
+
+  /** Make the given range cover a value that it already contains or
+  that is adjacent to one of its ends. Neighbouring ranges are not
+  examined; use add_range() or add_value() for a merging insert.
+  @param[in]	range	range to be modified
+  @param[in]	value	value to be added
+  @return the range that contains the value
+  @retval end() if the value is neither inside nor adjacent to the range */
+  range_set_t::iterator add_within_range(range_set_t::iterator range,
+                                         uint32_t value)
+  {
+    if (range->first <= value && range->last >= value)
+      return range;
+
+    range_t grown{range->first, range->last};
+    /* The ordering tests come first so that neither value - 1 nor
+    value + 1 can wrap around. */
+    if (value > range->last && value - 1 == range->last)
+      grown.last= value;
+    else if (value < range->first && value + 1 == range->first)
+      grown.first= value;
+    else
+      return ranges.end();
+    ranges.erase(range);
+    return ranges.emplace(grown).first;
+  }
+
+  /** Add a range to the set, merging it with every stored range that
+  it overlaps or touches.
+  @param[in]	new_range	range to be added; first <= last is
+				expected */
+  void add_range(range_t new_range)
+  {
+    /* First stored range that starts at or after the new range */
+    auto next= ranges.lower_bound(new_range);
+
+    if (next != ranges.begin())
+    {
+      /* The predecessor starts before the new range. Absorb it if it
+      reaches into the new range or ends right before it. The "+ 1"
+      cannot wrap, because it is only evaluated when
+      prev->last < new_range.first. */
+      const auto prev= std::prev(next);
+      if (prev->last >= new_range.first ||
+          prev->last + 1 == new_range.first)
+      {
+        new_range.first= prev->first;
+        if (prev->last > new_range.last)
+          new_range.last= prev->last;
+        next= ranges.erase(prev);
+      }
+    }
+
+    /* Absorb every following range that starts inside the new range
+    or right after it. The "- 1" cannot wrap, because it is only
+    evaluated when next->first > new_range.last. */
+    while (next != ranges.end() &&
+           (next->first <= new_range.last ||
+            next->first - 1 == new_range.last))
+    {
+      if (next->last > new_range.last)
+        new_range.last= next->last;
+      next= ranges.erase(next);
+    }
+
+    ranges.emplace_hint(next, new_range);
+  }
+
+  /** Add a single value to the set.
+  @param[in]	value	value to be added */
+  void add_value(uint32_t value)
+  {
+    add_range(range_t{value, value});
+  }
+
+  /** @return the number of stored ranges (not the number of values) */
+  size_t size() const { return ranges.size(); }
+  /** Remove all ranges */
+  void clear() { ranges.clear(); }
+  /** @return whether no range is stored */
+  bool empty() const { return ranges.empty(); }
+  /** @return iterator to the range with the smallest first value */
+  range_set_t::iterator begin() const { return ranges.begin(); }
+  /** @return past-the-end iterator */
+  range_set_t::iterator end() const { return ranges.end(); }
+};
#endif
/** Tablespace or log data space */
@@ -203,6 +371,16 @@ struct fil_space_t
punch hole */
bool punch_hole;
+ /** mutex to protect freed ranges */
+ std::mutex freed_range_mutex;
+
+ /** Ranges of freed page numbers. These can be used for writing
+ zeroes to, or punching holes in, the files. Protected by freed_range_mutex */
+ range_set freed_ranges;
+
+ /** LSN of the most recent page free. Protected by freed_range_mutex */
+ lsn_t last_freed_lsn;
+
ulint magic_n;/*!< FIL_SPACE_MAGIC_N */
/** @return whether the tablespace is about to be dropped */
@@ -314,6 +492,22 @@ struct fil_space_t
ut_ad(0);
return false;
}
+
+ /** Read last_freed_lsn. No mutex is acquired here; the caller
+ buf_flush_freed_pages() holds freed_range_mutex around the call.
+ @return last_freed_lsn */
+ lsn_t get_last_freed_lsn() { return last_freed_lsn; }
+ /** Set last_freed_lsn while holding freed_range_mutex.
+ @param[in] lsn new value; mtr_t::commit() passes its commit LSN */
+ void update_last_freed_lsn(lsn_t lsn)
+ {
+ std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+ last_freed_lsn= lsn;
+ }
+
+ /** Discard all freed ranges while holding freed_range_mutex.
+ mtr_t::commit() calls this when the mini-transaction trimmed pages. */
+ void clear_freed_ranges()
+ {
+ std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+ freed_ranges.clear();
+ }
#endif /* !UNIV_INNOCHECKSUM */
/** FSP_SPACE_FLAGS and FSP_FLAGS_MEM_ flags;
check fsp0types.h to more info about flags. */
@@ -583,6 +777,38 @@ struct fil_space_t
return(ssize == 0 || !is_ibd
|| srv_page_size != UNIV_PAGE_SIZE_ORIG);
}
+
+#ifndef UNIV_INNOCHECKSUM
+  /** Add a page to, or remove a page from, the freed ranges,
+  while holding freed_range_mutex.
+  @param[in]	offset	page number
+  @param[in]	add	true to add the page to the freed ranges,
+			false to remove it from them */
+  void free_page(uint32_t offset, bool add=true)
+  {
+    std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+    if (!add)
+    {
+      /* Nothing can be removed from an empty set. */
+      if (!freed_ranges.empty())
+        freed_ranges.remove_value(offset);
+      return;
+    }
+
+    freed_ranges.add_value(offset);
+  }
+
+ /** Replace the freed ranges with the given set, while holding
+ freed_range_mutex. The move assignment discards any ranges that
+ were stored before the call.
+ @param[in] ranges new set of freed page ranges */
+ void add_free_ranges(range_set ranges)
+ {
+ std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+ freed_ranges= std::move(ranges);
+ }
+
+ /** Add one range of freed pages to the freed ranges, while holding
+ freed_range_mutex.
+ @param[in] range range of page numbers to be added */
+ void add_free_range(const range_t range)
+ {
+ std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+ freed_ranges.add_range(range);
+ }
+#endif /*!UNIV_INNOCHECKSUM */
+
};
#ifndef UNIV_INNOCHECKSUM
diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h
index afb9456ff30..cf5f7c751ee 100644
--- a/storage/innobase/include/mtr0log.h
+++ b/storage/innobase/include/mtr0log.h
@@ -524,11 +524,19 @@ inline void mtr_t::init(buf_block_t *b)
}
/** Free a page.
-@param id page identifier */
-inline void mtr_t::free(const page_id_t id)
+@param[in] space tablespace contains page to be freed
+@param[in] offset page offset to be freed */
+inline void mtr_t::free(fil_space_t &space, uint32_t offset)
{
+ page_id_t freed_page_id(space.id, offset);
if (m_log_mode == MTR_LOG_ALL)
- m_log.close(log_write<FREE_PAGE>(id, nullptr));
+ m_log.close(log_write<FREE_PAGE>(freed_page_id, nullptr));
+
+ ut_ad(!m_user_space || m_user_space == &space);
+ if (&space == fil_system.sys_space)
+ freed_system_tablespace_page();
+ else
+ m_user_space= &space;
}
/** Write an EXTENDED log record.
@@ -651,4 +659,5 @@ inline void mtr_t::trim_pages(const page_id_t id)
byte *l= log_write<EXTENDED>(id, nullptr, 1, true);
*l++= TRIM_PAGES;
m_log.close(l);
+ set_trim_pages();
}
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
index 7cc0939d115..cd1b9bef4aa 100644
--- a/storage/innobase/include/mtr0mtr.h
+++ b/storage/innobase/include/mtr0mtr.h
@@ -312,6 +312,24 @@ public:
/** @return true if we are inside the change buffer code */
bool is_inside_ibuf() const { return m_inside_ibuf; }
+ /** Note that system tablespace page has been freed. */
+ void freed_system_tablespace_page() { m_freed_in_system_tablespace= true; }
+
+ /** Note that pages have been trimmed */
+ void set_trim_pages() { m_trim_pages= true; }
+
+ /** @return true if pages have been trimmed */
+ bool is_trim_pages() { return m_trim_pages; }
+
+ /** @return whether a page_compressed table was modified, that is,
+ whether m_user_space is set and its is_compressed() holds.
+ Always false when neither HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
+ nor _WIN32 is defined. */
+ bool is_page_compressed() const
+ {
+#if defined(HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE) || defined(_WIN32)
+ return m_user_space && m_user_space->is_compressed();
+#else
+ return false;
+#endif
+ }
#ifdef UNIV_DEBUG
/** Check if we are holding an rw-latch in this mini-transaction
@param lock latch to search for
@@ -348,6 +366,12 @@ public:
/** @return the memo stack */
mtr_buf_t* get_memo() { return &m_memo; }
+
+ /** @return true if system tablespace page has been freed */
+ bool is_freed_system_tablespace_page()
+ {
+ return m_freed_in_system_tablespace;
+ }
#endif /* UNIV_DEBUG */
/** @return true if a record was added to the mini-transaction */
@@ -470,8 +494,9 @@ public:
@param[in,out] b buffer page */
void init(buf_block_t *b);
/** Free a page.
- @param id page identifier */
- inline void free(const page_id_t id);
+ @param[in] space tablespace contains page to be freed
+ @param[in] offset page offset to be freed */
+ inline void free(fil_space_t &space, uint32_t offset);
/** Write log for partly initializing a B-tree or R-tree page.
@param block B-tree or R-tree page
@param comp false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
@@ -551,6 +576,20 @@ public:
const char *path,
const char *new_path= nullptr);
+ /** Remember the page number of a freed page in m_freed_ranges.
+ @param[in] id identifier of the freed page; debug builds assert that
+ its tablespace matches m_user_space when that is set */
+ void add_freed_offset(page_id_t id)
+ {
+ ut_ad(m_user_space == NULL || id.space() == m_user_space->id);
+ m_freed_ranges.add_value(id.page_no());
+ }
+
+  /** Forget the freed pages collected in m_freed_ranges and reset the
+  "freed in system tablespace" and "pages trimmed" flags. */
+  void clear_freed_ranges()
+  {
+    m_trim_pages= false;
+    m_freed_in_system_tablespace= false;
+    m_freed_ranges.clear();
+  }
private:
/** Log a write of a byte string to a page.
@param block buffer page
@@ -621,6 +660,12 @@ private:
to suppress some read-ahead operations, @see ibuf_inside() */
uint16_t m_inside_ibuf:1;
+ /** whether the page has been freed in system tablespace */
+ uint16_t m_freed_in_system_tablespace:1;
+
+ /** whether the pages has been trimmed */
+ uint16_t m_trim_pages:1;
+
#ifdef UNIV_DEBUG
/** Persistent user tablespace associated with the
mini-transaction, or 0 (TRX_SYS_SPACE) if none yet */
@@ -638,6 +683,9 @@ private:
/** LSN at commit time */
lsn_t m_commit_lsn;
+
+ /** set of freed page ids */
+ range_set m_freed_ranges;
};
#include "mtr0mtr.ic"
diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc
index 5c6b665a604..5dc9e70df63 100644
--- a/storage/innobase/log/log0recv.cc
+++ b/storage/innobase/log/log0recv.cc
@@ -543,11 +543,24 @@ struct file_name_t {
/** FSP_SIZE of tablespace */
ulint size;
+ /** Freed pages of tablespace */
+ range_set freed_ranges;
+
/** Constructor */
file_name_t(std::string name_, bool deleted)
: name(std::move(name_)), space(NULL),
status(deleted ? DELETED: NORMAL),
size(0) {}
+
+ /** Remember that a page of this tablespace was freed.
+ @param[in] page_no page number to add to freed_ranges */
+ void add_freed_page(uint32_t page_no) { freed_ranges.add_value(page_no); }
+
+  /** Remove a page from the freed pages of this tablespace;
+  nothing is done when no freed page is stored.
+  @param[in]	page_no	page number to remove from freed_ranges */
+  void remove_freed_page(uint32_t page_no)
+  {
+    if (!freed_ranges.empty())
+      freed_ranges.remove_value(page_no);
+  }
};
/** Map of dirty tablespaces during recovery */
@@ -1764,6 +1777,34 @@ append:
log_phys_t(start_lsn, lsn, l, len));
}
+/** Record that a page was freed or initialized, as seen in a FREE_PAGE
+or INIT_PAGE redo log record.
+For a predefined tablespace, the freed ranges of the tablespace object
+are updated directly. For any other tablespace, the change is stored in
+the file_name_t entry of recv_spaces, if such an entry exists.
+@param[in]	page_id	freed or initialized page
+@param[in]	freed	true if the page was freed (it is added),
+			false if it was initialized (it is removed) */
+static void store_freed_or_init_rec(page_id_t page_id, bool freed)
+{
+  const uint32_t space_id= page_id.space();
+  const uint32_t page_no= page_id.page_no();
+  if (is_predefined_tablespace(space_id))
+  {
+    fil_space_t *space= space_id == TRX_SYS_SPACE
+      ? fil_system.sys_space
+      : fil_space_get(space_id);
+
+    /* NOTE(review): fil_space_get() presumably returns a null pointer
+    when the tablespace is not found; skip the record in that case
+    instead of dereferencing it. */
+    if (space)
+      space->free_page(page_no, freed);
+    return;
+  }
+
+  const recv_spaces_t::iterator i= recv_spaces.find(space_id);
+  if (i == recv_spaces.end())
+    return;
+
+  if (freed)
+    i->second.add_freed_page(page_no);
+  else
+    i->second.remove_freed_page(page_no);
+}
/** Parse and register one mini-transaction in log_t::FORMAT_10_5.
@param checkpoint_lsn the log sequence number of the latest checkpoint
@@ -1963,6 +2004,7 @@ same_page:
case INIT_PAGE:
last_offset= FIL_PAGE_TYPE;
free_or_init_page:
+ store_freed_or_init_rec(id, (b & 0x70) == FREE_PAGE);
if (UNIV_UNLIKELY(rlen != 0))
goto record_corrupted;
break;
@@ -2531,7 +2573,7 @@ inline buf_block_t *recv_sys_t::recover_low(const page_id_t page_id,
{
mtr.start();
mtr.set_log_mode(MTR_LOG_NONE);
- block= buf_page_create(page_id, space->zip_size(), &mtr);
+ block= buf_page_create(space, page_id.page_no(), space->zip_size(), &mtr);
p= recv_sys.pages.find(page_id);
if (p == recv_sys.pages.end())
{
@@ -3240,6 +3282,12 @@ recv_init_crash_recovery_spaces(bool rescan, bool& missing_tablespace)
/* The tablespace was found, and there
are some redo log records for it. */
fil_names_dirty(rs.second.space);
+
+ /* Add the freed page ranges in the respective
+ tablespace */
+ if (!rs.second.freed_ranges.empty())
+ rs.second.space->add_free_ranges(
+ std::move(rs.second.freed_ranges));
} else if (rs.second.name == "") {
ib::error() << "Missing FILE_CREATE, FILE_DELETE"
" or FILE_MODIFY before FILE_CHECKPOINT"
diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc
index 8ca0fe65f1e..32e31ee84f4 100644
--- a/storage/innobase/mtr/mtr0mtr.cc
+++ b/storage/innobase/mtr/mtr0mtr.cc
@@ -372,6 +372,7 @@ void mtr_t::start()
ut_d(m_user_space_id= TRX_SYS_SPACE);
m_user_space= nullptr;
m_commit_lsn= 0;
+ m_freed_in_system_tablespace= m_trim_pages= false;
}
/** Release the resources */
@@ -381,6 +382,7 @@ inline void mtr_t::release_resources()
ut_d(m_memo.for_each_block_in_reverse(CIterate<DebugCheck>()));
m_log.erase();
m_memo.erase();
+ clear_freed_ranges();
ut_d(m_commit= true);
}
@@ -413,6 +415,30 @@ void mtr_t::commit()
to insert into the flush list. */
log_mutex_exit();
+ if (!m_freed_ranges.empty())
+ {
+ fil_space_t *freed_space= m_user_space;
+ /* Get the freed tablespace in case of predefined tablespace */
+ if (!freed_space)
+ {
+ ut_ad(is_freed_system_tablespace_page());
+ freed_space= fil_system.sys_space;
+ }
+
+ ut_ad(memo_contains(freed_space->latch, MTR_MEMO_X_LOCK));
+ /* Update the last freed lsn */
+ freed_space->update_last_freed_lsn(m_commit_lsn);
+
+ for (const auto &range : m_freed_ranges)
+ freed_space->add_free_range(range);
+ }
+
+ if (is_trim_pages())
+ {
+ ut_ad(m_user_space != nullptr);
+ m_user_space->clear_freed_ranges();
+ }
+
m_memo.for_each_block_in_reverse(CIterate<const ReleaseBlocks>
(ReleaseBlocks(start_lsn, m_commit_lsn)));
if (m_made_dirty)
@@ -441,6 +467,8 @@ void mtr_t::commit_files(lsn_t checkpoint_lsn)
ut_ad(!m_made_dirty);
ut_ad(m_memo.size() == 0);
ut_ad(!srv_read_only_mode);
+ ut_ad(m_freed_ranges.empty());
+ ut_ad(!m_freed_in_system_tablespace);
if (checkpoint_lsn) {
byte* ptr = m_log.push<byte*>(SIZE_OF_FILE_CHECKPOINT);
diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc
index c37a8b98cbd..6747ada6de4 100644
--- a/storage/innobase/trx/trx0purge.cc
+++ b/storage/innobase/trx/trx0purge.cc
@@ -696,6 +696,10 @@ not_free:
const ulint size = SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
mtr.start();
mtr_x_lock_space(purge_sys.truncate.current, &mtr);
+ /* Associate the undo tablespace with mtr.
+ During mtr::commit(), InnoDB can use the undo
+ tablespace object to clear all freed ranges */
+ mtr.set_named_space(purge_sys.truncate.current);
mtr.trim_pages(page_id_t(space.id, size));
fsp_header_init(purge_sys.truncate.current, size, &mtr);
mutex_enter(&fil_system.mutex);