summaryrefslogtreecommitdiff
path: root/storage/innobase/include/log0recv.h
blob: 57c2f0b54335eb06ba3dc31731bd9825f3ca52d0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
/*****************************************************************************

Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2017, 2021, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file include/log0recv.h
Recovery

Created 9/20/1997 Heikki Tuuri
*******************************************************/

#pragma once

#include "ut0new.h"
#include "buf0types.h"
#include "log0log.h"
#include "mtr0types.h"

#include <deque>
#include <map>

/** @return whether recovery is currently running. */
#define recv_recovery_is_on() UNIV_UNLIKELY(recv_sys.recovery_on)

/** Find the latest checkpoint in the log header.
@param[out]	max_field	LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2
@return error code or DB_SUCCESS */
dberr_t
recv_find_max_checkpoint(ulint* max_field)
	MY_ATTRIBUTE((nonnull, warn_unused_result));

/** Apply any buffered redo log to a page that was just read from a data file.
@param[in,out]	space	tablespace
@param[in,out]	bpage	buffer pool page */
ATTRIBUTE_COLD void recv_recover_page(fil_space_t* space, buf_page_t* bpage)
	MY_ATTRIBUTE((nonnull));

/** Start recovering from a redo log checkpoint.
@param[in]	flush_lsn	FIL_PAGE_FILE_FLUSH_LSN
of first system tablespace page
@return error code or DB_SUCCESS */
dberr_t
recv_recovery_from_checkpoint_start(
	lsn_t	flush_lsn);

/** Whether to store redo log records in recv_sys.pages */
enum store_t {
	/** Do not store redo log records. */
	STORE_NO,
	/** Store redo log records. */
	STORE_YES,
	/** Store redo log records if the tablespace exists. */
	STORE_IF_EXISTS
};


/** Adds data from a new log block to the parsing buffer of recv_sys if
recv_sys.parse_start_lsn is non-zero.
@param[in]	log_block	log block to add
@param[in]	scanned_lsn	lsn of how far we were able to find
				data in this log block
@return true if more data added */
bool recv_sys_add_to_parsing_buf(const byte* log_block, lsn_t scanned_lsn);

/** Moves the parsing buffer data left to the buffer start */
void recv_sys_justify_left_parsing_buf();

/** Report an operation to create, delete, or rename a file during backup.
@param[in]	space_id	tablespace identifier
@param[in]	create		whether the file is being created
@param[in]	name		file name (not NUL-terminated)
@param[in]	len		length of name, in bytes
@param[in]	new_name	new file name (NULL if not rename)
@param[in]	new_len		length of new_name, in bytes (0 if NULL) */
extern void (*log_file_op)(ulint space_id, bool create,
			   const byte* name, ulint len,
			   const byte* new_name, ulint new_len);

/** Stored redo log record */
struct log_rec_t
{
  log_rec_t(lsn_t lsn) : next(nullptr), lsn(lsn) { ut_ad(lsn); }
  log_rec_t()= delete;
  log_rec_t(const log_rec_t&)= delete;
  log_rec_t &operator=(const log_rec_t&)= delete;

  /** next record */
  log_rec_t *next;
  /** mtr_t::commit_lsn() of the mini-transaction */
  const lsn_t lsn;
};

struct recv_dblwr_t
{
  /** Add a page frame to the doublewrite recovery buffer. */
  void add(byte *page) { pages.push_front(page); }

  /** Validate the page.
  @param page_id  page identifier
  @param page     page contents
  @param space    the tablespace of the page (not available for page 0)
  @param tmp_buf  2*srv_page_size for decrypting and decompressing any
  page_compressed or encrypted pages
  @return whether the page is valid */
  bool validate_page(const page_id_t page_id, const byte *page,
                     const fil_space_t *space, byte *tmp_buf);

  /** Find a doublewrite copy of a page.
  @param page_id  page identifier
  @param space    tablespace (not available for page_id.page_no()==0)
  @param tmp_buf  2*srv_page_size for decrypting and decompressing any
  page_compressed or encrypted pages
  @return page frame
  @retval NULL if no valid page for page_id was found */
  byte* find_page(const page_id_t page_id, const fil_space_t *space= NULL,
                  byte *tmp_buf= NULL);

  typedef std::deque<byte*, ut_allocator<byte*> > list;

  /** Recovered doublewrite buffer page frames */
  list pages;
};

/** the recovery state and buffered records for a page */
struct page_recv_t
{
  /** Recovery state; protected by recv_sys.mutex */
  enum
  {
    /** not yet processed */
    RECV_NOT_PROCESSED,
    /** not processed; the page will be reinitialized */
    RECV_WILL_NOT_READ,
    /** page is being read */
    RECV_BEING_READ,
    /** log records are being applied on the page */
    RECV_BEING_PROCESSED
  } state= RECV_NOT_PROCESSED;
  /** Latest written byte offset when applying the log records.
  @see mtr_t::m_last_offset */
  uint16_t last_offset= 1;
  /** log records for a page */
  class recs_t
  {
    /** The first log record */
    log_rec_t *head= nullptr;
    /** The last log record */
    log_rec_t *tail= nullptr;
    friend struct page_recv_t;
  public:
    /** Append a redo log snippet for the page
    @param recs log snippet */
    void append(log_rec_t* recs)
    {
      if (tail)
        tail->next= recs;
      else
        head= recs;
      tail= recs;
    }

    /** @return the last log snippet */
    const log_rec_t* last() const { return tail; }
    /** @return the last log snippet */
    log_rec_t* last() { return tail; }

    class iterator
    {
      log_rec_t *cur;
    public:
      iterator(log_rec_t* rec) : cur(rec) {}
      log_rec_t* operator*() const { return cur; }
      iterator &operator++() { cur= cur->next; return *this; }
      bool operator!=(const iterator& i) const { return cur != i.cur; }
    };
    iterator begin() { return head; }
    iterator end() { return NULL; }
    bool empty() const { ut_ad(!head == !tail); return !head; }
    /** Clear and free the records; @see recv_sys_t::alloc() */
    inline void clear();
  } log;

  /** Trim old log records for a page.
  @param start_lsn oldest log sequence number to preserve
  @return whether all the log for the page was trimmed */
  inline bool trim(lsn_t start_lsn);
  /** Ignore any earlier redo log records for this page. */
  inline void will_not_read();
  /** @return whether the log records for the page are being processed */
  bool is_being_processed() const { return state == RECV_BEING_PROCESSED; }
};

/** Recovery system data structure */
struct recv_sys_t
{
  /** mutex protecting apply_log_recs and page_recv_t::state */
  mysql_mutex_t mutex;
private:
  /** condition variable for
  !apply_batch_on || pages.empty() || found_corrupt_log || found_corrupt_fs */
  pthread_cond_t cond;
  /** whether recv_apply_hashed_log_recs() is running */
  bool apply_batch_on;
  /** set when finding a corrupt log block or record, or there is a
  log parsing buffer overflow */
  bool found_corrupt_log;
  /** set when an inconsistency with the file system contents is detected
  during log scan or apply */
  bool found_corrupt_fs;
public:
  /** whether we are applying redo log records during crash recovery */
  bool recovery_on;
  /** whether recv_recover_page(), invoked from buf_page_t::read_complete(),
  should apply log records*/
  bool apply_log_recs;
	byte*		buf;	/*!< buffer for parsing log records */
	ulint		len;	/*!< amount of data in buf */
	lsn_t		parse_start_lsn;
				/*!< this is the lsn from which we were able to
				start parsing log records and adding them to
				pages; zero if a suitable
				start point not found yet */
	lsn_t		scanned_lsn;
				/*!< the log data has been scanned up to this
				lsn */
	ulint		scanned_checkpoint_no;
				/*!< the log data has been scanned up to this
				checkpoint number (lowest 4 bytes) */
	ulint		recovered_offset;
				/*!< start offset of non-parsed log records in
				buf */
	lsn_t		recovered_lsn;
				/*!< the log records have been parsed up to
				this lsn */
	lsn_t		mlog_checkpoint_lsn;
				/*!< the LSN of a FILE_CHECKPOINT
				record, or 0 if none was parsed */
	/** the time when progress was last reported */
	time_t		progress_time;

  using map = std::map<const page_id_t, page_recv_t,
                       std::less<const page_id_t>,
                       ut_allocator<std::pair<const page_id_t, page_recv_t>>>;
  /** buffered records waiting to be applied to pages */
  map pages;

private:
  /** Process a record that indicates that a tablespace size is being shrunk.
  @param page_id first page that is not in the file
  @param lsn     log sequence number of the shrink operation */
  inline void trim(const page_id_t page_id, lsn_t lsn);

  /** Undo tablespaces for which truncate has been logged
  (indexed by page_id_t::space() - srv_undo_space_id_start) */
  struct trunc
  {
    /** log sequence number of FILE_CREATE, or 0 if none */
    lsn_t lsn;
    /** truncated size of the tablespace, or 0 if not truncated */
    unsigned pages;
  } truncated_undo_spaces[127];

public:
  /** The contents of the doublewrite buffer */
  recv_dblwr_t dblwr;

  /** Last added LSN to pages. */
  lsn_t last_stored_lsn= 0;

  void read(os_offset_t offset, span<byte> buf);
  inline size_t files_size();
  void close_files() { files.clear(); files.shrink_to_fit(); }

private:
  /** Attempt to initialize a page based on redo log records.
  @param page_id  page identifier
  @param p        iterator pointing to page_id
  @param mtr      mini-transaction
  @param b        pre-allocated buffer pool block
  @return whether the page was successfully initialized */
  inline buf_block_t *recover_low(const page_id_t page_id, map::iterator &p,
                                  mtr_t &mtr, buf_block_t *b);
  /** Attempt to initialize a page based on redo log records.
  @param page_id  page identifier
  @return the recovered block
  @retval nullptr if the page cannot be initialized based on log records */
  buf_block_t *recover_low(const page_id_t page_id);

  /** All found log files (multiple ones are possible if we are upgrading
  from before MariaDB Server 10.5.1) */
  std::vector<log_file_t> files;

  void open_log_files_if_needed();

  /** Base node of the redo block list.
  List elements are linked via buf_block_t::unzip_LRU. */
  UT_LIST_BASE_NODE_T(buf_block_t) blocks;
public:
  /** Check whether the number of read redo log blocks exceeds the maximum.
  Store last_stored_lsn if the recovery is not in the last phase.
  @param[in,out] store    whether to store page operations
  @return whether the memory is exhausted */
  inline bool is_memory_exhausted(store_t *store);
  /** Apply buffered log to persistent data pages.
  @param last_batch     whether it is possible to write more redo log */
  void apply(bool last_batch);

#ifdef UNIV_DEBUG
  /** whether all redo log in the current batch has been applied */
  bool after_apply= false;
#endif
  /** Initialize the redo log recovery subsystem. */
  void create();

  /** Free most recovery data structures. */
  void debug_free();

  /** Clean up after create() */
  void close();

  bool is_initialised() const { return last_stored_lsn != 0; }

  /** Register a redo log snippet for a page.
  @param page_id  page identifier
  @param start_lsn start LSN of the mini-transaction
  @param lsn      @see mtr_t::commit_lsn()
  @param l        redo log snippet @see log_t::FORMAT_10_5
  @param len      length of l, in bytes */
  inline void add(const page_id_t page_id, lsn_t start_lsn, lsn_t lsn,
                  const byte *l, size_t len);

  /** Parse and register one mini-transaction in log_t::FORMAT_10_5.
  @param checkpoint_lsn  the log sequence number of the latest checkpoint
  @param store           whether to store the records
  @param apply           whether to apply file-level log records
  @return whether FILE_CHECKPOINT record was seen the first time,
  or corruption was noticed */
  bool parse(lsn_t checkpoint_lsn, store_t *store, bool apply);

  /** Clear a fully processed set of stored redo log records. */
  inline void clear();

  /** Determine whether redo log recovery progress should be reported.
  @param time  the current time
  @return whether progress should be reported
  (the last report was at least 15 seconds ago) */
  bool report(time_t time)
  {
    if (time - progress_time < 15)
      return false;

    progress_time= time;
    return true;
  }

  /** The alloc() memory alignment, in bytes */
  static constexpr size_t ALIGNMENT= sizeof(size_t);

  /** Allocate memory for log_rec_t
  @param len  allocation size, in bytes
  @return pointer to len bytes of memory (never NULL) */
  inline void *alloc(size_t len);

  /** Free a redo log snippet.
  @param data buffer returned by alloc() */
  inline void free(const void *data);

  /** Remove records for a corrupted page.
  This function should only be called when innodb_force_recovery is set.
  @param page_id  corrupted page identifier */
  ATTRIBUTE_COLD void free_corrupted_page(page_id_t page_id);

  /** Flag data file corruption during recovery. */
  ATTRIBUTE_COLD void set_corrupt_fs();
  /** Flag log file corruption during recovery. */
  ATTRIBUTE_COLD void set_corrupt_log();
  /** Possibly finish a recovery batch. */
  inline void maybe_finish_batch();

  /** @return whether data file corruption was found */
  bool is_corrupt_fs() const { return UNIV_UNLIKELY(found_corrupt_fs); }
  /** @return whether log file corruption was found */
  bool is_corrupt_log() const { return UNIV_UNLIKELY(found_corrupt_log); }

  /** Attempt to initialize a page based on redo log records.
  @param page_id  page identifier
  @return the recovered block
  @retval nullptr if the page cannot be initialized based on log records */
  buf_block_t *recover(const page_id_t page_id)
  {
    return UNIV_UNLIKELY(recovery_on) ? recover_low(page_id) : nullptr;
  }

  /** Try to recover a tablespace that was not readable earlier
  @param p          iterator, initially pointing to page_id_t{space_id,0};
                    the records will be freed and the iterator advanced
  @param name       tablespace file name
  @param free_block spare buffer block
  @return whether recovery failed */
  bool recover_deferred(map::iterator &p, const std::string &name,
                        buf_block_t *&free_block);
};

/** The recovery system */
extern recv_sys_t	recv_sys;

/** If the following is TRUE, the buffer pool file pages must be invalidated
after recovery and no ibuf operations are allowed; this will be set if
recv_sys.pages becomes too full, and log records must be merged
to file pages already before the recovery is finished: in this case no
ibuf operations are allowed, as they could modify the pages read in the
buffer pool before the pages have been recovered to the up-to-date state.

TRUE means that recovery is running and no operations on the log files
are allowed yet: the variable name is misleading. */
extern bool		recv_no_ibuf_operations;
/** TRUE when recv_init_crash_recovery() has been called. */
extern bool		recv_needed_recovery;
#ifdef UNIV_DEBUG
/** TRUE if writing to the redo log (mtr_commit) is forbidden.
Protected by log_sys.mutex. */
extern bool		recv_no_log_write;
#endif /* UNIV_DEBUG */

/** TRUE if buf_page_is_corrupted() should check if the log sequence
number (FIL_PAGE_LSN) is in the future.  Initially FALSE, and set by
recv_recovery_from_checkpoint_start(). */
extern bool		recv_lsn_checks_on;

/** Size of the parsing buffer; it must accommodate RECV_SCAN_SIZE many
times! */
#define RECV_PARSING_BUF_SIZE	(2U << 20)

/** Size of block reads when the log groups are scanned forward to do a
roll-forward */
#define RECV_SCAN_SIZE		(4U << srv_page_size_shift)