summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--innobase/btr/btr0btr.c71
-rw-r--r--innobase/btr/btr0cur.c820
-rw-r--r--innobase/buf/buf0buf.c81
-rw-r--r--innobase/buf/buf0flu.c195
-rw-r--r--innobase/buf/buf0rea.c14
-rw-r--r--innobase/data/data0data.c171
-rw-r--r--innobase/fil/fil0fil.c81
-rw-r--r--innobase/fsp/fsp0fsp.c4
-rw-r--r--innobase/ibuf/ibuf0ibuf.c7
-rw-r--r--innobase/include/btr0btr.h38
-rw-r--r--innobase/include/btr0cur.h123
-rw-r--r--innobase/include/buf0buf.h8
-rw-r--r--innobase/include/buf0flu.h2
-rw-r--r--innobase/include/data0data.h58
-rw-r--r--innobase/include/data0data.ic7
-rw-r--r--innobase/include/dict0mem.h2
-rw-r--r--innobase/include/fil0fil.h10
-rw-r--r--innobase/include/fsp0fsp.h2
-rw-r--r--innobase/include/mach0data.ic4
-rw-r--r--innobase/include/os0file.h18
-rw-r--r--innobase/include/rem0cmp.h13
-rw-r--r--innobase/include/rem0rec.h53
-rw-r--r--innobase/include/rem0rec.ic72
-rw-r--r--innobase/include/row0ins.h6
-rw-r--r--innobase/include/row0mysql.h8
-rw-r--r--innobase/include/row0row.h1
-rw-r--r--innobase/include/row0upd.h12
-rw-r--r--innobase/include/row0upd.ic5
-rw-r--r--innobase/include/srv0srv.h22
-rw-r--r--innobase/include/sync0sync.h2
-rw-r--r--innobase/include/trx0rec.h16
-rw-r--r--innobase/include/trx0rec.ic17
-rw-r--r--innobase/include/trx0sys.h70
-rw-r--r--innobase/include/trx0types.h1
-rw-r--r--innobase/include/trx0undo.h4
-rw-r--r--innobase/include/univ.i24
-rw-r--r--innobase/include/ut0dbg.h10
-rw-r--r--innobase/include/ut0ut.h3
-rw-r--r--innobase/lock/lock0lock.c40
-rw-r--r--innobase/log/log0log.c15
-rw-r--r--innobase/log/log0recv.c3
-rw-r--r--innobase/os/os0file.c84
-rw-r--r--innobase/page/page0cur.c8
-rw-r--r--innobase/pars/pars0pars.c4
-rw-r--r--innobase/rem/rem0cmp.c49
-rw-r--r--innobase/rem/rem0rec.c67
-rw-r--r--innobase/row/row0ins.c91
-rw-r--r--innobase/row/row0mysql.c42
-rw-r--r--innobase/row/row0purge.c99
-rw-r--r--innobase/row/row0row.c34
-rw-r--r--innobase/row/row0sel.c63
-rw-r--r--innobase/row/row0uins.c9
-rw-r--r--innobase/row/row0umod.c121
-rw-r--r--innobase/row/row0undo.c12
-rw-r--r--innobase/row/row0upd.c143
-rw-r--r--innobase/srv/srv0srv.c160
-rw-r--r--innobase/srv/srv0start.c133
-rw-r--r--innobase/sync/sync0rw.c5
-rw-r--r--innobase/sync/sync0sync.c38
-rw-r--r--innobase/trx/trx0purge.c7
-rw-r--r--innobase/trx/trx0rec.c62
-rw-r--r--innobase/trx/trx0sys.c319
62 files changed, 3146 insertions, 517 deletions
diff --git a/innobase/btr/btr0btr.c b/innobase/btr/btr0btr.c
index 2507f805cd6..af2029bf1e8 100644
--- a/innobase/btr/btr0btr.c
+++ b/innobase/btr/btr0btr.c
@@ -71,30 +71,6 @@ btr_page_create(
dict_tree_t* tree, /* in: index tree */
mtr_t* mtr); /* in: mtr */
/******************************************************************
-Allocates a new file page to be used in an index tree. */
-static
-page_t*
-btr_page_alloc(
-/*===========*/
- /* out: new allocated page,
- x-latched */
- dict_tree_t* tree, /* in: index tree */
- ulint hint_page_no, /* in: hint of a good page */
- byte file_direction, /* in: direction where a possible
- page split is made */
- ulint level, /* in: level where the page is placed
- in the tree */
- mtr_t* mtr); /* in: mtr */
-/******************************************************************
-Frees a file page used in an index tree. */
-static
-void
-btr_page_free(
-/*==========*/
- dict_tree_t* tree, /* in: index tree */
- page_t* page, /* in, own: page to be freed */
- mtr_t* mtr); /* in: mtr */
-/******************************************************************
Sets the child node file address in a node pointer. */
UNIV_INLINE
void
@@ -319,11 +295,12 @@ btr_page_alloc_for_ibuf(
/******************************************************************
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents! */
-static
+
page_t*
btr_page_alloc(
/*===========*/
- /* out: new allocated page, x-latched */
+ /* out: new allocated page, x-latched;
+ NULL if out of space */
dict_tree_t* tree, /* in: index tree */
ulint hint_page_no, /* in: hint of a good page */
byte file_direction, /* in: direction where a possible
@@ -358,7 +335,10 @@ btr_page_alloc(
new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no,
file_direction, TRUE, mtr);
- ut_a(new_page_no != FIL_NULL);
+ if (new_page_no == FIL_NULL) {
+
+ return(NULL);
+ }
new_page = buf_page_get(dict_tree_get_space(tree), new_page_no,
RW_X_LATCH, mtr);
@@ -435,20 +415,22 @@ btr_page_free_for_ibuf(
}
/******************************************************************
-Frees a file page used in an index tree. */
-static
+Frees a file page used in an index tree. Can also be used to free (BLOB)
+external storage pages, because the page level 0 can be given as an
+argument. */
+
void
-btr_page_free(
-/*==========*/
+btr_page_free_low(
+/*==============*/
dict_tree_t* tree, /* in: index tree */
page_t* page, /* in: page to be freed, x-latched */
+ ulint level, /* in: page level */
mtr_t* mtr) /* in: mtr */
{
fseg_header_t* seg_header;
page_t* root;
ulint space;
ulint page_no;
- ulint level;
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
MTR_MEMO_PAGE_X_FIX));
@@ -465,8 +447,6 @@ btr_page_free(
}
root = btr_root_get(tree, mtr);
-
- level = btr_page_get_level(page, mtr);
if (level == 0) {
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
@@ -481,6 +461,26 @@ btr_page_free(
}
/******************************************************************
+Frees a file page used in an index tree. NOTE: cannot free field external
+storage pages because the page must contain info on its level. */
+
+void
+btr_page_free(
+/*==========*/
+	dict_tree_t*	tree,	/* in: index tree */
+	page_t*		page,	/* in: page to be freed, x-latched */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	/* In debug builds, check that mtr holds an x-latch on the page */
+	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
+						MTR_MEMO_PAGE_X_FIX));
+
+	/* The level is read from the page itself and handed on to the
+	low-level free routine */
+	btr_page_free_low(tree, page, btr_page_get_level(page, mtr), mtr);
+}
+
+/******************************************************************
Sets the child node file address in a node pointer. */
UNIV_INLINE
void
@@ -1276,6 +1276,7 @@ btr_insert_on_non_leaf_level(
dtuple_t* tuple, /* in: the record to be inserted */
mtr_t* mtr) /* in: mtr */
{
+ big_rec_t* dummy_big_rec;
btr_cur_t cursor;
ulint err;
rec_t* rec;
@@ -1294,7 +1295,7 @@ btr_insert_on_non_leaf_level(
| BTR_KEEP_SYS_FLAG
| BTR_NO_UNDO_LOG_FLAG,
&cursor, tuple,
- &rec, NULL, mtr);
+ &rec, &dummy_big_rec, NULL, mtr);
ut_a(err == DB_SUCCESS);
}
diff --git a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c
index a8680c6b380..e8ff88c6f4f 100644
--- a/innobase/btr/btr0cur.c
+++ b/innobase/btr/btr0cur.c
@@ -12,7 +12,7 @@ many pages in the tablespace before we start the operation, because
if leaf splitting has been started, it is difficult to undo, except
by crashing the database and doing a roll-forward.
-(c) 1994-1996 Innobase Oy
+(c) 1994-2001 Innobase Oy
Created 10/16/1994 Heikki Tuuri
*******************************************************/
@@ -49,6 +49,15 @@ can be released by page reorganize, then it is reorganized */
this many index pages */
#define BTR_KEY_VAL_ESTIMATE_N_PAGES 8
+/* The structure of a BLOB part header */
+/*--------------------------------------*/
+#define BTR_BLOB_HDR_PART_LEN 0 /* BLOB part len on this
+ page */
+#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /* next BLOB part page no,
+ FIL_NULL if none */
+/*--------------------------------------*/
+#define BTR_BLOB_HDR_SIZE 8
+
/***********************************************************************
Adds path information to the cursor for the current page, for which
the binary search has been performed. */
@@ -60,6 +69,19 @@ btr_cur_add_path_info(
ulint height, /* in: height of the page in tree;
0 means leaf node */
ulint root_height); /* in: root node height in tree */
+/***************************************************************
+Frees the externally stored fields for a record, if the field is mentioned
+in the update vector. */
+static
+void
+btr_rec_free_updated_extern_fields(
+/*===============================*/
+ dict_index_t* index, /* in: index of rec; the index tree MUST be
+ X-latched */
+ rec_t* rec, /* in: record */
+ upd_t* update, /* in: update vector */
+ mtr_t* mtr); /* in: mini-transaction handle which contains
+ an X-latch to record page and to the tree */
/*==================== B-TREE SEARCH =========================*/
@@ -745,9 +767,13 @@ btr_cur_optimistic_insert(
dtuple_t* entry, /* in: entry to insert */
rec_t** rec, /* out: pointer to inserted record if
succeed */
+ big_rec_t** big_rec,/* out: big rec vector whose fields have to
+ be stored externally by the caller, or
+ NULL */
que_thr_t* thr, /* in: query thread or NULL */
mtr_t* mtr) /* in: mtr */
{
+ big_rec_t* big_rec_vec = NULL;
dict_index_t* index;
page_cur_t* page_cursor;
page_t* page;
@@ -764,6 +790,8 @@ btr_cur_optimistic_insert(
ut_ad(dtuple_check_typed(entry));
+ *big_rec = NULL;
+
page = btr_cur_get_page(cursor);
index = cursor->index;
@@ -772,15 +800,27 @@ btr_cur_optimistic_insert(
max_size = page_get_max_insert_size_after_reorganize(page, 1);
level = btr_page_get_level(page, mtr);
+calculate_sizes_again:
/* Calculate the record size when entry is converted to a record */
data_size = dtuple_get_data_size(entry);
extra_size = rec_get_converted_extra_size(data_size,
dtuple_get_n_fields(entry));
rec_size = data_size + extra_size;
- if (rec_size >= page_get_free_space_of_empty() / 2) {
+ if ((rec_size >= page_get_free_space_of_empty() / 2)
+ || (rec_size >= REC_MAX_DATA_SIZE)) {
+
+ /* The record is so big that we have to store some fields
+ externally on separate database pages */
+
+ big_rec_vec = dtuple_convert_big_rec(index, entry);
+
+ if (big_rec_vec == NULL) {
+
+ return(DB_TOO_BIG_RECORD);
+ }
- return(DB_TOO_BIG_RECORD);
+ goto calculate_sizes_again;
}
/* If there have been many consecutive inserts, and we are on the leaf
@@ -795,7 +835,11 @@ btr_cur_optimistic_insert(
&& (0 == level)
&& (btr_page_get_split_rec_to_right(cursor, &dummy_rec)
|| btr_page_get_split_rec_to_left(cursor, &dummy_rec))) {
-
+
+ if (big_rec_vec) {
+ dtuple_convert_back_big_rec(index, entry, big_rec_vec);
+ }
+
return(DB_FAIL);
}
@@ -804,6 +848,9 @@ btr_cur_optimistic_insert(
|| (page_get_max_insert_size(page, 1) >= rec_size)
|| (page_get_n_recs(page) <= 1))) {
+ if (big_rec_vec) {
+ dtuple_convert_back_big_rec(index, entry, big_rec_vec);
+ }
return(DB_FAIL);
}
@@ -812,6 +859,9 @@ btr_cur_optimistic_insert(
if (err != DB_SUCCESS) {
+ if (big_rec_vec) {
+ dtuple_convert_back_big_rec(index, entry, big_rec_vec);
+ }
return(err);
}
@@ -835,6 +885,19 @@ btr_cur_optimistic_insert(
*rec = page_cur_tuple_insert(page_cursor, entry, mtr);
+ if (!(*rec)) {
+ char* err_buf = mem_alloc(1000);
+
+ dtuple_sprintf(err_buf, 900, entry);
+
+ fprintf(stderr,
+ "InnoDB: Error: cannot insert tuple %s to index %s of table %s\n"
+ "InnoDB: max insert size %lu\n",
+ err_buf, index->name, index->table->name, max_size);
+
+ mem_free(err_buf);
+ }
+
ut_a(*rec); /* <- We calculated above the record would fit */
}
@@ -845,6 +908,7 @@ btr_cur_optimistic_insert(
btr_search_update_hash_on_insert(cursor);
}
#endif
+
if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
lock_update_insert(*rec);
@@ -860,6 +924,8 @@ btr_cur_optimistic_insert(
rec_size + PAGE_DIR_SLOT_SIZE);
}
+ *big_rec = big_rec_vec;
+
return(DB_SUCCESS);
}
@@ -884,17 +950,24 @@ btr_cur_pessimistic_insert(
dtuple_t* entry, /* in: entry to insert */
rec_t** rec, /* out: pointer to inserted record if
succeed */
+ big_rec_t** big_rec,/* out: big rec vector whose fields have to
+ be stored externally by the caller, or
+ NULL */
que_thr_t* thr, /* in: query thread or NULL */
mtr_t* mtr) /* in: mtr */
{
- page_t* page;
- ulint err;
- ibool dummy_inh;
- ibool success;
- ulint n_extents = 0;
+ dict_index_t* index = cursor->index;
+ big_rec_t* big_rec_vec = NULL;
+ page_t* page;
+ ulint err;
+ ibool dummy_inh;
+ ibool success;
+ ulint n_extents = 0;
ut_ad(dtuple_check_typed(entry));
+ *big_rec = NULL;
+
page = btr_cur_get_page(cursor);
ut_ad(mtr_memo_contains(mtr,
@@ -908,8 +981,8 @@ btr_cur_pessimistic_insert(
cursor->flag = BTR_CUR_BINARY;
- err = btr_cur_optimistic_insert(flags, cursor, entry, rec, thr, mtr);
-
+ err = btr_cur_optimistic_insert(flags, cursor, entry, rec, big_rec,
+ thr, mtr);
if (err != DB_FAIL) {
return(err);
@@ -932,7 +1005,7 @@ btr_cur_pessimistic_insert(
n_extents = cursor->tree_height / 16 + 3;
- success = fsp_reserve_free_extents(cursor->index->space,
+ success = fsp_reserve_free_extents(index->space,
n_extents, FSP_NORMAL, mtr);
if (!success) {
err = DB_OUT_OF_FILE_SPACE;
@@ -941,7 +1014,22 @@ btr_cur_pessimistic_insert(
}
}
- if (dict_tree_get_page(cursor->index->tree)
+ if ((rec_get_converted_size(entry)
+ >= page_get_free_space_of_empty() / 2)
+ || (rec_get_converted_size(entry) >= REC_MAX_DATA_SIZE)) {
+
+ /* The record is so big that we have to store some fields
+ externally on separate database pages */
+
+ big_rec_vec = dtuple_convert_big_rec(index, entry);
+
+ if (big_rec_vec == NULL) {
+
+ return(DB_TOO_BIG_RECORD);
+ }
+ }
+
+ if (dict_tree_get_page(index->tree)
== buf_frame_get_page_no(page)) {
/* The page is the root page */
@@ -950,7 +1038,7 @@ btr_cur_pessimistic_insert(
*rec = btr_page_split_and_insert(cursor, entry, mtr);
}
- btr_cur_position(cursor->index, page_rec_get_prev(*rec), cursor);
+ btr_cur_position(index, page_rec_get_prev(*rec), cursor);
#ifdef BTR_CUR_ADAPT
btr_search_update_hash_on_insert(cursor);
@@ -963,9 +1051,11 @@ btr_cur_pessimistic_insert(
err = DB_SUCCESS;
if (n_extents > 0) {
- fil_space_release_free_extents(cursor->index->space, n_extents);
+ fil_space_release_free_extents(index->space, n_extents);
}
-
+
+ *big_rec = big_rec_vec;
+
return(err);
}
@@ -1227,7 +1317,8 @@ btr_cur_optimistic_update(
dulint roll_ptr;
trx_t* trx;
mem_heap_t* heap;
- ibool reorganized = FALSE;
+ ibool reorganized = FALSE;
+ ulint i;
/* Only clustered index records are updated using this function */
ut_ad((cursor->index)->type & DICT_CLUSTERED);
@@ -1247,6 +1338,23 @@ btr_cur_optimistic_update(
cmpl_info, thr, mtr));
}
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ if (upd_get_nth_field(update, i)->extern_storage) {
+
+ /* Externally stored fields are treated in pessimistic
+ update */
+
+ return(DB_OVERFLOW);
+ }
+ }
+
+ if (rec_contains_externally_stored_field(btr_cur_get_rec(cursor))) {
+ /* Externally stored fields are treated in pessimistic
+ update */
+
+ return(DB_OVERFLOW);
+ }
+
page_cursor = btr_cur_get_page_cur(cursor);
heap = mem_heap_create(1024);
@@ -1260,9 +1368,9 @@ btr_cur_optimistic_update(
if (new_rec_size >= page_get_free_space_of_empty() / 2) {
- mem_heap_free(heap);
+ mem_heap_free(heap);
- return(DB_TOO_BIG_RECORD);
+ return(DB_OVERFLOW);
}
max_size = old_rec_size
@@ -1377,6 +1485,48 @@ btr_cur_pess_upd_restore_supremum(
rec);
}
+/***************************************************************
+Overwrites the fields of a clustered index entry with the new column
+values listed in an update vector. The data of each new value is copied
+to heap; an SQL NULL value is stored with a NULL data pointer. The info
+bits of the entry are set from the update vector, too. */
+static
+void
+btr_cur_copy_new_col_vals(
+/*======================*/
+	dtuple_t*	entry,	/* in/out: index entry where replaced */
+	upd_t*		update,	/* in: update vector */
+	mem_heap_t*	heap)	/* in: heap where data is copied */
+{
+	ulint	n_upd;
+	ulint	k;
+
+	dtuple_set_info_bits(entry, update->info_bits);
+
+	n_upd = upd_get_n_fields(update);
+
+	for (k = 0; k < n_upd; k++) {
+		upd_field_t*	uf	= upd_get_nth_field(update, k);
+		dfield_t*	dst	= dtuple_get_nth_field(entry,
+								uf->field_no);
+		dfield_t*	src	= &(uf->new_val);
+		byte*		copy	= NULL;
+
+		/* An SQL NULL value has no data to copy */
+		if (src->len != UNIV_SQL_NULL) {
+			copy = mem_heap_alloc(heap, src->len);
+
+			ut_memcpy(copy, src->data, src->len);
+		}
+
+		dfield_set_data(dst, copy, src->len);
+	}
+}
+
/*****************************************************************
Performs an update of a record on a page of a tree. It is assumed
that mtr holds an x-latch on the tree and on the cursor page. If the
@@ -1389,8 +1539,9 @@ btr_cur_pessimistic_update(
/* out: DB_SUCCESS or error code */
ulint flags, /* in: undo logging, locking, and rollback
flags */
- btr_cur_t* cursor, /* in: cursor on the record to update;
- cursor does not stay valid */
+ btr_cur_t* cursor, /* in: cursor on the record to update */
+ big_rec_t** big_rec,/* out: big rec vector whose fields have to
+ be stored externally by the caller, or NULL */
upd_t* update, /* in: update vector; this is allowed also
contain trx id and roll ptr fields, but
the values in update vector have no effect */
@@ -1399,6 +1550,8 @@ btr_cur_pessimistic_update(
que_thr_t* thr, /* in: query thread */
mtr_t* mtr) /* in: mtr */
{
+ big_rec_t* big_rec_vec = NULL;
+ big_rec_t* dummy_big_rec;
dict_index_t* index;
page_t* page;
dict_tree_t* tree;
@@ -1414,6 +1567,11 @@ btr_cur_pessimistic_update(
ibool was_first;
ibool success;
ulint n_extents = 0;
+ ulint* ext_vect;
+ ulint n_ext_vect;
+ ulint reserve_flag;
+
+ *big_rec = NULL;
page = btr_cur_get_page(cursor);
rec = btr_cur_get_rec(cursor);
@@ -1449,8 +1607,14 @@ btr_cur_pessimistic_update(
n_extents = cursor->tree_height / 16 + 3;
+ if (flags & BTR_NO_UNDO_LOG_FLAG) {
+ reserve_flag = FSP_CLEANING;
+ } else {
+ reserve_flag = FSP_NORMAL;
+ }
+
success = fsp_reserve_free_extents(cursor->index->space,
- n_extents, FSP_NORMAL, mtr);
+ n_extents, reserve_flag, mtr);
if (!success) {
err = DB_OUT_OF_FILE_SPACE;
@@ -1464,7 +1628,7 @@ btr_cur_pessimistic_update(
new_entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap);
- row_upd_clust_index_replace_new_col_vals(new_entry, update);
+ btr_cur_copy_new_col_vals(new_entry, update, heap);
if (!(flags & BTR_KEEP_SYS_FLAG)) {
row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
@@ -1487,17 +1651,49 @@ btr_cur_pessimistic_update(
lock_rec_store_on_page_infimum(rec);
btr_search_update_hash_on_delete(cursor);
+
+ if (flags & BTR_NO_UNDO_LOG_FLAG) {
+ /* We are in a transaction rollback undoing a row
+ update: we must free possible externally stored fields
+ which got new values in the update */
+
+ ut_a(big_rec_vec == NULL);
+
+ btr_rec_free_updated_extern_fields(index, rec, update, mtr);
+ }
+
+ /* We have to set appropriate extern storage bits in the new
+ record to be inserted: we have to remember which fields were such */
+
+ ext_vect = mem_heap_alloc(heap, sizeof(ulint) * rec_get_n_fields(rec));
+ n_ext_vect = btr_push_update_extern_fields(ext_vect, rec, update);
+
page_cur_delete_rec(page_cursor, mtr);
page_cur_move_to_prev(page_cursor);
- if (optim_err == DB_UNDERFLOW) {
- rec = btr_cur_insert_if_possible(cursor, new_entry,
+ if ((rec_get_converted_size(new_entry) >=
+ page_get_free_space_of_empty() / 2)
+ || (rec_get_converted_size(new_entry) >= REC_MAX_DATA_SIZE)) {
+
+ big_rec_vec = dtuple_convert_big_rec(index, new_entry);
+
+ if (big_rec_vec == NULL) {
+
+ mem_heap_free(heap);
+
+ goto return_after_reservations;
+ }
+ }
+
+ rec = btr_cur_insert_if_possible(cursor, new_entry,
&dummy_reorganized, mtr);
- ut_a(rec); /* <- We knew the insert would fit */
+ ut_a(rec || optim_err != DB_UNDERFLOW);
+ if (rec) {
lock_rec_restore_from_page_infimum(rec, page);
-
+ rec_set_field_extern_bits(rec, ext_vect, n_ext_vect, mtr);
+
btr_cur_compress_if_useful(cursor, mtr);
err = DB_SUCCESS;
@@ -1521,9 +1717,13 @@ btr_cur_pessimistic_update(
err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
| BTR_NO_LOCKING_FLAG
| BTR_KEEP_SYS_FLAG,
- cursor, new_entry, &rec, NULL, mtr);
+ cursor, new_entry, &rec,
+ &dummy_big_rec, NULL, mtr);
ut_a(rec);
ut_a(err == DB_SUCCESS);
+ ut_a(dummy_big_rec == NULL);
+
+ rec_set_field_extern_bits(rec, ext_vect, n_ext_vect, mtr);
lock_rec_restore_from_page_infimum(rec, page);
@@ -1541,9 +1741,12 @@ btr_cur_pessimistic_update(
return_after_reservations:
if (n_extents > 0) {
- fil_space_release_free_extents(cursor->index->space, n_extents);
+ fil_space_release_free_extents(cursor->index->space,
+ n_extents);
}
+ *big_rec = big_rec_vec;
+
return(err);
}
@@ -1932,6 +2135,11 @@ btr_cur_optimistic_delete(
ut_ad(btr_page_get_level(page, mtr) == 0);
+ if (rec_contains_externally_stored_field(btr_cur_get_rec(cursor))) {
+
+ return(FALSE);
+ }
+
if (btr_cur_can_delete_without_compress(cursor, mtr)) {
lock_update_delete(btr_cur_get_rec(cursor));
@@ -2009,6 +2217,8 @@ btr_cur_pessimistic_delete(
}
}
+ btr_rec_free_externally_stored_fields(cursor->index,
+ btr_cur_get_rec(cursor), mtr);
if ((page_get_n_recs(page) < 2)
&& (dict_tree_get_page(btr_cur_get_tree(cursor))
!= buf_frame_get_page_no(page))) {
@@ -2079,7 +2289,7 @@ return_after_reservations:
fil_space_release_free_extents(cursor->index->space, n_extents);
}
- return(ret);
+ return(ret);
}
/***********************************************************************
@@ -2292,3 +2502,553 @@ btr_estimate_number_of_different_key_vals(
return(index->table->stat_n_rows / (total_n_recs / n_diff));
}
+
+/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
+
+/***********************************************************************
+Stores the positions of the fields marked as extern storage in the update
+vector, and also those fields which are marked as extern storage in rec
+and not mentioned in the updated fields. We use this function to remember
+which fields we must mark as extern storage in a record inserted for an
+update. */
+
+ulint
+btr_push_update_extern_fields(
+/*==========================*/
+				/* out: number of values stored in ext_vect */
+	ulint*	ext_vect,	/* out: array of ulints, must be preallocated
+				to have space for all fields in rec */
+	rec_t*	rec,		/* in: record */
+	upd_t*	update)		/* in: update vector or NULL */
+{
+	ulint	n_pushed	= 0;
+	ulint	n_upd		= 0;
+	ibool	is_updated;
+	ulint	n;
+	ulint	j;
+	ulint	i;
+
+	/* First push the updated fields whose new value is stored
+	externally */
+
+	if (update) {
+		n_upd = upd_get_n_fields(update);
+
+		for (i = 0; i < n_upd; i++) {
+
+			if (upd_get_nth_field(update, i)->extern_storage) {
+
+				ext_vect[n_pushed] =
+				   upd_get_nth_field(update, i)->field_no;
+
+				n_pushed++;
+			}
+		}
+	}
+
+	/* Then push the fields which are externally stored in rec and
+	which the update does not touch. An updated field is skipped here
+	whatever its extern_storage flag is: it was handled above. */
+
+	n = rec_get_n_fields(rec);
+
+	for (i = 0; i < n; i++) {
+		if (!rec_get_nth_field_extern_bit(rec, i)) {
+
+			continue;
+		}
+
+		/* Check it is not in updated fields; n_upd is 0 if there
+		is no update vector */
+		is_updated = FALSE;
+
+		for (j = 0; j < n_upd; j++) {
+			if (upd_get_nth_field(update, j)->field_no == i) {
+				is_updated = TRUE;
+
+				/* No need to look at the rest of the
+				update vector */
+				break;
+			}
+		}
+
+		if (!is_updated) {
+			ext_vect[n_pushed] = i;
+			n_pushed++;
+		}
+	}
+
+	return(n_pushed);
+}
+
+/***********************************************************************
+Reads the BLOB part length field from a BLOB part header. */
+static
+ulint
+btr_blob_get_part_len(
+/*==================*/
+				/* out: part length */
+	byte*	blob_header)	/* in: blob header */
+{
+	byte*	len_field	= blob_header + BTR_BLOB_HDR_PART_LEN;
+
+	return(mach_read_from_4(len_field));
+}
+
+/***********************************************************************
+Reads the next page number field from a BLOB part header. */
+static
+ulint
+btr_blob_get_next_page_no(
+/*======================*/
+				/* out: page number or FIL_NULL if
+				no more pages */
+	byte*	blob_header)	/* in: blob header */
+{
+	byte*	next_field	= blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO;
+
+	return(mach_read_from_4(next_field));
+}
+
+/***********************************************************************
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The fields are stored on pages allocated from leaf node
+file segment of the index tree. Each BLOB page is written in a
+mini-transaction of its own, and the pages of one field are chained
+together through the next page number in the BLOB part header. */
+
+ulint
+btr_store_big_rec_extern_fields(
+/*============================*/
+					/* out: DB_SUCCESS or
+					DB_OUT_OF_FILE_SPACE */
+	dict_index_t*	index,		/* in: index of rec; the index tree
+					MUST be X-latched */
+	rec_t*		rec,		/* in: record */
+	big_rec_t*	big_rec_vec,	/* in: vector containing fields
+					to be stored externally */
+	mtr_t*		local_mtr)	/* in: mtr containing the latch to
+					rec and to the tree */
+{
+	byte*	data;
+	ulint	local_len;
+	ulint	extern_len;
+	ulint	store_len;
+	ulint	page_no;
+	page_t*	page;
+	ulint	space_id;
+	page_t*	prev_page;
+	page_t*	rec_page;
+	ulint	prev_page_no;
+	ulint	hint_page_no;
+	ulint	i;
+	mtr_t	mtr;
+
+	ut_ad(mtr_memo_contains(local_mtr, dict_tree_get_lock(index->tree),
+							MTR_MEMO_X_LOCK));
+	/* Check the latch on the page of rec: the local variable data is
+	not yet assigned at this point and must not be used here */
+	ut_ad(mtr_memo_contains(local_mtr, buf_block_align(rec),
+							MTR_MEMO_PAGE_X_FIX));
+	ut_a(index->type & DICT_CLUSTERED);
+
+	space_id = buf_frame_get_space_id(rec);
+
+	/* For each field we allocate pages from the tablespace, write the
+	field data to them, and put the pointer to the data in rec */
+
+	for (i = 0; i < big_rec_vec->n_fields; i++) {
+
+		data = rec_get_nth_field(rec, big_rec_vec->fields[i].field_no,
+								&local_len);
+		ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+		local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+		extern_len = big_rec_vec->fields[i].len;
+
+		ut_a(extern_len > 0);
+
+		prev_page_no = FIL_NULL;
+
+		/* extern_len counts the bytes of the field which are still
+		to be written */
+
+		while (extern_len > 0) {
+			mtr_start(&mtr);
+
+			if (prev_page_no == FIL_NULL) {
+				hint_page_no = buf_frame_get_page_no(rec) + 1;
+			} else {
+				hint_page_no = prev_page_no + 1;
+			}
+
+			page = btr_page_alloc(index->tree, hint_page_no,
+						FSP_NO_DIR, 0, &mtr);
+			if (page == NULL) {
+
+				/* NOTE(review): the pages written so far
+				for this or earlier fields are not freed
+				here */
+
+				mtr_commit(&mtr);
+
+				return(DB_OUT_OF_FILE_SPACE);
+			}
+
+			page_no = buf_frame_get_page_no(page);
+
+			if (prev_page_no != FIL_NULL) {
+				/* Link the previous BLOB page to the new
+				one */
+
+				prev_page = buf_page_get(space_id,
+							prev_page_no,
+							RW_X_LATCH, &mtr);
+
+				buf_page_dbg_add_level(prev_page,
+							SYNC_EXTERN_STORAGE);
+
+				mlog_write_ulint(prev_page + FIL_PAGE_DATA
+						+ BTR_BLOB_HDR_NEXT_PAGE_NO,
+						page_no, MLOG_4BYTES, &mtr);
+			}
+
+			/* Store as much as fits on one page after the
+			BLOB part header */
+
+			if (extern_len > (UNIV_PAGE_SIZE - FIL_PAGE_DATA
+						- BTR_BLOB_HDR_SIZE
+						- FIL_PAGE_DATA_END)) {
+				store_len = UNIV_PAGE_SIZE - FIL_PAGE_DATA
+						- BTR_BLOB_HDR_SIZE
+						- FIL_PAGE_DATA_END;
+			} else {
+				store_len = extern_len;
+			}
+
+			mlog_write_string(page + FIL_PAGE_DATA
+						+ BTR_BLOB_HDR_SIZE,
+					big_rec_vec->fields[i].data
+						+ big_rec_vec->fields[i].len
+						- extern_len,
+					store_len, &mtr);
+			mlog_write_ulint(page + FIL_PAGE_DATA
+						+ BTR_BLOB_HDR_PART_LEN,
+					store_len, MLOG_4BYTES, &mtr);
+			mlog_write_ulint(page + FIL_PAGE_DATA
+						+ BTR_BLOB_HDR_NEXT_PAGE_NO,
+					FIL_NULL, MLOG_4BYTES, &mtr);
+
+			extern_len -= store_len;
+
+			rec_page = buf_page_get(space_id,
+						buf_frame_get_page_no(data),
+						RW_X_LATCH, &mtr);
+
+			buf_page_dbg_add_level(rec_page, SYNC_NO_ORDER_CHECK);
+
+			/* Update the externally stored length in the
+			reference in rec: the upper 4 bytes are written as
+			0, the lower 4 bytes get the number of bytes stored
+			so far */
+
+			mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, 0,
+						MLOG_4BYTES, &mtr);
+			mlog_write_ulint(data + local_len + BTR_EXTERN_LEN + 4,
+					big_rec_vec->fields[i].len
+								- extern_len,
+					MLOG_4BYTES, &mtr);
+
+			if (prev_page_no == FIL_NULL) {
+				/* This is the first page of the field:
+				write its address to the reference in rec */
+
+				mlog_write_ulint(data + local_len
+							+ BTR_EXTERN_SPACE_ID,
+						space_id,
+						MLOG_4BYTES, &mtr);
+
+				mlog_write_ulint(data + local_len
+							+ BTR_EXTERN_PAGE_NO,
+						page_no,
+						MLOG_4BYTES, &mtr);
+
+				mlog_write_ulint(data + local_len
+							+ BTR_EXTERN_OFFSET,
+						FIL_PAGE_DATA,
+						MLOG_4BYTES, &mtr);
+
+				/* Set the bit denoting that this field
+				in rec is stored externally */
+
+				rec_set_nth_field_extern_bit(rec,
+					big_rec_vec->fields[i].field_no,
+					TRUE, &mtr);
+			}
+
+			prev_page_no = page_no;
+
+			mtr_commit(&mtr);
+		}
+	}
+
+	return(DB_SUCCESS);
+}
+
+/***********************************************************************
+Frees the space in an externally stored field to the file space
+management. The BLOB pages are freed one at a time, each in a
+mini-transaction of its own; in the same mini-transaction the reference
+in data is updated to point to the next page in the chain and the
+externally stored length is decremented. */
+
+void
+btr_free_externally_stored_field(
+/*=============================*/
+	dict_index_t*	index,		/* in: index of the data, the index
+					tree MUST be X-latched */
+	byte*		data,		/* in: internally stored data
+					+ reference to the externally
+					stored part */
+	ulint		local_len,	/* in: length of data */
+	mtr_t*		local_mtr)	/* in: mtr containing the latch to
+					data and an X-latch to the index
+					tree */
+{
+	page_t*	page;
+	page_t*	rec_page;
+	ulint	space_id;
+	ulint	page_no;
+	ulint	extern_len;
+	ulint	next_page_no;
+	ulint	part_len;
+	mtr_t	mtr;
+
+	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+	ut_ad(mtr_memo_contains(local_mtr, dict_tree_get_lock(index->tree),
+							MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains(local_mtr, buf_block_align(data),
+							MTR_MEMO_PAGE_X_FIX));
+
+	/* From here on local_len is the offset of the reference to the
+	externally stored part within data */
+	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+	for (;;) {
+		mtr_start(&mtr);
+
+		rec_page = buf_page_get(buf_frame_get_space_id(data),
+				buf_frame_get_page_no(data), RW_X_LATCH, &mtr);
+
+		buf_page_dbg_add_level(rec_page, SYNC_NO_ORDER_CHECK);
+
+		space_id = mach_read_from_4(data + local_len
+						+ BTR_EXTERN_SPACE_ID);
+
+		page_no = mach_read_from_4(data + local_len
+						+ BTR_EXTERN_PAGE_NO);
+
+		extern_len = mach_read_from_4(data + local_len
+						+ BTR_EXTERN_LEN + 4);
+
+		/* If extern len is 0, then there is no external storage data
+		at all */
+
+		if (extern_len == 0) {
+
+			mtr_commit(&mtr);
+
+			return;
+		}
+
+		page = buf_page_get(space_id, page_no, RW_X_LATCH, &mtr);
+
+		buf_page_dbg_add_level(page, SYNC_EXTERN_STORAGE);
+
+		/* Read the BLOB part header before the page is freed */
+
+		next_page_no = btr_blob_get_next_page_no(page + FIL_PAGE_DATA);
+
+		part_len = btr_blob_get_part_len(page + FIL_PAGE_DATA);
+
+		ut_a(extern_len >= part_len);
+
+		/* We must supply the page level (= 0) as an argument
+		because we did not store it on the page (we save the space
+		overhead from an index page header). */
+
+		btr_page_free_low(index->tree, page, 0, &mtr);
+
+		mlog_write_ulint(data + local_len + BTR_EXTERN_PAGE_NO,
+						next_page_no,
+						MLOG_4BYTES, &mtr);
+		mlog_write_ulint(data + local_len + BTR_EXTERN_LEN + 4,
+						extern_len - part_len,
+						MLOG_4BYTES, &mtr);
+
+		/* The chain must end exactly when all the externally
+		stored bytes have been freed */
+
+		if (next_page_no == FIL_NULL) {
+			ut_a(extern_len - part_len == 0);
+		}
+
+		if (extern_len - part_len == 0) {
+			ut_a(next_page_no == FIL_NULL);
+		}
+
+		mtr_commit(&mtr);
+	}
+}
+
+/***************************************************************
+Frees every externally stored field of a record: each field whose
+extern storage bit is set in rec is passed to
+btr_free_externally_stored_field. */
+
+void
+btr_rec_free_externally_stored_fields(
+/*==================================*/
+	dict_index_t*	index,	/* in: index of the data, the index
+				tree MUST be X-latched */
+	rec_t*		rec,	/* in: record */
+	mtr_t*		mtr)	/* in: mini-transaction handle which contains
+				an X-latch to record page and to the index
+				tree */
+{
+	ulint	field_len;
+	byte*	field;
+	ulint	n;
+	ulint	k;
+
+	ut_ad(mtr_memo_contains(mtr, buf_block_align(rec),
+							MTR_MEMO_PAGE_X_FIX));
+
+	/* Records whose data size is at most REC_1BYTE_OFFS_LIMIT are
+	not scanned at all.
+	NOTE(review): presumably such a record uses 1-byte field offsets,
+	which have no room for the extern bit - confirm */
+
+	if (rec_get_data_size(rec) > REC_1BYTE_OFFS_LIMIT) {
+
+		n = rec_get_n_fields(rec);
+
+		for (k = 0; k < n; k++) {
+			if (!rec_get_nth_field_extern_bit(rec, k)) {
+
+				continue;
+			}
+
+			field = rec_get_nth_field(rec, k, &field_len);
+			btr_free_externally_stored_field(index, field,
+							field_len, mtr);
+		}
+	}
+}
+
+/***************************************************************
+Goes through the fields listed in the update vector and frees the
+external storage of each of them whose extern storage bit is set in
+rec. */
+static
+void
+btr_rec_free_updated_extern_fields(
+/*===============================*/
+	dict_index_t*	index,	/* in: index of rec; the index tree MUST be
+				X-latched */
+	rec_t*		rec,	/* in: record */
+	upd_t*		update,	/* in: update vector */
+	mtr_t*		mtr)	/* in: mini-transaction handle which contains
+				an X-latch to record page and to the tree */
+{
+	ulint	field_len;
+	ulint	field_no;
+	byte*	field;
+	ulint	n_upd;
+	ulint	k;
+
+	ut_ad(mtr_memo_contains(mtr, buf_block_align(rec),
+							MTR_MEMO_PAGE_X_FIX));
+
+	/* Records whose data size is at most REC_1BYTE_OFFS_LIMIT are
+	not scanned at all.
+	NOTE(review): presumably such a record uses 1-byte field offsets,
+	which have no room for the extern bit - confirm */
+
+	if (rec_get_data_size(rec) > REC_1BYTE_OFFS_LIMIT) {
+
+		n_upd = upd_get_n_fields(update);
+
+		for (k = 0; k < n_upd; k++) {
+			field_no = upd_get_nth_field(update, k)->field_no;
+
+			if (!rec_get_nth_field_extern_bit(rec, field_no)) {
+
+				continue;
+			}
+
+			field = rec_get_nth_field(rec, field_no, &field_len);
+			btr_free_externally_stored_field(index, field,
+							field_len, mtr);
+		}
+	}
+}
+
+/***********************************************************************
+Copies an externally stored field of a record to mem heap. Parameter
+data contains a pointer to 'internally' stored part of the field:
+possibly some data, and the reference to the externally stored part in
+the last 20 bytes of data. The BLOB pages are read one at a time, each
+in a mini-transaction of its own with an S-latch on the page. */
+
+byte*
+btr_copy_externally_stored_field(
+/*=============================*/
+				/* out: the whole field copied to heap */
+	ulint*		len,	/* out: length of the whole field */
+	byte*		data,	/* in: 'internally' stored part of the
+				field containing also the reference to
+				the external part */
+	ulint		local_len,/* in: length of data */
+	mem_heap_t*	heap)	/* in: mem heap */
+{
+	page_t*	page;
+	ulint	space_id;
+	ulint	page_no;
+	ulint	offset;
+	ulint	extern_len;
+	ulint	total_len;
+	byte*	blob_header;
+	ulint	part_len;
+	byte*	buf;
+	ulint	copied_len;
+	mtr_t	mtr;
+
+	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+	/* From here on local_len is the length of the locally stored data
+	without the reference */
+	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+	space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);
+
+	page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);
+
+	offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);
+
+	/* Currently a BLOB cannot be bigger than 4 GB; we
+	leave the 4 upper bytes in the length field unused */
+
+	extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);
+
+	total_len = local_len + extern_len;
+
+	buf = mem_heap_alloc(heap, total_len);
+
+	ut_memcpy(buf, data, local_len);
+	copied_len = local_len;
+
+	if (extern_len == 0) {
+		*len = copied_len;
+
+		return(buf);
+	}
+
+	for (;;) {
+		mtr_start(&mtr);
+
+		page = buf_page_get(space_id, page_no, RW_S_LATCH, &mtr);
+
+		buf_page_dbg_add_level(page, SYNC_EXTERN_STORAGE);
+
+		blob_header = page + offset;
+
+		part_len = btr_blob_get_part_len(blob_header);
+
+		/* The part length comes from the page: make sure the part
+		fits in what is left of buf BEFORE copying, so that a bad
+		value cannot make us write past the end of buf. Here
+		copied_len <= total_len always holds. */
+
+		ut_a(part_len <= total_len - copied_len);
+
+		ut_memcpy(buf + copied_len, blob_header + BTR_BLOB_HDR_SIZE,
+								part_len);
+		copied_len += part_len;
+
+		page_no = btr_blob_get_next_page_no(blob_header);
+
+		/* On other BLOB pages except the first the BLOB header
+		always is at the page data start: */
+
+		offset = FIL_PAGE_DATA;
+
+		mtr_commit(&mtr);
+
+		if (page_no == FIL_NULL) {
+			ut_a(copied_len == total_len);
+
+			*len = copied_len;
+
+			return(buf);
+		}
+
+		/* There is a next page: some data must still be missing */
+		ut_a(copied_len < total_len);
+	}
+}
+
+/***********************************************************************
+Copies an externally stored field of a record to mem heap. The field
+must have its extern storage bit set in rec. */
+
+byte*
+btr_rec_copy_externally_stored_field(
+/*=================================*/
+				/* out: the field copied to heap */
+	rec_t*		rec,	/* in: record */
+	ulint		no,	/* in: field number */
+	ulint*		len,	/* out: length of the field */
+	mem_heap_t*	heap)	/* in: mem heap */
+{
+	byte*	local_part;
+	ulint	local_part_len;
+
+	ut_a(rec_get_nth_field_extern_bit(rec, no));
+
+	/* The locally stored part of an externally stored field may
+	begin with some data of the field. Its last 20 bytes hold the
+	space id, the page number, and the offset where the rest of the
+	field data is stored, and the length of that externally stored
+	data. Some data may have to be kept locally to get the local
+	record length above the 128 byte limit: then field offsets are
+	stored in two bytes, and the extern bit is available in those
+	two bytes. */
+
+	local_part = rec_get_nth_field(rec, no, &local_part_len);
+
+	return(btr_copy_externally_stored_field(len, local_part,
+						local_part_len, heap));
+}
diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c
index ede9e621462..3fabe6c6d0e 100644
--- a/innobase/buf/buf0buf.c
+++ b/innobase/buf/buf0buf.c
@@ -216,14 +216,44 @@ buf_calc_page_checksum(
/* out: checksum */
byte* page) /* in: buffer page */
{
- ulint checksum;
+ ulint checksum;
- checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
- + ut_fold_binary(page + FIL_PAGE_DATA, UNIV_PAGE_SIZE - FIL_PAGE_DATA
- - FIL_PAGE_END_LSN);
- checksum = checksum & 0xFFFFFFFF;
+ checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
+ + ut_fold_binary(page + FIL_PAGE_DATA,
+ UNIV_PAGE_SIZE - FIL_PAGE_DATA
+ - FIL_PAGE_END_LSN);
+ checksum = checksum & 0xFFFFFFFF;
- return(checksum);
+ return(checksum);
+}
+
+/************************************************************************
+Checks if a page is corrupt. */
+
+ibool
+buf_page_is_corrupted(
+/*==================*/
+				/* out: TRUE if corrupted */
+	byte*	read_buf)	/* in: a database page */
+{
+	byte*	page_end_field;
+	ulint	computed_checksum;
+	ulint	stored_first_word;
+
+	/* Start of the 8-byte field stored at the very end of the page */
+	page_end_field = read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN;
+
+	computed_checksum = buf_calc_page_checksum(read_buf);
+	stored_first_word = mach_read_from_4(page_end_field);
+
+	/* The last 4 bytes of the page must always agree with the
+	second 4-byte word of the lsn stored in the page header */
+
+	if (mach_read_from_4(read_buf + FIL_PAGE_LSN + 4)
+	    != mach_read_from_4(page_end_field + 4)) {
+
+		return(TRUE);
+	}
+
+	/* The first 4 bytes of the end field must either hold the
+	page checksum, or be equal to the first 4-byte word of the
+	header lsn (presumably a page written before the checksum
+	was introduced) */
+
+	if (stored_first_word == computed_checksum
+	    || stored_first_word
+	       == mach_read_from_4(read_buf + FIL_PAGE_LSN)) {
+
+		return(FALSE);
+	}
+
+	return(TRUE);
}
/************************************************************************
@@ -1265,34 +1295,22 @@ buf_page_io_complete(
dulint id;
dict_index_t* index;
ulint io_type;
- ulint checksum;
ut_ad(block);
io_type = block->io_fix;
if (io_type == BUF_IO_READ) {
- checksum = buf_calc_page_checksum(block->frame);
-
/* From version 3.23.38 up we store the page checksum
to the 4 upper bytes of the page end lsn field */
- if ((mach_read_from_4(block->frame + FIL_PAGE_LSN + 4)
- != mach_read_from_4(block->frame + UNIV_PAGE_SIZE
- - FIL_PAGE_END_LSN + 4))
- || (checksum != mach_read_from_4(block->frame
- + UNIV_PAGE_SIZE
- - FIL_PAGE_END_LSN)
- && mach_read_from_4(block->frame + FIL_PAGE_LSN)
- != mach_read_from_4(block->frame
- + UNIV_PAGE_SIZE
- - FIL_PAGE_END_LSN))) {
- fprintf(stderr,
+ if (buf_page_is_corrupted(block->frame)) {
+ fprintf(stderr,
"InnoDB: Database page corruption or a failed\n"
"InnoDB: file read of page %lu.\n", block->offset);
- fprintf(stderr,
+ fprintf(stderr,
"InnoDB: You may have to recover from a backup.\n");
- exit(1);
+ exit(1);
}
if (recv_recovery_is_on()) {
@@ -1601,11 +1619,28 @@ void
buf_print_io(void)
/*==============*/
{
+ ulint size;
+
ut_ad(buf_pool);
+ size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE;
+
mutex_enter(&(buf_pool->mutex));
+
+ printf("LRU list length %lu \n", UT_LIST_GET_LEN(buf_pool->LRU));
+ printf("Free list length %lu \n", UT_LIST_GET_LEN(buf_pool->free));
+ printf("Flush list length %lu \n",
+ UT_LIST_GET_LEN(buf_pool->flush_list));
+ printf("Buffer pool size in pages %lu\n", size);
- printf("pages read %lu, created %lu, written %lu\n",
+ printf("Pending reads %lu \n", buf_pool->n_pend_reads);
+
+ printf("Pending writes: LRU %lu, flush list %lu, single page %lu\n",
+ buf_pool->n_flush[BUF_FLUSH_LRU],
+ buf_pool->n_flush[BUF_FLUSH_LIST],
+ buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
+
+ printf("Pages read %lu, created %lu, written %lu\n",
buf_pool->n_pages_read, buf_pool->n_pages_created,
buf_pool->n_pages_written);
mutex_exit(&(buf_pool->mutex));
diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c
index 7129b8d20a9..0f27cee45a5 100644
--- a/innobase/buf/buf0flu.c
+++ b/innobase/buf/buf0flu.c
@@ -1,7 +1,7 @@
/******************************************************
The database buffer buf_pool flush algorithm
-(c) 1995 Innobase Oy
+(c) 1995-2001 Innobase Oy
Created 11/11/1995 Heikki Tuuri
*******************************************************/
@@ -15,7 +15,6 @@ Created 11/11/1995 Heikki Tuuri
#include "ut0byte.h"
#include "ut0lst.h"
#include "fil0fil.h"
-
#include "buf0buf.h"
#include "buf0lru.h"
#include "buf0rea.h"
@@ -195,9 +194,145 @@ buf_flush_write_complete(
}
/************************************************************************
-Does an asynchronous write of a buffer page. NOTE: in simulated aio we must
-call os_aio_simulated_wake_handler_threads after we have posted a batch
-of writes! */
+Flushes possible buffered writes from the doublewrite memory buffer to disk,
+and also wakes up the aio thread if simulated aio is used. It is very
+important to call this function after a batch of writes has been posted,
+and also when we may have to wait for a page latch! Otherwise a deadlock
+of threads can occur. */
+static
+void
+buf_flush_buffered_writes(void)
+/*===========================*/
+{
+ buf_block_t* block;
+ ulint len;
+ ulint i;
+
+ /* Without a doublewrite buffer nothing is buffered in this
+ module: it is enough to wake the simulated aio handler threads */
+
+ if (trx_doublewrite == NULL) {
+ os_aio_simulated_wake_handler_threads();
+
+ return;
+ }
+
+ mutex_enter(&(trx_doublewrite->mutex));
+
+ /* Write first to doublewrite buffer blocks. We use synchronous
+ aio and thus know that file write has been completed when the
+ control returns. */
+
+ /* first_free == 0 means that no page has been posted to the
+ doublewrite memory buffer since it was last emptied.
+ NOTE(review): unlike the branch above, this early return does
+ not wake the simulated aio handler threads - confirm that this
+ is intended */
+
+ if (trx_doublewrite->first_free == 0) {
+
+ mutex_exit(&(trx_doublewrite->mutex));
+
+ return;
+ }
+
+ /* At most TRX_SYS_DOUBLEWRITE_BLOCK_SIZE pages from the start of
+ write_buf go to block1; a possible remainder goes to block2 below */
+
+ if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
+ } else {
+ len = trx_doublewrite->first_free * UNIV_PAGE_SIZE;
+ }
+
+ fil_io(OS_FILE_WRITE,
+ TRUE, TRX_SYS_SPACE,
+ trx_doublewrite->block1, 0, len,
+ (void*)trx_doublewrite->write_buf, NULL);
+
+ if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ /* The pages which did not fit in block1 */
+ len = (trx_doublewrite->first_free
+ - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE;
+
+ fil_io(OS_FILE_WRITE,
+ TRUE, TRX_SYS_SPACE,
+ trx_doublewrite->block2, 0, len,
+ (void*)(trx_doublewrite->write_buf
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE),
+ NULL);
+ }
+
+ /* Now flush the doublewrite buffer data to disk */
+
+ fil_flush(TRX_SYS_SPACE);
+
+ /* We know that the writes have been flushed to disk now
+ and in recovery we will find them in the doublewrite buffer
+ blocks. Next do the writes to the intended positions. */
+
+ /* The write is posted from the frame of each buffer block which
+ was registered in buf_block_arr when the page was posted; this is
+ the same fil_io call which buf_flush_write_block_low makes
+ directly when no doublewrite buffer is in use */
+
+ for (i = 0; i < trx_doublewrite->first_free; i++) {
+ block = trx_doublewrite->buf_block_arr[i];
+
+ fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+ FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
+ (void*)block->frame, (void*)block);
+ }
+
+ /* Wake possible simulated aio thread to actually post the
+ writes to the operating system */
+
+ os_aio_simulated_wake_handler_threads();
+
+ /* Wait that all async writes to tablespaces have been posted to
+ the OS */
+
+ os_aio_wait_until_no_pending_writes();
+
+ /* Now we flush the data to disk (for example, with fsync) */
+
+ fil_flush_file_spaces(FIL_TABLESPACE);
+
+ /* We can now reuse the doublewrite memory buffer: */
+
+ trx_doublewrite->first_free = 0;
+
+ mutex_exit(&(trx_doublewrite->mutex));
+}
+
+/************************************************************************
+Posts a buffer page for writing. If the doublewrite memory buffer is
+full, calls buf_flush_buffered_writes and waits for free space to
+appear. */
+static
+void
+buf_flush_post_to_doublewrite_buf(
+/*==============================*/
+	buf_block_t*	block)	/* in: buffer block to write */
+{
+	ibool	buffer_full;
+
+	/* Loop until we hold the doublewrite mutex while there is at
+	least one free slot in the memory buffer: if the buffer is
+	full, release the mutex, flush the buffer, and try again */
+
+	for (;;) {
+		mutex_enter(&(trx_doublewrite->mutex));
+
+		if (trx_doublewrite->first_free
+		    < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+
+			break;
+		}
+
+		mutex_exit(&(trx_doublewrite->mutex));
+
+		buf_flush_buffered_writes();
+	}
+
+	/* Copy the page to the first free slot and remember which
+	buffer block it came from */
+
+	ut_memcpy(trx_doublewrite->write_buf
+		  + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
+		  block->frame, UNIV_PAGE_SIZE);
+
+	trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = block;
+
+	trx_doublewrite->first_free++;
+
+	/* Decide under the mutex whether this page filled the buffer,
+	but do the flush only after the mutex has been released,
+	because buf_flush_buffered_writes reserves it itself */
+
+	buffer_full = (trx_doublewrite->first_free
+		       >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);
+
+	mutex_exit(&(trx_doublewrite->mutex));
+
+	if (buffer_full) {
+		buf_flush_buffered_writes();
+	}
+}
+
+/************************************************************************
+Does an asynchronous write of a buffer page. NOTE: in simulated aio and
+also when the doublewrite buffer is used, we must call
+buf_flush_buffered_writes after we have posted a batch of writes! */
static
void
buf_flush_write_block_low(
@@ -222,15 +357,24 @@ buf_flush_write_block_low(
mach_write_to_8(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN,
block->newest_modification);
+ /* Write to the page the space id and page number */
+
+ mach_write_to_4(block->frame + FIL_PAGE_SPACE, block->space);
+ mach_write_to_4(block->frame + FIL_PAGE_OFFSET, block->offset);
+
/* We overwrite the first 4 bytes of the end lsn field to store
a page checksum */
mach_write_to_4(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN,
buf_calc_page_checksum(block->frame));
- fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
- FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
+ if (!trx_doublewrite) {
+ fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+ FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
(void*)block->frame, (void*)block);
+ } else {
+ buf_flush_post_to_doublewrite_buf(block);
+ }
}
/************************************************************************
@@ -251,14 +395,14 @@ buf_flush_try_page(
buf_block_t* block;
ibool locked;
- ut_ad((flush_type == BUF_FLUSH_LRU) || (flush_type == BUF_FLUSH_LIST)
- || (flush_type == BUF_FLUSH_SINGLE_PAGE));
+ ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
+ || flush_type == BUF_FLUSH_SINGLE_PAGE);
mutex_enter(&(buf_pool->mutex));
block = buf_page_hash_get(space, offset);
- if ((flush_type == BUF_FLUSH_LIST)
+ if (flush_type == BUF_FLUSH_LIST
&& block && buf_flush_ready_for_flush(block, flush_type)) {
block->io_fix = BUF_IO_WRITE;
@@ -286,7 +430,7 @@ buf_flush_try_page(
mutex_exit(&(buf_pool->mutex));
if (!locked) {
- os_aio_simulated_wake_handler_threads();
+ buf_flush_buffered_writes();
rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);
}
@@ -300,7 +444,7 @@ buf_flush_try_page(
return(1);
- } else if ((flush_type == BUF_FLUSH_LRU) && block
+ } else if (flush_type == BUF_FLUSH_LRU && block
&& buf_flush_ready_for_flush(block, flush_type)) {
/* VERY IMPORTANT:
@@ -328,7 +472,7 @@ buf_flush_try_page(
return(1);
- } else if ((flush_type == BUF_FLUSH_SINGLE_PAGE) && block
+ } else if (flush_type == BUF_FLUSH_SINGLE_PAGE && block
&& buf_flush_ready_for_flush(block, flush_type)) {
block->io_fix = BUF_IO_WRITE;
@@ -387,6 +531,14 @@ buf_flush_try_neighbors(
low = offset;
high = offset + 1;
+ } else if (flush_type == BUF_FLUSH_LIST) {
+ /* Since semaphore waits require us to flush the
+ doublewrite buffer to disk, it is best that the
+ search area is just the page itself, to minimize
+ chances for semaphore waits */
+
+ low = offset;
+ high = offset + 1;
}
/* printf("Flush area: low %lu high %lu\n", low, high); */
@@ -418,13 +570,6 @@ buf_flush_try_neighbors(
mutex_exit(&(buf_pool->mutex));
- /* In simulated aio we wake up the i/o-handler threads now that
- we have posted a batch of writes: */
-
- /* printf("Flush count %lu ; Waking i/o handlers\n", count); */
-
- os_aio_simulated_wake_handler_threads();
-
return(count);
}
@@ -565,13 +710,15 @@ buf_flush_batch(
mutex_exit(&(buf_pool->mutex));
- if (buf_debug_prints && (page_count > 0)) {
+ buf_flush_buffered_writes();
+
+ if (buf_debug_prints && page_count > 0) {
if (flush_type == BUF_FLUSH_LRU) {
- printf("To flush %lu pages in LRU flush\n",
+ printf("Flushed %lu pages in LRU flush\n",
page_count);
} else if (flush_type == BUF_FLUSH_LIST) {
- printf("To flush %lu pages in flush list flush\n",
- page_count, flush_type);
+ printf("Flushed %lu pages in flush list flush\n",
+ page_count);
} else {
ut_error;
}
diff --git a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c
index 728bf4404b8..db187cdd896 100644
--- a/innobase/buf/buf0rea.c
+++ b/innobase/buf/buf0rea.c
@@ -49,7 +49,9 @@ ulint
buf_read_page_low(
/*==============*/
/* out: 1 if a read request was queued, 0 if the page
- already resided in buf_pool */
+ already resided in buf_pool or if the page is in
+ the doublewrite buffer blocks in which case it is never
+ read into the pool */
ibool sync, /* in: TRUE if synchronous aio is desired */
ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ...,
ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
@@ -63,6 +65,16 @@ buf_read_page_low(
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
+ if (trx_doublewrite && space == TRX_SYS_SPACE
+ && ( (offset >= trx_doublewrite->block1
+ && offset < trx_doublewrite->block1
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+ || (offset >= trx_doublewrite->block2
+ && offset < trx_doublewrite->block2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
+ return(0);
+ }
+
#ifdef UNIV_LOG_DEBUG
if (space % 2 == 1) {
/* We are updating a replicate space while holding the
diff --git a/innobase/data/data0data.c b/innobase/data/data0data.c
index fe5611bc312..4172fb9c8ce 100644
--- a/innobase/data/data0data.c
+++ b/innobase/data/data0data.c
@@ -13,7 +13,10 @@ Created 5/30/1994 Heikki Tuuri
#endif
#include "ut0rnd.h"
-
+#include "rem0rec.h"
+#include "page0page.h"
+#include "dict0dict.h"
+#include "btr0cur.h"
byte data_error; /* data pointers of tuple fields are initialized
to point here for error checking */
@@ -378,6 +381,172 @@ dtuple_sprintf(
return(len);
}
+/******************************************************************
+Moves parts of long fields in entry to the big record vector so that
+the size of tuple drops below the maximum record size allowed in the
+database. Moves data only from those fields which are not necessary
+to determine uniquely the insertion place of the tuple in the index. */
+
+big_rec_t*
+dtuple_convert_big_rec(
+/*===================*/
+ /* out, own: created big record vector,
+ NULL if we are not able to shorten
+ the entry enough, i.e., if there are
+ too many short fields in entry */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry) /* in: index entry */
+{
+ mem_heap_t* heap;
+ big_rec_t* vector;
+ dfield_t* dfield;
+ ulint size;
+ ulint n_fields;
+ ulint longest;
+ ulint longest_i;
+ ulint i;
+
+ size = rec_get_converted_size(entry);
+
+ /* The vector, its field array, and the copies of the moved data
+ are all allocated from this one heap: freeing the heap frees
+ everything */
+
+ heap = mem_heap_create(size + dtuple_get_n_fields(entry)
+ * sizeof(big_rec_field_t) + 1000);
+
+ vector = mem_heap_alloc(heap, sizeof(big_rec_t));
+
+ vector->heap = heap;
+ vector->fields = mem_heap_alloc(heap, dtuple_get_n_fields(entry)
+ * sizeof(big_rec_field_t));
+
+ /* Decide which fields to shorten: the algorithm is to look for
+ the longest field which does not occur in the ordering part
+ of any index on the table */
+
+ n_fields = 0;
+
+ /* Each round moves data from one field; we continue as long as
+ the converted size of entry is at least half of the free space
+ of an empty page, or at least REC_MAX_DATA_SIZE */
+
+ while ((rec_get_converted_size(entry)
+ >= page_get_free_space_of_empty() / 2)
+ || rec_get_converted_size(entry) >= REC_MAX_DATA_SIZE) {
+
+ longest = 0;
+ for (i = dict_index_get_n_unique_in_tree(index);
+ i < dtuple_get_n_fields(entry); i++) {
+
+ /* Skip over fields which are ordering in some index */
+
+ /* NOTE(review): nothing here distinguishes a field
+ which already ends in an extern reference - confirm
+ that callers never pass such an entry */
+
+ if (dict_field_get_col(
+ dict_index_get_nth_field(index, i))
+ ->ord_part == 0) {
+
+ dfield = dtuple_get_nth_field(entry, i);
+
+ if (dfield->len != UNIV_SQL_NULL &&
+ dfield->len > longest) {
+
+ longest = dfield->len;
+
+ longest_i = i;
+ }
+ }
+ }
+
+ /* longest_i has been assigned whenever longest > 0, and the
+ test below returns for every smaller value, so longest_i is
+ never read uninitialized after this point */
+
+ if (longest < BTR_EXTERN_FIELD_REF_SIZE + 10) {
+
+ /* Cannot shorten more */
+
+ mem_heap_free(heap);
+
+ return(NULL);
+ }
+
+ /* Move data from field longest_i to big rec vector,
+ but do not let data size of the remaining entry
+ drop below 128 which is the limit for the 2-byte
+ offset storage format in a physical record */
+
+ dfield = dtuple_get_nth_field(entry, longest_i);
+ vector->fields[n_fields].field_no = longest_i;
+
+ if (dtuple_get_data_size(entry) - dfield->len
+ <= REC_1BYTE_OFFS_LIMIT) {
+ /* Moving the whole field would leave too little data
+ in entry: move only the part which exceeds the limit */
+ vector->fields[n_fields].len =
+ dtuple_get_data_size(entry)
+ - REC_1BYTE_OFFS_LIMIT;
+ /* Since dfield will contain at least
+ a 20-byte reference to the extern storage,
+ we know that the data size of entry will be
+ > REC_1BYTE_OFFS_LIMIT */
+ } else {
+ vector->fields[n_fields].len = dfield->len;
+ }
+
+ vector->fields[n_fields].data = mem_heap_alloc(heap,
+ vector->fields[n_fields].len);
+
+ /* Copy data (from the end of field) to big rec vector */
+
+ ut_memcpy(vector->fields[n_fields].data,
+ ((byte*)dfield->data) + dfield->len
+ - vector->fields[n_fields].len,
+ vector->fields[n_fields].len);
+ /* The field now consists of the data which was not moved plus
+ room for the extern reference; the reference is placed in the
+ same buffer, over the start of the data which was copied out */
+ dfield->len = dfield->len - vector->fields[n_fields].len
+ + BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* Set the extern field reference in dfield to zero */
+ memset(((byte*)dfield->data)
+ + dfield->len - BTR_EXTERN_FIELD_REF_SIZE,
+ 0, BTR_EXTERN_FIELD_REF_SIZE);
+ n_fields++;
+ }
+
+ vector->n_fields = n_fields;
+ return(vector);
+}
+
+/******************************************************************
+Puts back to entry the data stored in vector. Note that to ensure the
+fields in entry can accommodate the data, vector must have been created
+from entry with dtuple_convert_big_rec. */
+
+void
+dtuple_convert_back_big_rec(
+/*========================*/
+	dict_index_t*	index,	/* in: index */
+	dtuple_t*	entry,	/* in: entry whose data was put to vector */
+	big_rec_t*	vector)	/* in, own: big rec vector; it is
+				freed in this function */
+{
+	big_rec_field_t*	moved;
+	dfield_t*		field;
+	byte*			ref_start;
+	ulint			k;
+
+	for (k = 0; k < vector->n_fields; k++) {
+
+		moved = vector->fields + k;
+		field = dtuple_get_nth_field(entry, moved->field_no);
+
+		/* The moved data is written back starting where the
+		extern reference begins, i.e., BTR_EXTERN_FIELD_REF_SIZE
+		bytes before the current end of the field */
+
+		ref_start = ((byte*)field->data) + field->len
+			    - BTR_EXTERN_FIELD_REF_SIZE;
+
+		ut_memcpy(ref_start, moved->data, moved->len);
+
+		/* The reference is gone and the moved data is back */
+
+		field->len = field->len + moved->len
+			     - BTR_EXTERN_FIELD_REF_SIZE;
+	}
+
+	/* Releases the heap stored in the vector */
+
+	mem_heap_free(vector->heap);
+}
+
+/******************************************************************
+Frees the memory in a big rec vector. */
+
+void
+dtuple_big_rec_free(
+/*================*/
+	big_rec_t*	vector)	/* in, own: big rec vector; it is
+				freed in this function */
+{
+	mem_heap_t*	owning_heap;
+
+	/* dtuple_convert_big_rec allocates the vector from the heap
+	stored in vector->heap: fetch the heap handle first and then
+	release the heap, which frees the vector as well */
+
+	owning_heap = vector->heap;
+
+	mem_heap_free(owning_heap);
+}
+
#ifdef notdefined
/******************************************************************
diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c
index 6f201c7bce4..5c783627721 100644
--- a/innobase/fil/fil0fil.c
+++ b/innobase/fil/fil0fil.c
@@ -90,6 +90,9 @@ struct fil_node_struct {
is ignored) */
ulint n_pending;
/* count of pending i/o-ops on this file */
+ ibool is_modified; /* this is set to TRUE when we write
+ to the file and FALSE when we call fil_flush
+ for this file space */
UT_LIST_NODE_T(fil_node_t) chain;
/* link field for the file chain */
UT_LIST_NODE_T(fil_node_t) LRU;
@@ -301,6 +304,8 @@ fil_node_create(
node->size = size;
node->magic_n = FIL_NODE_MAGIC_N;
node->n_pending = 0;
+
+ node->is_modified = FALSE;
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
@@ -721,6 +726,47 @@ fil_space_get_size(
}
/***********************************************************************
+Checks if the pair space, page_no refers to an existing page in a
+tablespace file space. */
+
+ibool
+fil_check_adress_in_tablespace(
+/*===========================*/
+			/* out: TRUE if the address is meaningful:
+			the space exists, it is a tablespace, and
+			page_no lies within it */
+	ulint	id,	/* in: space id */
+	ulint	page_no)/* in: page number */
+{
+	fil_space_t*	space;
+	fil_system_t*	system	= fil_system;
+	ibool		ret;
+
+	ut_ad(system);
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	/* space->size is taken to be the size of the space in pages,
+	with pages numbered from 0: the valid page numbers are then
+	0 ... size - 1, and page_no == size already lies one page past
+	the end of the space. Hence the comparison must be strict. */
+
+	if (space != NULL
+	    && space->purpose == FIL_TABLESPACE
+	    && page_no < space->size) {
+
+		ret = TRUE;
+	} else {
+		ret = FALSE;
+	}
+
+	mutex_exit(&(system->mutex));
+
+	return(ret);
+}
+
+/***********************************************************************
Tries to reserve free extents in a file space. */
ibool
@@ -812,8 +858,14 @@ fil_node_prepare_for_io(
fil_node_close(last_node, system);
}
- node->handle = os_file_create(node->name, OS_FILE_OPEN,
- OS_FILE_AIO, &ret);
+ if (space->purpose == FIL_LOG) {
+ node->handle = os_file_create(node->name, OS_FILE_OPEN,
+ OS_FILE_AIO, OS_LOG_FILE, &ret);
+ } else {
+ node->handle = os_file_create(node->name, OS_FILE_OPEN,
+ OS_FILE_AIO, OS_DATA_FILE, &ret);
+ }
+
ut_a(ret);
node->open = TRUE;
@@ -851,7 +903,8 @@ void
fil_node_complete_io(
/*=================*/
fil_node_t* node, /* in: file node */
- fil_system_t* system) /* in: file system */
+ fil_system_t* system, /* in: file system */
+ ulint type) /* in: OS_FILE_WRITE or ..._READ */
{
ut_ad(node);
ut_ad(system);
@@ -860,6 +913,10 @@ fil_node_complete_io(
node->n_pending--;
+ if (type != OS_FILE_READ) {
+ node->is_modified = TRUE;
+ }
+
if (node->n_pending == 0) {
/* The node must be put back to the LRU list */
UT_LIST_ADD_FIRST(LRU, system->LRU, node);
@@ -1016,7 +1073,7 @@ loop:
mutex_enter(&(system->mutex));
- fil_node_complete_io(node, system);
+ fil_node_complete_io(node, system, type);
mutex_exit(&(system->mutex));
@@ -1090,12 +1147,14 @@ fil_aio_wait(
fil_node_t* fil_node;
fil_system_t* system = fil_system;
void* message;
+ ulint type;
ut_ad(fil_validate());
if (os_aio_use_native_aio) {
#ifdef WIN_ASYNC_IO
- ret = os_aio_windows_handle(segment, 0, &fil_node, &message);
+ ret = os_aio_windows_handle(segment, 0, &fil_node, &message,
+ &type);
#elif defined(POSIX_ASYNC_IO)
ret = os_aio_posix_handle(segment, &fil_node, &message);
#else
@@ -1103,14 +1162,14 @@ fil_aio_wait(
#endif
} else {
ret = os_aio_simulated_handle(segment, (void**) &fil_node,
- &message);
+ &message, &type);
}
ut_a(ret);
mutex_enter(&(system->mutex));
- fil_node_complete_io(fil_node, fil_system);
+ fil_node_complete_io(fil_node, fil_system, type);
mutex_exit(&(system->mutex));
@@ -1149,8 +1208,10 @@ fil_flush(
node = UT_LIST_GET_FIRST(space->chain);
while (node) {
- if (node->open) {
+ if (node->open && node->is_modified) {
file = node->handle;
+
+ node->is_modified = FALSE;
mutex_exit(&(system->mutex));
@@ -1159,9 +1220,11 @@ fil_flush(
handle is still open: we assume that the OS
will not crash or trap even if we pass a handle
to a closed file below in os_file_flush! */
+
+ /* printf("Flushing to file %s\n", node->name); */
os_file_flush(file);
-
+
mutex_enter(&(system->mutex));
}
diff --git a/innobase/fsp/fsp0fsp.c b/innobase/fsp/fsp0fsp.c
index 101fb5f3ba0..ccc13f15fde 100644
--- a/innobase/fsp/fsp0fsp.c
+++ b/innobase/fsp/fsp0fsp.c
@@ -3239,8 +3239,8 @@ fsp_validate(
ut_a(descr_count * FSP_EXTENT_SIZE == free_limit);
ut_a(n_used + n_full_frag_pages
- == n_used2 + (free_limit + XDES_DESCRIBED_PER_PAGE - 1)
- / XDES_DESCRIBED_PER_PAGE
+ == n_used2 + 2* ((free_limit + XDES_DESCRIBED_PER_PAGE - 1)
+ / XDES_DESCRIBED_PER_PAGE)
+ seg_inode_len_full + seg_inode_len_free);
ut_a(frag_n_used == n_used);
diff --git a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c
index 171c6169927..3db20fb13ee 100644
--- a/innobase/ibuf/ibuf0ibuf.c
+++ b/innobase/ibuf/ibuf0ibuf.c
@@ -1946,6 +1946,7 @@ ibuf_insert_low(
ulint page_no,/* in: page number where to insert */
que_thr_t* thr) /* in: query thread */
{
+ big_rec_t* dummy_big_rec;
ulint entry_size;
btr_pcur_t pcur;
btr_cur_t* cursor;
@@ -2101,7 +2102,8 @@ ibuf_insert_low(
if (mode == BTR_MODIFY_PREV) {
err = btr_cur_optimistic_insert(BTR_NO_LOCKING_FLAG, cursor,
- ibuf_entry, &ins_rec, thr,
+ ibuf_entry, &ins_rec,
+ &dummy_big_rec, thr,
&mtr);
if (err == DB_SUCCESS) {
/* Update the page max trx id field */
@@ -2121,7 +2123,8 @@ ibuf_insert_low(
err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG
| BTR_NO_UNDO_LOG_FLAG,
cursor,
- ibuf_entry, &ins_rec, thr,
+ ibuf_entry, &ins_rec,
+ &dummy_big_rec, thr,
&mtr);
if (err == DB_SUCCESS) {
/* Update the page max trx id field */
diff --git a/innobase/include/btr0btr.h b/innobase/include/btr0btr.h
index f8a3000ca8a..bea85565125 100644
--- a/innobase/include/btr0btr.h
+++ b/innobase/include/btr0btr.h
@@ -357,6 +357,44 @@ btr_get_size(
/* out: number of pages */
dict_index_t* index, /* in: index */
ulint flag); /* in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+/******************************************************************
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents! */
+
+page_t*
+btr_page_alloc(
+/*===========*/
+ /* out: new allocated page, x-latched;
+ NULL if out of space */
+ dict_tree_t* tree, /* in: index tree */
+ ulint hint_page_no, /* in: hint of a good page */
+ byte file_direction, /* in: direction where a possible
+ page split is made */
+ ulint level, /* in: level where the page is placed
+ in the tree */
+ mtr_t* mtr); /* in: mtr */
+/******************************************************************
+Frees a file page used in an index tree. NOTE: cannot free field external
+storage pages because the page must contain info on its level. */
+
+void
+btr_page_free(
+/*==========*/
+ dict_tree_t* tree, /* in: index tree */
+ page_t* page, /* in: page to be freed, x-latched */
+ mtr_t* mtr); /* in: mtr */
+/******************************************************************
+Frees a file page used in an index tree. Can also be used to free BLOB
+external storage pages, because the page level 0 can be given as an
+argument. */
+
+void
+btr_page_free_low(
+/*==============*/
+ dict_tree_t* tree, /* in: index tree */
+ page_t* page, /* in: page to be freed, x-latched */
+ ulint level, /* in: page level */
+ mtr_t* mtr); /* in: mtr */
/*****************************************************************
Prints size info of a B-tree. */
diff --git a/innobase/include/btr0cur.h b/innobase/include/btr0cur.h
index 4ce2177bfe8..ffae434a5d9 100644
--- a/innobase/include/btr0cur.h
+++ b/innobase/include/btr0cur.h
@@ -151,11 +151,14 @@ btr_cur_optimistic_insert(
ulint flags, /* in: undo logging and locking flags: if not
zero, the parameters index and thr should be
specified */
- btr_cur_t* cursor, /* in: cursor on page after which
- to insert; cursor stays valid */
+ btr_cur_t* cursor, /* in: cursor on page after which to insert;
+ cursor stays valid */
dtuple_t* entry, /* in: entry to insert */
rec_t** rec, /* out: pointer to inserted record if
succeed */
+ big_rec_t** big_rec,/* out: big rec vector whose fields have to
+ be stored externally by the caller, or
+ NULL */
que_thr_t* thr, /* in: query thread or NULL */
mtr_t* mtr); /* in: mtr */
/*****************************************************************
@@ -169,13 +172,19 @@ btr_cur_pessimistic_insert(
/*=======================*/
/* out: DB_SUCCESS or error number */
ulint flags, /* in: undo logging and locking flags: if not
- zero, the parameters index and thr should be
- specified */
+ zero, the parameter thr should be
+ specified; if no undo logging is specified,
+ then the caller must have reserved enough
+ free extents in the file space so that the
+ insertion will certainly succeed */
btr_cur_t* cursor, /* in: cursor after which to insert;
- cursor does not stay valid */
+ cursor stays valid */
dtuple_t* entry, /* in: entry to insert */
rec_t** rec, /* out: pointer to inserted record if
succeed */
+ big_rec_t** big_rec,/* out: big rec vector whose fields have to
+ be stored externally by the caller, or
+ NULL */
que_thr_t* thr, /* in: query thread or NULL */
mtr_t* mtr); /* in: mtr */
/*****************************************************************
@@ -228,8 +237,9 @@ btr_cur_pessimistic_update(
/* out: DB_SUCCESS or error code */
ulint flags, /* in: undo logging, locking, and rollback
flags */
- btr_cur_t* cursor, /* in: cursor on the record to update;
- cursor does not stay valid */
+ btr_cur_t* cursor, /* in: cursor on the record to update */
+ big_rec_t** big_rec,/* out: big rec vector whose fields have to
+ be stored externally by the caller, or NULL */
upd_t* update, /* in: update vector; this is allowed also
contain trx id and roll ptr fields, but
the values in update vector have no effect */
@@ -407,6 +417,92 @@ btr_estimate_number_of_different_key_vals(
/*======================================*/
/* out: estimated number of key values */
dict_index_t* index); /* in: index */
+/***********************************************************************
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The fields are stored on pages allocated from leaf node
+file segment of the index tree. */
+
+ulint
+btr_store_big_rec_extern_fields(
+/*============================*/
+ /* out: DB_SUCCESS or error */
+ dict_index_t* index, /* in: index of rec; the index tree
+ MUST be X-latched */
+ rec_t* rec, /* in: record */
+ big_rec_t* big_rec_vec, /* in: vector containing fields
+ to be stored externally */
+ mtr_t* local_mtr); /* in: mtr containing the latch to
+ rec and to the tree */
+/***********************************************************************
+Frees the space in an externally stored field to the file space
+management. */
+
+void
+btr_free_externally_stored_field(
+/*=============================*/
+ dict_index_t* index, /* in: index of the data, the index
+ tree MUST be X-latched */
+ byte* data, /* in: internally stored data
+ + reference to the externally
+ stored part */
+ ulint local_len, /* in: length of data */
+ mtr_t* local_mtr); /* in: mtr containing the latch to
+ data and an X-latch to the index
+ tree */
+/***************************************************************
+Frees the externally stored fields for a record. */
+
+void
+btr_rec_free_externally_stored_fields(
+/*==================================*/
+ dict_index_t* index, /* in: index of the data, the index
+ tree MUST be X-latched */
+ rec_t* rec, /* in: record */
+ mtr_t* mtr); /* in: mini-transaction handle which contains
+ an X-latch to record page and to the index
+ tree */
+/***********************************************************************
+Copies an externally stored field of a record to mem heap. */
+
+byte*
+btr_rec_copy_externally_stored_field(
+/*=================================*/
+ /* out: the field copied to heap */
+ rec_t* rec, /* in: record */
+ ulint no, /* in: field number */
+ ulint* len, /* out: length of the field */
+ mem_heap_t* heap); /* in: mem heap */
+/***********************************************************************
+Copies an externally stored field of a record to mem heap. Parameter
+data contains a pointer to 'internally' stored part of the field:
+possibly some data, and the reference to the externally stored part in
+the last 20 bytes of data. */
+
+byte*
+btr_copy_externally_stored_field(
+/*=============================*/
+ /* out: the whole field copied to heap */
+ ulint* len, /* out: length of the whole field */
+ byte* data, /* in: 'internally' stored part of the
+ field containing also the reference to
+ the external part */
+ ulint local_len,/* in: length of data */
+ mem_heap_t* heap); /* in: mem heap */
+/***********************************************************************
+Stores the positions of the fields marked as extern storage in the update
+vector, and also those fields which are marked as extern storage in rec
+and not mentioned in updated fields. We use this function to remember
+which fields we must mark as extern storage in a record inserted for an
+update. */
+
+ulint
+btr_push_update_extern_fields(
+/*==========================*/
+ /* out: number of values stored in ext_vect */
+ ulint* ext_vect, /* in: array of ulints, must be preallocated
+ to have place for all fields in rec */
+ rec_t* rec, /* in: record */
+ upd_t* update); /* in: update vector */
/*######################################################################*/
@@ -516,6 +612,19 @@ and sleep this many microseconds in between */
#define BTR_CUR_RETRY_DELETE_N_TIMES 100
#define BTR_CUR_RETRY_SLEEP_TIME 50000
+/* The reference in a field of which data is stored on a different page */
+/*--------------------------------------*/
+#define BTR_EXTERN_SPACE_ID 0 /* space id where stored */
+#define BTR_EXTERN_PAGE_NO 4 /* page no where stored */
+#define BTR_EXTERN_OFFSET 8 /* offset of BLOB header
+ on that page */
+#define BTR_EXTERN_LEN 12 /* 8 bytes containing the
+ length of the externally
+ stored part of the BLOB */
+/*--------------------------------------*/
+#define BTR_EXTERN_FIELD_REF_SIZE 20
+
+
extern ulint btr_cur_n_non_sea;
#ifndef UNIV_NONINL
diff --git a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h
index 7f3e20a4505..8b22561adf8 100644
--- a/innobase/include/buf0buf.h
+++ b/innobase/include/buf0buf.h
@@ -378,6 +378,14 @@ buf_calc_page_checksum(
/*===================*/
/* out: checksum */
byte* page); /* in: buffer page */
+/************************************************************************
+Checks if a page is corrupt. */
+
+ibool
+buf_page_is_corrupted(
+/*==================*/
+ /* out: TRUE if corrupted */
+ byte* read_buf); /* in: a database page */
/**************************************************************************
Gets the page number of a pointer pointing within a buffer frame containing
a file page. */
diff --git a/innobase/include/buf0flu.h b/innobase/include/buf0flu.h
index 9317950904f..cb1c0965a65 100644
--- a/innobase/include/buf0flu.h
+++ b/innobase/include/buf0flu.h
@@ -101,7 +101,7 @@ make sure that a read-ahead batch can be read efficiently in a single
sweep). */
#define BUF_FLUSH_FREE_BLOCK_MARGIN (5 + BUF_READ_AHEAD_AREA)
-#define BUF_FLUSH_EXTRA_MARGIN (BUF_FLUSH_FREE_BLOCK_MARGIN / 4)
+#define BUF_FLUSH_EXTRA_MARGIN (BUF_FLUSH_FREE_BLOCK_MARGIN / 4 + 100)
#ifndef UNIV_NONINL
#include "buf0flu.ic"
diff --git a/innobase/include/data0data.h b/innobase/include/data0data.h
index d7f0986b0b6..f695e0989a5 100644
--- a/innobase/include/data0data.h
+++ b/innobase/include/data0data.h
@@ -14,6 +14,9 @@ Created 5/30/1994 Heikki Tuuri
#include "data0types.h"
#include "data0type.h"
#include "mem0mem.h"
+#include "dict0types.h"
+
+typedef struct big_rec_struct big_rec_t;
/* Some non-inlined functions used in the MySQL interface: */
void
@@ -312,6 +315,41 @@ dtuple_sprintf(
char* buf, /* in: print buffer */
ulint buf_len,/* in: buf length in bytes */
dtuple_t* tuple); /* in: tuple */
+/******************************************************************
+Moves parts of long fields in entry to the big record vector so that
+the size of tuple drops below the maximum record size allowed in the
+database. Moves data only from those fields which are not necessary
+to determine uniquely the insertion place of the tuple in the index. */
+
+big_rec_t*
+dtuple_convert_big_rec(
+/*===================*/
+ /* out, own: created big record vector,
+ NULL if we are not able to shorten
+ the entry enough, i.e., if there are
+ too many short fields in entry */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry); /* in: index entry */
+/******************************************************************
+Puts back to entry the data stored in vector. Note that to ensure the
+fields in entry can accommodate the data, vector must have been created
+from entry with dtuple_convert_big_rec. */
+
+void
+dtuple_convert_back_big_rec(
+/*========================*/
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: entry whose data was put to vector */
+ big_rec_t* vector);/* in, own: big rec vector; it is
+ freed in this function */
+/******************************************************************
+Frees the memory in a big rec vector. */
+
+void
+dtuple_big_rec_free(
+/*================*/
+ big_rec_t* vector); /* in, own: big rec vector; it is
+ freed in this function */
/***************************************************************
Generates a random tuple. */
@@ -396,7 +434,7 @@ dtuple_gen_search_tuple_TPC_C(
/* Structure for an SQL data field */
struct dfield_struct{
void* data; /* pointer to data */
- ulint len; /* data length; UNIV_SQL_NULL if SQL null */
+ ulint len; /* data length; UNIV_SQL_NULL if SQL null; */
dtype_t type; /* type of data */
ulint col_no; /* when building index entries, the column
number can be stored here */
@@ -423,6 +461,24 @@ struct dtuple_struct {
};
#define DATA_TUPLE_MAGIC_N 65478679
+/* A slot for a field in a big rec vector */
+
+typedef struct big_rec_field_struct big_rec_field_t;
+struct big_rec_field_struct {
+ ulint field_no; /* field number in record */
+ ulint len; /* stored data len */
+ byte* data; /* stored data */
+};
+
+/* Storage format for overflow data in a big record, that is, a record
+which needs external storage of data fields */
+
+struct big_rec_struct {
+ mem_heap_t* heap; /* memory heap from which allocated */
+ ulint n_fields; /* number of stored fields */
+ big_rec_field_t* fields; /* stored fields */
+};
+
#ifndef UNIV_NONINL
#include "data0data.ic"
#endif
diff --git a/innobase/include/data0data.ic b/innobase/include/data0data.ic
index 27b5552d338..b886ad6c69c 100644
--- a/innobase/include/data0data.ic
+++ b/innobase/include/data0data.ic
@@ -307,12 +307,13 @@ dtuple_create(
/**************************************************************
The following function returns the sum of data lengths of a tuple. The space
-occupied by the field structs or the tuple struct is not counted. */
+occupied by the field structs or the tuple struct is not counted. Neither
+is possible space in externally stored parts of the field. */
UNIV_INLINE
ulint
dtuple_get_data_size(
/*=================*/
- /* out: sum of data lens */
+ /* out: sum of data lengths */
dtuple_t* tuple) /* in: typed data tuple */
{
dfield_t* field;
@@ -382,7 +383,7 @@ dtuple_datas_are_equal(
field2 = dtuple_get_nth_field(tuple2, i);
data2 = (byte*) dfield_get_data(field2);
- len2 = dfield_get_len(field2);
+ len2 = dfield_get_len(field2);
if (len1 != len2) {
diff --git a/innobase/include/dict0mem.h b/innobase/include/dict0mem.h
index be9cd42b7be..74ecbc8bba2 100644
--- a/innobase/include/dict0mem.h
+++ b/innobase/include/dict0mem.h
@@ -143,7 +143,7 @@ struct dict_col_struct{
ulint clust_pos;/* position of the column in the
clustered index */
ulint ord_part;/* count of how many times this column
- appears in an ordering fields of an index */
+ appears in ordering fields of an index */
char* name; /* name */
dtype_t type; /* data type */
dict_table_t* table; /* back pointer to table of this column */
diff --git a/innobase/include/fil0fil.h b/innobase/include/fil0fil.h
index 9905b5a2c3c..bfc322270fc 100644
--- a/innobase/include/fil0fil.h
+++ b/innobase/include/fil0fil.h
@@ -196,6 +196,16 @@ fil_space_get_size(
/* out: space size */
ulint id); /* in: space id */
/***********************************************************************
+Checks if the pair space, page_no refers to an existing page in a
+tablespace file space. */
+
+ibool
+fil_check_adress_in_tablespace(
+/*===========================*/
+ /* out: TRUE if the address is meaningful */
+ ulint id, /* in: space id */
+ ulint page_no);/* in: page number */
+/***********************************************************************
Appends a new file to the chain of files of a space.
File must be closed. */
diff --git a/innobase/include/fsp0fsp.h b/innobase/include/fsp0fsp.h
index f1be4de4d40..e7f9eab330b 100644
--- a/innobase/include/fsp0fsp.h
+++ b/innobase/include/fsp0fsp.h
@@ -70,7 +70,7 @@ page_t*
fseg_create(
/*========*/
/* out: the page where the segment header is placed,
- x-latched, FIL_NULL if could not create segment
+ x-latched, NULL if could not create segment
because of lack of space */
ulint space, /* in: space id */
ulint page, /* in: page where the segment header is placed: if
diff --git a/innobase/include/mach0data.ic b/innobase/include/mach0data.ic
index 176f3415281..1d6badd035b 100644
--- a/innobase/include/mach0data.ic
+++ b/innobase/include/mach0data.ic
@@ -115,7 +115,7 @@ mach_write_to_4(
{
ut_ad(b);
-#if notdefined && !defined(__STDC__) && defined(UNIV_INTEL) && (UNIV_WORD_SIZE == 4) && defined(UNIV_VISUALC)
+#if (0 == 1) && !defined(__STDC__) && defined(UNIV_INTEL) && (UNIV_WORD_SIZE == 4) && defined(UNIV_VISUALC)
/* We do not use this even on Intel, because unaligned accesses may
be slow */
@@ -143,7 +143,7 @@ mach_read_from_4(
/* out: ulint integer */
byte* b) /* in: pointer to four bytes */
{
-#if notdefined && !defined(__STDC__) && defined(UNIV_INTEL) && (UNIV_WORD_SIZE == 4) && defined(UNIV_VISUALC)
+#if (0 == 1) && !defined(__STDC__) && defined(UNIV_INTEL) && (UNIV_WORD_SIZE == 4) && defined(UNIV_VISUALC)
/* We do not use this even on Intel, because unaligned accesses may
be slow */
diff --git a/innobase/include/os0file.h b/innobase/include/os0file.h
index c093cb92ca9..75bbbba549f 100644
--- a/innobase/include/os0file.h
+++ b/innobase/include/os0file.h
@@ -59,6 +59,10 @@ log. */
#define OS_FILE_AIO 61
#define OS_FILE_NORMAL 62
+/* Types for file create */
+#define OS_DATA_FILE 100
+#define OS_LOG_FILE 101
+
/* Error codes from os_file_get_last_error */
#define OS_FILE_NOT_FOUND 71
#define OS_FILE_DISK_FULL 72
@@ -125,6 +129,7 @@ os_file_create(
if a new file is created or an old overwritten */
ulint purpose,/* in: OS_FILE_AIO, if asynchronous, non-buffered i/o
is desired, OS_FILE_NORMAL, if any normal file */
+ ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */
ibool* success);/* out: TRUE if succeed, FALSE if error */
/***************************************************************************
Closes a file handle. In case of error, error number can be retrieved with
@@ -263,6 +268,13 @@ os_aio(
operation); if mode is OS_AIO_SYNC, these
are ignored */
void* message2);
+/****************************************************************************
+Waits until there are no pending writes in os_aio_write_array. There can
+be other, synchronous, pending writes. */
+
+void
+os_aio_wait_until_no_pending_writes(void);
+/*=====================================*/
/**************************************************************************
Wakes up simulated aio i/o-handler threads if they have something to do. */
@@ -298,7 +310,8 @@ os_aio_windows_handle(
the aio operation failed, these output
parameters are valid and can be used to
restart the operation, for example */
- void** message2);
+ void** message2,
+ ulint* type); /* out: OS_FILE_WRITE or ..._READ */
#endif
#ifdef POSIX_ASYNC_IO
/**************************************************************************
@@ -335,7 +348,8 @@ os_aio_simulated_handle(
the aio operation failed, these output
parameters are valid and can be used to
restart the operation, for example */
- void** message2);
+ void** message2,
+ ulint* type); /* out: OS_FILE_WRITE or ..._READ */
/**************************************************************************
Validates the consistency of the aio system. */
diff --git a/innobase/include/rem0cmp.h b/innobase/include/rem0cmp.h
index 77b9ef9edc8..10c428cb9ca 100644
--- a/innobase/include/rem0cmp.h
+++ b/innobase/include/rem0cmp.h
@@ -1,7 +1,7 @@
/***********************************************************************
Comparison services for records
-(c) 1994-1996 Innobase Oy
+(c) 1994-2001 Innobase Oy
Created 7/1/1994 Heikki Tuuri
************************************************************************/
@@ -31,14 +31,18 @@ This function is used to compare a data tuple to a physical record.
Only dtuple->n_fields_cmp first fields are taken into account for
the the data tuple! If we denote by n = n_fields_cmp, then rec must
have either m >= n fields, or it must differ from dtuple in some of
-the m fields rec has. */
+the m fields rec has. If rec has an externally stored field we do not
+compare it but return with value 0 if such a comparison should be
+made. */
int
cmp_dtuple_rec_with_match(
/*======================*/
/* out: 1, 0, -1, if dtuple is greater, equal,
less than rec, respectively, when only the
- common first fields are compared */
+ common first fields are compared, or
+ until the first externally stored field in
+ rec */
dtuple_t* dtuple, /* in: data tuple */
rec_t* rec, /* in: physical record which differs from
dtuple in some of the common fields, or which
@@ -89,7 +93,8 @@ cmp_dtuple_rec_prefix_equal(
fields in dtuple */
/*****************************************************************
This function is used to compare two physical records. Only the common
-first fields are compared. */
+first fields are compared, and if an externally stored field is
+encountered, then 0 is returned. */
int
cmp_rec_rec_with_match(
diff --git a/innobase/include/rem0rec.h b/innobase/include/rem0rec.h
index 62c0aa14519..12e3a8b39d6 100644
--- a/innobase/include/rem0rec.h
+++ b/innobase/include/rem0rec.h
@@ -12,6 +12,7 @@ Created 5/30/1994 Heikki Tuuri
#include "univ.i"
#include "data0data.h"
#include "rem0types.h"
+#include "mtr0types.h"
/* Maximum values for various fields (for non-blob tuples) */
#define REC_MAX_N_FIELDS (1024 - 1)
@@ -162,6 +163,49 @@ rec_get_nth_field_size(
/* out: field size in bytes */
rec_t* rec, /* in: record */
ulint n); /* in: index of the field */
+/***************************************************************
+Gets the value of the ith field extern storage bit. If it is TRUE
+it means that the field is stored on another page. */
+UNIV_INLINE
+ibool
+rec_get_nth_field_extern_bit(
+/*=========================*/
+ /* out: TRUE if the extern bit is set */
+ rec_t* rec, /* in: record */
+ ulint i); /* in: ith field */
+/**********************************************************
+Returns TRUE if the extern bit is set in any of the fields
+of rec. */
+UNIV_INLINE
+ibool
+rec_contains_externally_stored_field(
+/*=================================*/
+ /* out: TRUE if a field is stored externally */
+ rec_t* rec); /* in: record */
+/***************************************************************
+Sets the value of the ith field extern storage bit. */
+
+void
+rec_set_nth_field_extern_bit(
+/*=========================*/
+ rec_t* rec, /* in: record */
+ ulint i, /* in: ith field */
+ ibool val, /* in: value to set */
+ mtr_t* mtr); /* in: mtr holding an X-latch to the page where
+ rec is, or NULL; in the NULL case we do not
+ write to log about the change */
+/***************************************************************
+Sets TRUE the extern storage bits of fields mentioned in an array. */
+
+void
+rec_set_field_extern_bits(
+/*======================*/
+ rec_t* rec, /* in: record */
+ ulint* vec, /* in: array of field numbers */
+ ulint n_fields, /* in: number of field numbers in vec */
+ mtr_t* mtr); /* in: mtr holding an X-latch to the page
+ where rec is, or NULL; in the NULL case we
+ do not write to log about the change */
/****************************************************************
The following function is used to get a copy of the nth
data field in the record to a buffer. */
@@ -350,6 +394,15 @@ rec_sprintf(
#define REC_INFO_BITS 6 /* This is single byte bit-field */
+/* Maximum lengths for the data in a physical record if the offsets
+are given in one byte (resp. two byte) format. */
+#define REC_1BYTE_OFFS_LIMIT 0x7F
+#define REC_2BYTE_OFFS_LIMIT 0x7FFF
+
+/* The data size of a record must be smaller than this because we reserve
+the two uppermost bits in a two-byte offset for special purposes */
+#define REC_MAX_DATA_SIZE (16 * 1024)
+
#ifndef UNIV_NONINL
#include "rem0rec.ic"
#endif
diff --git a/innobase/include/rem0rec.ic b/innobase/include/rem0rec.ic
index c63b25374dd..1e9ecb47e2e 100644
--- a/innobase/include/rem0rec.ic
+++ b/innobase/include/rem0rec.ic
@@ -25,12 +25,6 @@ significant bytes and bits are written below less significant.
4 bits info bits
*/
-
-/* Maximum lengths for the data in a physical record if the offsets
-are given as one byte (resp. two byte) format. */
-#define REC_1BYTE_OFFS_LIMIT 0x7F
-#define REC_2BYTE_OFFS_LIMIT 0x7FFF
-
/* We list the byte offsets from the origin of the record, the mask,
and the shift needed to obtain each bit-field of the record. */
@@ -66,6 +60,11 @@ one-byte and two-byte offsets */
#define REC_1BYTE_SQL_NULL_MASK 0x80
#define REC_2BYTE_SQL_NULL_MASK 0x8000
+/* In a 2-byte offset the second most significant bit denotes
+a field stored on another page: */
+
+#define REC_2BYTE_EXTERN_MASK 0x4000
+
/***************************************************************
Sets the value of the ith field SQL null bit. */
@@ -489,7 +488,7 @@ ulint
rec_2_get_field_end_info(
/*=====================*/
/* out: offset of the start of the field, SQL null
- flag ORed */
+ flag and extern storage flag ORed */
rec_t* rec, /* in: record */
ulint n) /* in: field index */
{
@@ -499,6 +498,63 @@ rec_2_get_field_end_info(
return(mach_read_from_2(rec - (REC_N_EXTRA_BYTES + 2 * n + 2)));
}
+/***************************************************************
+Gets the value of the ith field extern storage bit. If it is TRUE
+it means that the field is stored on another page. For a record in the
+1-byte offsets format FALSE is always returned; otherwise the bit
+REC_2BYTE_EXTERN_MASK of the 2-byte field end info is tested. */
+UNIV_INLINE
+ibool
+rec_get_nth_field_extern_bit(
+/*=========================*/
+ /* out: TRUE if the extern bit is set */
+ rec_t* rec, /* in: record */
+ ulint i) /* in: ith field */
+{
+ ulint info;
+
+ if (rec_get_1byte_offs_flag(rec)) {
+
+ return(FALSE);
+ }
+
+ info = rec_2_get_field_end_info(rec, i);
+
+ if (info & REC_2BYTE_EXTERN_MASK) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************
+Returns TRUE if the extern bit is set in any of the fields
+of rec. For a record in the 1-byte offsets format FALSE is returned
+without looking at the fields; otherwise every field from 0 to
+rec_get_n_fields(rec) - 1 is checked until one with the bit set
+is found. */
+UNIV_INLINE
+ibool
+rec_contains_externally_stored_field(
+/*=================================*/
+ /* out: TRUE if a field is stored externally */
+ rec_t* rec) /* in: record */
+{
+ ulint n;
+ ulint i;
+
+ if (rec_get_1byte_offs_flag(rec)) {
+
+ return(FALSE);
+ }
+
+ n = rec_get_n_fields(rec);
+
+ for (i = 0; i < n; i++) {
+ if (rec_get_nth_field_extern_bit(rec, i)) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
/**********************************************************
Returns the offset of n - 1th field end if the record is stored in the 1-byte
offsets form. If the field is SQL null, the flag is ORed in the returned
@@ -616,7 +672,7 @@ rec_2_get_field_start_offs(
}
return(rec_2_get_prev_field_end_info(rec, n)
- & ~REC_2BYTE_SQL_NULL_MASK);
+ & ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK));
}
/**********************************************************
diff --git a/innobase/include/row0ins.h b/innobase/include/row0ins.h
index 94b0e8dec37..612b9e8d73a 100644
--- a/innobase/include/row0ins.h
+++ b/innobase/include/row0ins.h
@@ -56,6 +56,9 @@ row_ins_index_entry_low(
pessimistic descent down the index tree */
dict_index_t* index, /* in: index */
dtuple_t* entry, /* in: index entry to insert */
+ ulint* ext_vec,/* in: array containing field numbers of
+ externally stored fields in entry, or NULL */
+ ulint n_ext_vec,/* in: number of fields in ext_vec */
que_thr_t* thr); /* in: query thread */
/*******************************************************************
Inserts an index entry to index. Tries first optimistic, then pessimistic
@@ -70,6 +73,9 @@ row_ins_index_entry(
DB_DUPLICATE_KEY, or some other error code */
dict_index_t* index, /* in: index */
dtuple_t* entry, /* in: index entry to insert */
+ ulint* ext_vec,/* in: array containing field numbers of
+ externally stored fields in entry, or NULL */
+ ulint n_ext_vec,/* in: number of fields in ext_vec */
que_thr_t* thr); /* in: query thread */
/***************************************************************
Inserts a row to a table. */
diff --git a/innobase/include/row0mysql.h b/innobase/include/row0mysql.h
index 554da2c035c..31f9e15cddc 100644
--- a/innobase/include/row0mysql.h
+++ b/innobase/include/row0mysql.h
@@ -189,7 +189,9 @@ row_update_for_mysql(
row_prebuilt_t* prebuilt); /* in: prebuilt struct in MySQL
handle */
/*************************************************************************
-Does a table creation operation for MySQL. */
+Does a table creation operation for MySQL. If the name of the created
+table ends in the characters INNODB_MONITOR, then this also starts
+printing of monitor output by the master thread. */
int
row_create_table_for_mysql(
@@ -209,7 +211,9 @@ row_create_index_for_mysql(
dict_index_t* index, /* in: index defintion */
trx_t* trx); /* in: transaction handle */
/*************************************************************************
-Drops a table for MySQL. */
+Drops a table for MySQL. If the name of the dropped table ends in the
+characters INNODB_MONITOR, then this also stops printing of monitor
+output by the master thread. */
int
row_drop_table_for_mysql(
diff --git a/innobase/include/row0row.h b/innobase/include/row0row.h
index fb1e1b01ee3..09a79e19fd7 100644
--- a/innobase/include/row0row.h
+++ b/innobase/include/row0row.h
@@ -250,6 +250,7 @@ row_search_index_entry(
#define ROW_COPY_DATA 1
#define ROW_COPY_POINTERS 2
+#define ROW_COPY_ALSO_EXTERNALS 3
/* The allowed latching order of index records is the following:
(1) a secondary index record ->
diff --git a/innobase/include/row0upd.h b/innobase/include/row0upd.h
index 3046345f446..9bb73726b29 100644
--- a/innobase/include/row0upd.h
+++ b/innobase/include/row0upd.h
@@ -147,6 +147,9 @@ row_upd_build_difference(
fields, excluding roll ptr and trx id */
dict_index_t* index, /* in: clustered index */
dtuple_t* entry, /* in: entry to insert */
+ ulint* ext_vec,/* in: array containing field numbers of
+ externally stored fields in entry, or NULL */
+ ulint n_ext_vec,/* in: number of fields in ext_vec */
rec_t* rec, /* in: clustered index record */
mem_heap_t* heap); /* in: memory heap from which allocated */
/***************************************************************
@@ -262,6 +265,9 @@ struct upd_field_struct{
constants in the symbol table of the
query graph */
dfield_t new_val; /* new value for the column */
+ ibool extern_storage; /* this is set to TRUE if dfield
+ actually contains a reference to
+ an externally stored field */
};
/* Update vector structure */
@@ -318,6 +324,10 @@ struct upd_node_struct{
dtuple_t* row; /* NULL, or a copy (also fields copied to
heap) of the row to update; this must be reset
to NULL after a successful update */
+ ulint* ext_vec;/* array describing which fields are stored
+ externally in the clustered index record of
+ row */
+ ulint n_ext_vec;/* number of fields in ext_vec */
mem_heap_t* heap; /* memory heap used as auxiliary storage for
row; this must be emptied after a successful
update if node->row != NULL */
@@ -349,7 +359,7 @@ struct upd_node_struct{
looked at and updated if an ordering
field changed */
-/* Compilation info flags: these must fit within one byte */
+/* Compilation info flags: these must fit within 3 bits; see trx0rec.h */
#define UPD_NODE_NO_ORD_CHANGE 1 /* no secondary index record will be
changed in the update and no ordering
field of the clustered index */
diff --git a/innobase/include/row0upd.ic b/innobase/include/row0upd.ic
index b1b10bef0e8..b785e52caa0 100644
--- a/innobase/include/row0upd.ic
+++ b/innobase/include/row0upd.ic
@@ -23,6 +23,7 @@ upd_create(
mem_heap_t* heap) /* in: heap from which memory allocated */
{
upd_t* update;
+ ulint i;
update = mem_heap_alloc(heap, sizeof(upd_t));
@@ -30,6 +31,10 @@ upd_create(
update->n_fields = n;
update->fields = mem_heap_alloc(heap, sizeof(upd_field_t) * n);
+ for (i = 0; i < n; i++) {
+ update->fields[i].extern_storage = 0;
+ }
+
return(update);
}
diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h
index f80abda19c6..e635964e5ec 100644
--- a/innobase/include/srv0srv.h
+++ b/innobase/include/srv0srv.h
@@ -27,6 +27,9 @@ extern char** srv_data_file_names;
extern ulint* srv_data_file_sizes;
extern ulint* srv_data_file_is_raw_partition;
+#define SRV_NEW_RAW 1
+#define SRV_OLD_RAW 2
+
extern char** srv_log_group_home_dirs;
extern ulint srv_n_log_groups;
@@ -52,10 +55,14 @@ extern ulint srv_lock_wait_timeout;
extern char* srv_unix_file_flush_method_str;
extern ulint srv_unix_file_flush_method;
+extern ibool srv_use_doublewrite_buf;
+
extern ibool srv_set_thread_priorities;
extern int srv_query_thread_priority;
/*-------------------------------------------*/
+
+extern ibool srv_print_innodb_monitor;
extern ulint srv_n_spin_wait_rounds;
extern ulint srv_spin_wait_delay;
extern ibool srv_priority_boost;
@@ -104,26 +111,13 @@ typedef struct srv_sys_struct srv_sys_t;
/* The server system */
extern srv_sys_t* srv_sys;
-/* Alternatives for file flush option in Unix; see the InnoDB manual about
+/* Alternatives for file flush option in Unix; see the InnoDB manual about
what these mean */
#define SRV_UNIX_FDATASYNC 1
#define SRV_UNIX_O_DSYNC 2
#define SRV_UNIX_LITTLESYNC 3
#define SRV_UNIX_NOSYNC 4
-/* Raw partition flags */
-#define SRV_OLD_RAW 1
-#define SRV_NEW_RAW 2
-
-void
-srv_mysql_thread_release(void);
-/*==========================*/
-os_event_t
-srv_mysql_thread_event_get(void);
-void
-srv_mysql_thread_slot_free(
-/*==========================*/
- os_event_t event);
/*************************************************************************
Boots Innobase server. */
diff --git a/innobase/include/sync0sync.h b/innobase/include/sync0sync.h
index 4b12dd3c86d..cb86b2b815c 100644
--- a/innobase/include/sync0sync.h
+++ b/innobase/include/sync0sync.h
@@ -393,6 +393,7 @@ Memory pool mutex */
#define SYNC_RSEG_HEADER_NEW 591
#define SYNC_RSEG_HEADER 590
#define SYNC_TRX_UNDO_PAGE 570
+#define SYNC_EXTERN_STORAGE 500
#define SYNC_FSP 400
#define SYNC_FSP_PAGE 395
/*------------------------------------- Insert buffer headers */
@@ -415,6 +416,7 @@ Memory pool mutex */
the level is SYNC_MEM_HASH. */
#define SYNC_BUF_POOL 150
#define SYNC_BUF_BLOCK 149
+#define SYNC_DOUBLEWRITE 140
#define SYNC_ANY_LATCH 135
#define SYNC_MEM_HASH 131
#define SYNC_MEM_POOL 130
diff --git a/innobase/include/trx0rec.h b/innobase/include/trx0rec.h
index ea9e9f3fce5..edfc283d1b2 100644
--- a/innobase/include/trx0rec.h
+++ b/innobase/include/trx0rec.h
@@ -45,6 +45,14 @@ trx_undo_rec_get_cmpl_info(
/* out: compiler info */
trx_undo_rec_t* undo_rec); /* in: undo log record */
/**************************************************************************
+Returns TRUE if an undo log record contains an extern storage field. */
+UNIV_INLINE
+ibool
+trx_undo_rec_get_extern_storage(
+/*============================*/
+ /* out: TRUE if extern */
+ trx_undo_rec_t* undo_rec); /* in: undo log record */
+/**************************************************************************
Reads the undo log record number. */
UNIV_INLINE
dulint
@@ -65,6 +73,8 @@ trx_undo_rec_get_pars(
TRX_UNDO_INSERT_REC, ... */
ulint* cmpl_info, /* out: compiler info, relevant only
for update type records */
+ ibool* updated_extern, /* out: TRUE if we updated an
+ externally stored field */
dulint* undo_no, /* out: undo log record number */
dulint* table_id); /* out: table id */
/***********************************************************************
@@ -272,7 +282,11 @@ record */
do not change */
#define TRX_UNDO_CMPL_INFO_MULT 16 /* compilation info is multiplied by
this and ORed to the type above */
-
+#define TRX_UNDO_UPD_EXTERN 128 /* This bit can be ORed to type_cmpl
+ to denote that we updated external
+ storage fields: used by purge to
+ free the external storage */
+
/* Operation type flags used in trx_undo_report_row_operation */
#define TRX_UNDO_INSERT_OP 1
#define TRX_UNDO_MODIFY_OP 2
diff --git a/innobase/include/trx0rec.ic b/innobase/include/trx0rec.ic
index f813a52ff9c..cd02ed9e04c 100644
--- a/innobase/include/trx0rec.ic
+++ b/innobase/include/trx0rec.ic
@@ -31,6 +31,23 @@ trx_undo_rec_get_cmpl_info(
}
/**************************************************************************
+Returns TRUE if an undo log record contains an extern storage field.
+The answer is read from the TRX_UNDO_UPD_EXTERN bit of the byte at
+offset 2 of the undo log record. */
+UNIV_INLINE
+ibool
+trx_undo_rec_get_extern_storage(
+/*============================*/
+ /* out: TRUE if extern */
+ trx_undo_rec_t* undo_rec) /* in: undo log record */
+{
+ if (mach_read_from_1(undo_rec + 2) & TRX_UNDO_UPD_EXTERN) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/**************************************************************************
Reads the undo log record number. */
UNIV_INLINE
dulint
diff --git a/innobase/include/trx0sys.h b/innobase/include/trx0sys.h
index d0506dd65b7..e26f7e19850 100644
--- a/innobase/include/trx0sys.h
+++ b/innobase/include/trx0sys.h
@@ -27,6 +27,23 @@ Created 3/26/1996 Heikki Tuuri
/* The transaction system */
extern trx_sys_t* trx_sys;
+/* Doublewrite system */
+extern trx_doublewrite_t* trx_doublewrite;
+
+/********************************************************************
+Creates the doublewrite buffer at a database start. The header of the
+doublewrite buffer is placed on the trx system header page. */
+
+void
+trx_sys_create_doublewrite_buf(void);
+/*================================*/
+/********************************************************************
+At a database startup uses a possible doublewrite buffer to restore
+half-written pages in the data files. */
+
+void
+trx_sys_doublewrite_restore_corrupt_pages(void);
+/*===========================================*/
/*******************************************************************
Checks if a page address is the trx sys header page. */
UNIV_INLINE
@@ -235,6 +252,59 @@ therefore 256 */
segment specification slots */
/*-------------------------------------------------------------*/
+/* The offset of the doublewrite buffer header on the trx system header page */
+#define TRX_SYS_DOUBLEWRITE (UNIV_PAGE_SIZE - 200)
+/*-------------------------------------------------------------*/
+#define TRX_SYS_DOUBLEWRITE_FSEG 0 /* fseg header of the fseg
+ containing the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_MAGIC FSEG_HEADER_SIZE
+ /* 4-byte magic number which
+ shows if we already have
+ created the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK1 (4 + FSEG_HEADER_SIZE)
+ /* page number of the
+ first page in the first
+ sequence of 64
+ (= FSP_EXTENT_SIZE) consecutive
+ pages in the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK2 (8 + FSEG_HEADER_SIZE)
+ /* page number of the
+ first page in the second
+ sequence of 64 consecutive
+ pages in the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_REPEAT 12 /* we repeat the above 3
+ numbers so that if the trx
+ sys header is half-written
+ to disk, we still may be able
+ to recover the information */
+/*-------------------------------------------------------------*/
+#define TRX_SYS_DOUBLEWRITE_MAGIC_N 536853855
+
+#define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE
+
+/* Doublewrite control struct */
+struct trx_doublewrite_struct{
+ mutex_t mutex; /* mutex protecting the first_free field and
+ write_buf */
+ ulint block1; /* the page number of the first
+ doublewrite block (64 pages) */
+ ulint block2; /* page number of the second block */
+ ulint first_free; /* first free position in write_buf measured
+ in units of UNIV_PAGE_SIZE */
+ byte* write_buf; /* write buffer used in writing to the
+ doublewrite buffer, aligned to an
+ address divisible by UNIV_PAGE_SIZE
+ (which is required by Windows aio) */
+ byte* write_buf_unaligned; /* pointer to write_buf, but unaligned */
+ buf_block_t**
+ buf_block_arr; /* array to store pointers to the buffer
+ blocks which have been cached to write_buf */
+};
+
/* The transaction system central memory data structure; protected by the
kernel mutex */
struct trx_sys_struct{
diff --git a/innobase/include/trx0types.h b/innobase/include/trx0types.h
index 02da1605077..b8befe7172f 100644
--- a/innobase/include/trx0types.h
+++ b/innobase/include/trx0types.h
@@ -15,6 +15,7 @@ Created 3/26/1996 Heikki Tuuri
/* Memory objects */
typedef struct trx_struct trx_t;
typedef struct trx_sys_struct trx_sys_t;
+typedef struct trx_doublewrite_struct trx_doublewrite_t;
typedef struct trx_sig_struct trx_sig_t;
typedef struct trx_rseg_struct trx_rseg_t;
typedef struct trx_undo_struct trx_undo_t;
diff --git a/innobase/include/trx0undo.h b/innobase/include/trx0undo.h
index 82c21f756e6..7f0378c68d3 100644
--- a/innobase/include/trx0undo.h
+++ b/innobase/include/trx0undo.h
@@ -341,7 +341,9 @@ struct trx_undo_struct{
have delete marked records, because of
a delete of a row or an update of an
indexed field; purge is then
- necessary. */
+ necessary; also TRUE if the transaction
+ has updated an externally stored
+ field */
dulint trx_id; /* id of the trx assigned to the undo
log */
ibool dict_operation; /* TRUE if a dict operation trx */
diff --git a/innobase/include/univ.i b/innobase/include/univ.i
index 73bf48b1bc0..6ffbb1b8fef 100644
--- a/innobase/include/univ.i
+++ b/innobase/include/univ.i
@@ -9,11 +9,12 @@ Created 1/20/1994 Heikki Tuuri
#ifndef univ_i
#define univ_i
-#undef UNIV_INTEL_X86
-
-#if (defined(_WIN32) || defined(_WIN64)) && !defined(MYSQL_SERVER)
+#if (defined(_WIN32) || defined(_WIN64))
#define __WIN__
+
+#ifndef MYSQL_SERVER
#include <windows.h>
+#endif
/* If you want to check for errors with compiler level -W4,
comment out the above include of windows.h and let the following defines
@@ -40,10 +41,8 @@ subdirectory of 'mysql'. */
#include <global.h>
#include <my_pthread.h>
-#ifndef __WIN__
/* Include <sys/stat.h> to get S_I... macros defined for os0file.c */
#include <sys/stat.h>
-#endif
#undef PACKAGE
#undef VERSION
@@ -63,19 +62,21 @@ subdirectory of 'mysql'. */
/* DEBUG VERSION CONTROL
===================== */
+
+/*
+#define UNIV_SYNC_DEBUG
+*/
+
/* Make a non-inline debug version */
/*
#define UNIV_DEBUG
#define UNIV_MEM_DEBUG
-#define UNIV_SYNC_DEBUG
#define UNIV_SEARCH_DEBUG
#define UNIV_IBUF_DEBUG
#define UNIV_SYNC_PERF_STAT
#define UNIV_SEARCH_PERF_STAT
-
-#define UNIV_DEBUG_FILE_ACCESSES
*/
#define UNIV_LIGHT_MEM_DEBUG
@@ -192,6 +193,13 @@ headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */
has the SQL NULL as its value. */
#define UNIV_SQL_NULL ULINT_UNDEFINED
+/* Lengths which are not UNIV_SQL_NULL, but bigger than the following
+number indicate that a field contains a reference to an externally
+stored part of the field in the tablespace. The length field then
+contains the sum of the following flag and the locally stored len. */
+
+#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE)
+
/* The following definition of __FILE__ removes compiler warnings
associated with const char* / char* mismatches with __FILE__ */
diff --git a/innobase/include/ut0dbg.h b/innobase/include/ut0dbg.h
index 657d1bf95b2..fc5d493ca5e 100644
--- a/innobase/include/ut0dbg.h
+++ b/innobase/include/ut0dbg.h
@@ -41,7 +41,7 @@ extern ulint* ut_dbg_null_ptr;
}\
if (ut_dbg_stop_threads) {\
fprintf(stderr,\
- "Innobase: Thread %lu stopped in file %s line %lu\n",\
+ "InnoDB: Thread %lu stopped in file %s line %lu\n",\
os_thread_get_curr_id(), IB__FILE__, (ulint)__LINE__);\
os_thread_sleep(1000000000);\
}\
@@ -50,19 +50,17 @@ extern ulint* ut_dbg_null_ptr;
#define ut_error {\
ulint dbg_i;\
fprintf(stderr,\
- "Innobase: Assertion failure in thread %lu in file %s line %lu\n",\
+ "InnoDB: Assertion failure in thread %lu in file %s line %lu\n",\
os_thread_get_curr_id(), IB__FILE__, (ulint)__LINE__);\
fprintf(stderr,\
- "Innobase: we intentionally generate a memory trap.\n");\
+ "InnoDB: We intentionally generate a memory trap.\n");\
fprintf(stderr,\
- "Innobase: Send a bug report to mysql@lists.mysql.com\n");\
+ "InnoDB: Send a detailed bug report to mysql@lists.mysql.com\n");\
ut_dbg_stop_threads = TRUE;\
dbg_i = *(ut_dbg_null_ptr);\
printf("%lu", dbg_i);\
}
-
-
#ifdef UNIV_DEBUG
#define ut_ad(EXPR) ut_a(EXPR)
#define ut_d(EXPR) {EXPR;}
diff --git a/innobase/include/ut0ut.h b/innobase/include/ut0ut.h
index 1e93a2b8a36..e1813e763bd 100644
--- a/innobase/include/ut0ut.h
+++ b/innobase/include/ut0ut.h
@@ -11,8 +11,7 @@ Created 1/20/1994 Heikki Tuuri
#include "univ.i"
#include <time.h>
-#include <m_ctype.h>
-
+#include <ctype.h>
typedef time_t ib_time_t;
diff --git a/innobase/lock/lock0lock.c b/innobase/lock/lock0lock.c
index 79fb66459b2..5f8f538f392 100644
--- a/innobase/lock/lock0lock.c
+++ b/innobase/lock/lock0lock.c
@@ -3219,6 +3219,7 @@ lock_rec_print(
ulint space;
ulint page_no;
ulint i;
+ ulint count = 0;
mtr_t mtr;
ut_ad(mutex_own(&kernel_mutex));
@@ -3230,7 +3231,8 @@ lock_rec_print(
printf("\nRECORD LOCKS space id %lu page no %lu n bits %lu",
space, page_no, lock_rec_get_n_bits(lock));
- printf(" index %s trx id %lu %lu", (lock->index)->name,
+ printf(" table %s index %s trx id %lu %lu",
+ lock->index->table->name, lock->index->name,
(lock->trx)->id.high, (lock->trx)->id.low);
if (lock_get_mode(lock) == LOCK_S) {
@@ -3281,10 +3283,18 @@ lock_rec_print(
rec_print(page_find_rec_with_heap_no(page, i));
}
+ count++;
+
printf("\n");
}
- }
+ if (count >= 3) {
+ printf(
+ "3 LOCKS PRINTED FOR THIS TRX AND PAGE: SUPPRESSING FURTHER PRINTS\n");
+ goto end_prints;
+ }
+ }
+end_prints:
mtr_commit(&mtr);
}
@@ -3335,7 +3345,6 @@ lock_print_info(void)
lock_mutex_enter_kernel();
- printf("------------------------------------\n");
printf("LOCK INFO:\n");
printf("Number of locks in the record hash table %lu\n",
lock_get_n_rec_locks());
@@ -3352,7 +3361,7 @@ loop:
if (trx == NULL) {
lock_mutex_exit_kernel();
- lock_validate();
+ /* lock_validate(); */
return;
}
@@ -3360,6 +3369,19 @@ loop:
if (nth_lock == 0) {
printf("\nLOCKS FOR TRANSACTION ID %lu %lu\n", trx->id.high,
trx->id.low);
+ if (trx->que_state == TRX_QUE_LOCK_WAIT) {
+ printf(
+ "################# TRX IS WAITING FOR THE LOCK: ###\n");
+
+ if (lock_get_type(trx->wait_lock) == LOCK_REC) {
+ lock_rec_print(trx->wait_lock);
+ } else {
+ lock_table_print(trx->wait_lock);
+ }
+
+ printf(
+ "##################################################\n");
+ }
}
i = 0;
@@ -3409,6 +3431,16 @@ loop:
nth_lock++;
+ if (nth_lock >= 25) {
+ printf(
+ "25 LOCKS PRINTED FOR THIS TRX: SUPPRESSING FURTHER PRINTS\n");
+
+ nth_trx++;
+ nth_lock = 0;
+
+ goto loop;
+ }
+
goto loop;
}
diff --git a/innobase/log/log0log.c b/innobase/log/log0log.c
index 31cf595e59e..351ea7f2fd5 100644
--- a/innobase/log/log0log.c
+++ b/innobase/log/log0log.c
@@ -838,7 +838,9 @@ log_io_complete(
/* It was a checkpoint write */
group = (log_group_t*)((ulint)group - 1);
- if (srv_unix_file_flush_method == SRV_UNIX_LITTLESYNC) {
+ if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
+ && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
+
fil_flush(group->space_id);
}
@@ -847,7 +849,9 @@ log_io_complete(
return;
}
- if (srv_unix_file_flush_method == SRV_UNIX_LITTLESYNC) {
+ if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
+ && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
+
fil_flush(group->space_id);
}
@@ -1478,7 +1482,7 @@ log_checkpoint(
recv_apply_hashed_log_recs(TRUE);
}
- if (srv_unix_file_flush_method == SRV_UNIX_LITTLESYNC) {
+ if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
fil_flush_file_spaces(FIL_TABLESPACE);
}
@@ -1885,10 +1889,11 @@ loop:
fil_reserve_right_to_open();
file_handle = os_file_create(name, open_mode, OS_FILE_AIO,
- &ret);
+ OS_DATA_FILE, &ret);
+
if (!ret && (open_mode == OS_FILE_CREATE)) {
file_handle = os_file_create(name, OS_FILE_OPEN,
- OS_FILE_AIO, &ret);
+ OS_FILE_AIO, OS_DATA_FILE, &ret);
}
if (!ret) {
diff --git a/innobase/log/log0recv.c b/innobase/log/log0recv.c
index d16085a2d6f..edab98fa39c 100644
--- a/innobase/log/log0recv.c
+++ b/innobase/log/log0recv.c
@@ -2234,7 +2234,8 @@ try_open_again:
fil_reserve_right_to_open();
- file_handle = os_file_create(name, OS_FILE_OPEN, OS_FILE_AIO, &ret);
+ file_handle = os_file_create(name, OS_FILE_OPEN,
+ OS_FILE_AIO, OS_LOG_FILE, &ret); /* purpose first, then file type, as in os_file_create() */
if (ret == FALSE) {
fil_release_right_to_open();
diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c
index d3c6232031a..0525fd7b59a 100644
--- a/innobase/os/os0file.c
+++ b/innobase/os/os0file.c
@@ -10,6 +10,7 @@ Created 10/21/1995 Heikki Tuuri
#include "os0sync.h"
#include "ut0mem.h"
#include "srv0srv.h"
+#include "trx0sys.h"
#undef HAVE_FDATASYNC
@@ -74,9 +75,12 @@ typedef struct os_aio_array_struct os_aio_array_t;
struct os_aio_array_struct{
os_mutex_t mutex; /* the mutex protecting the aio array */
- os_event_t not_full; /* The event which is set to signaled
+ os_event_t not_full; /* The event which is set to the signaled
state when there is space in the aio
outside the ibuf segment */
+ os_event_t is_empty; /* The event which is set to the signaled
+ state when there are no pending i/os
+ in this array */
ulint n_slots; /* Total number of slots in the aio array.
This must be divisible by n_threads. */
ulint n_segments;/* Number of segments in the aio array of
@@ -254,6 +258,7 @@ os_file_create(
if a new is created or an old overwritten */
ulint purpose,/* in: OS_FILE_AIO, if asynchronous, non-buffered i/o
is desired, OS_FILE_NORMAL, if any normal file */
+ ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */
ibool* success)/* out: TRUE if succeed, FALSE if error */
{
#ifdef __WIN__
@@ -347,11 +352,10 @@ try_again:
UT_NOT_USED(purpose);
- /* Currently use only O_SYNC because there may be a bug in
- Linux O_DSYNC! */
-
#ifdef O_SYNC
- if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
+ if ((!srv_use_doublewrite_buf || type != OS_DATA_FILE)
+ && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
+
create_flag = create_flag | O_SYNC;
}
#endif
@@ -551,12 +555,6 @@ os_file_flush(
#else
int ret;
-#ifdef O_DSYNC
- if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
- return(TRUE);
- }
-#endif
-
#ifdef HAVE_FDATASYNC
ret = fdatasync(file);
#else
@@ -637,7 +635,8 @@ os_file_pwrite(
ret = pwrite(file, buf, n, offs);
if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
- && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
+ && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
+ && !trx_doublewrite) {
/* Always do fsync to reduce the probability that when
the OS crashes, a database page is only partially
@@ -666,7 +665,8 @@ os_file_pwrite(
ret = write(file, buf, n);
if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
- && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
+ && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
+ && !trx_doublewrite) {
/* Always do fsync to reduce the probability that when
the OS crashes, a database page is only partially
@@ -825,7 +825,9 @@ try_again:
/* Always do fsync to reduce the probability that when the OS crashes,
a database page is only partially physically written to disk. */
- ut_a(TRUE == os_file_flush(file));
+ if (!trx_doublewrite) {
+ ut_a(TRUE == os_file_flush(file));
+ }
os_mutex_exit(os_file_seek_mutexes[i]);
@@ -900,6 +902,10 @@ os_aio_array_create(
array->mutex = os_mutex_create(NULL);
array->not_full = os_event_create(NULL);
+ array->is_empty = os_event_create(NULL);
+
+ os_event_set(array->is_empty);
+
array->n_slots = n;
array->n_segments = n_segments;
array->n_reserved = 0;
@@ -999,6 +1005,17 @@ os_aio_init(
#endif
}
+/****************************************************************************
+Waits until there are no pending writes in os_aio_write_array. There can
+be other, synchronous, pending writes. Takes no arguments, returns nothing. */
+
+void
+os_aio_wait_until_no_pending_writes(void)
+/*=====================================*/
+{
+ os_event_wait(os_aio_write_array->is_empty); /* is_empty is set when n_reserved drops to 0 and reset when a slot is reserved */
+}
+
/**************************************************************************
Calculates segment number for a slot. */
static
@@ -1191,6 +1208,10 @@ loop:
array->n_reserved++;
+ if (array->n_reserved == 1) {
+ os_event_reset(array->is_empty);
+ }
+
if (array->n_reserved == array->n_slots) {
os_event_reset(array->not_full);
}
@@ -1264,6 +1285,10 @@ os_aio_array_free_slot(
os_event_set(array->not_full);
}
+ if (array->n_reserved == 0) {
+ os_event_set(array->is_empty);
+ }
+
#ifdef WIN_ASYNC_IO
os_event_reset(slot->control.hEvent);
#endif
@@ -1377,6 +1402,7 @@ os_aio(
DWORD len = n;
void* dummy_mess1;
void* dummy_mess2;
+ ulint dummy_type;
#endif
ulint err = 0;
ibool retry;
@@ -1489,8 +1515,9 @@ try_again:
use the same wait mechanism as for async i/o */
return(os_aio_windows_handle(ULINT_UNDEFINED,
- slot->pos,
- &dummy_mess1, &dummy_mess2));
+ slot->pos,
+ &dummy_mess1, &dummy_mess2,
+ &dummy_type));
}
return(TRUE);
@@ -1547,7 +1574,8 @@ os_aio_windows_handle(
the aio operation failed, these output
parameters are valid and can be used to
restart the operation, for example */
- void** message2)
+ void** message2,
+ ulint* type) /* out: OS_FILE_WRITE or ..._READ */
{
os_aio_array_t* array;
os_aio_slot_t* slot;
@@ -1592,10 +1620,12 @@ os_aio_windows_handle(
*message1 = slot->message1;
*message2 = slot->message2;
+ *type = slot->type;
+
if (ret && len == slot->len) {
ret_val = TRUE;
- if (slot->type == OS_FILE_WRITE) {
+ if (slot->type == OS_FILE_WRITE && !trx_doublewrite) {
ut_a(TRUE == os_file_flush(slot->file));
}
} else {
@@ -1679,7 +1709,7 @@ os_aio_posix_handle(
*message1 = slot->message1;
*message2 = slot->message2;
- if (slot->type == OS_FILE_WRITE) {
+ if (slot->type == OS_FILE_WRITE && !trx_doublewrite) {
ut_a(TRUE == os_file_flush(slot->file));
}
@@ -1709,7 +1739,8 @@ os_aio_simulated_handle(
the aio operation failed, these output
parameters are valid and can be used to
restart the operation, for example */
- void** message2)
+ void** message2,
+ ulint* type) /* out: OS_FILE_WRITE or ..._READ */
{
os_aio_array_t* array;
ulint segment;
@@ -1906,6 +1937,8 @@ slot_io_done:
*message1 = slot->message1;
*message2 = slot->message2;
+ *type = slot->type;
+
os_mutex_exit(array->mutex);
os_aio_array_free_slot(array, slot);
@@ -1989,13 +2022,13 @@ os_aio_print(void)
os_aio_slot_t* slot;
ulint n_reserved;
ulint i;
-
+
+ printf("Pending normal aio reads:\n");
+
array = os_aio_read_array;
loop:
ut_a(array);
- printf("INFO OF AN AIO ARRAY\n");
-
os_mutex_enter(array->mutex);
ut_a(array->n_slots > 0);
@@ -2022,24 +2055,29 @@ loop:
os_mutex_exit(array->mutex);
if (array == os_aio_read_array) {
+ printf("Pending aio writes:\n");
+
array = os_aio_write_array;
goto loop;
}
if (array == os_aio_write_array) {
+ printf("Pending insert buffer aio reads:\n");
array = os_aio_ibuf_array;
goto loop;
}
if (array == os_aio_ibuf_array) {
+ printf("Pending log writes or reads:\n");
array = os_aio_log_array;
goto loop;
}
if (array == os_aio_log_array) {
+ printf("Pending synchronous reads or writes:\n");
array = os_aio_sync_array;
goto loop;
diff --git a/innobase/page/page0cur.c b/innobase/page/page0cur.c
index e329b916b1b..0b233b4dd72 100644
--- a/innobase/page/page0cur.c
+++ b/innobase/page/page0cur.c
@@ -1019,16 +1019,16 @@ page_cur_delete_rec(
page_cur_t* cursor, /* in: a page cursor */
mtr_t* mtr) /* in: mini-transaction handle */
{
+ page_dir_slot_t* cur_dir_slot;
+ page_dir_slot_t* prev_slot;
page_t* page;
rec_t* current_rec;
rec_t* prev_rec = NULL;
rec_t* next_rec;
ulint cur_slot_no;
- page_dir_slot_t* cur_dir_slot;
- page_dir_slot_t* prev_slot;
ulint cur_n_owned;
rec_t* rec;
-
+
ut_ad(cursor && mtr);
page = page_cur_get_page(cursor);
@@ -1037,7 +1037,7 @@ page_cur_delete_rec(
/* The record must not be the supremum or infimum record. */
ut_ad(current_rec != page_get_supremum_rec(page));
ut_ad(current_rec != page_get_infimum_rec(page));
-
+
/* Save to local variables some data associated with current_rec */
cur_slot_no = page_dir_find_owner_slot(current_rec);
cur_dir_slot = page_dir_get_nth_slot(page, cur_slot_no);
diff --git a/innobase/pars/pars0pars.c b/innobase/pars/pars0pars.c
index f6c51f3905a..4a298426476 100644
--- a/innobase/pars/pars0pars.c
+++ b/innobase/pars/pars0pars.c
@@ -2028,11 +2028,7 @@ pars_complete_graph_for_exec(
que_node_set_parent(node, thr);
- mutex_enter(&kernel_mutex);
-
trx->graph = NULL;
- mutex_exit(&kernel_mutex);
-
return(thr);
}
diff --git a/innobase/rem/rem0cmp.c b/innobase/rem/rem0cmp.c
index 78f4e450269..cdf1f363946 100644
--- a/innobase/rem/rem0cmp.c
+++ b/innobase/rem/rem0cmp.c
@@ -295,14 +295,18 @@ This function is used to compare a data tuple to a physical record.
Only dtuple->n_fields_cmp first fields are taken into account for
the the data tuple! If we denote by n = n_fields_cmp, then rec must
have either m >= n fields, or it must differ from dtuple in some of
-the m fields rec has. */
+the m fields rec has. If rec has an externally stored field we do not
+compare it but return with value 0 if such a comparison should be
+made. */
int
cmp_dtuple_rec_with_match(
/*======================*/
/* out: 1, 0, -1, if dtuple is greater, equal,
less than rec, respectively, when only the
- common first fields are compared */
+ common first fields are compared, or
+ until the first externally stored field in
+ rec */
dtuple_t* dtuple, /* in: data tuple */
rec_t* rec, /* in: physical record which differs from
dtuple in some of the common fields, or which
@@ -344,7 +348,8 @@ cmp_dtuple_rec_with_match(
ut_ad(cur_field <= dtuple_get_n_fields_cmp(dtuple));
ut_ad(cur_field <= rec_get_n_fields(rec));
- /* Match fields in a loop; stop if we run out of fields in dtuple */
+ /* Match fields in a loop; stop if we run out of fields in dtuple
+ or find an externally stored field */
while (cur_field < dtuple_get_n_fields_cmp(dtuple)) {
@@ -357,7 +362,8 @@ cmp_dtuple_rec_with_match(
/* If we have matched yet 0 bytes, it may be that one or
both the fields are SQL null, or the record or dtuple may be
- the predefined minimum record */
+ the predefined minimum record, or the field is externally
+ stored */
if (cur_bytes == 0) {
if (cur_field == 0) {
@@ -384,6 +390,15 @@ cmp_dtuple_rec_with_match(
}
}
+ if (rec_get_nth_field_extern_bit(rec, cur_field)) {
+ /* We do not compare to an externally
+ stored field */
+
+ ret = 0;
+
+ goto order_resolved;
+ }
+
if (dtuple_f_len == UNIV_SQL_NULL
|| rec_f_len == UNIV_SQL_NULL) {
@@ -604,7 +619,8 @@ cmp_dtuple_rec_prefix_equal(
/*****************************************************************
This function is used to compare two physical records. Only the common
-first fields are compared. */
+first fields are compared, and if an externally stored field is
+encountered, then 0 is returned. */
int
cmp_rec_rec_with_match(
@@ -688,8 +704,18 @@ cmp_rec_rec_with_match(
goto order_resolved;
}
- }
+ }
+
+ if (rec_get_nth_field_extern_bit(rec1, cur_field)
+ || rec_get_nth_field_extern_bit(rec2, cur_field)) {
+ /* We do not compare to an externally
+ stored field */
+ ret = 0;
+
+ goto order_resolved;
+ }
+
if (rec1_f_len == UNIV_SQL_NULL
|| rec2_f_len == UNIV_SQL_NULL) {
@@ -812,7 +838,8 @@ order_resolved:
Used in debug checking of cmp_dtuple_... .
This function is used to compare a data tuple to a physical record. If
dtuple has n fields then rec must have either m >= n fields, or it must
-differ from dtuple in some of the m fields rec has. */
+differ from dtuple in some of the m fields rec has. If it encounters an
+externally stored field, it returns 0. */
static
int
cmp_debug_dtuple_rec_with_match(
@@ -882,6 +909,14 @@ cmp_debug_dtuple_rec_with_match(
rec_f_data = rec_get_nth_field(rec, cur_field, &rec_f_len);
+ if (rec_get_nth_field_extern_bit(rec, cur_field)) {
+ /* We do not compare to an externally stored field */
+
+ ret = 0;
+
+ goto order_resolved;
+ }
+
ret = cmp_data_data(cur_type, dtuple_f_data, dtuple_f_len,
rec_f_data, rec_f_len);
if (ret != 0) {
diff --git a/innobase/rem/rem0rec.c b/innobase/rem/rem0rec.c
index 9ddfe7a4b9a..88009f2f5c9 100644
--- a/innobase/rem/rem0rec.c
+++ b/innobase/rem/rem0rec.c
@@ -1,7 +1,7 @@
/************************************************************************
Record manager
-(c) 1994-1996 Innobase Oy
+(c) 1994-2001 Innobase Oy
Created 5/30/1994 Heikki Tuuri
*************************************************************************/
@@ -12,6 +12,9 @@ Created 5/30/1994 Heikki Tuuri
#include "rem0rec.ic"
#endif
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+
/* PHYSICAL RECORD
===============
@@ -21,7 +24,10 @@ found in index pages of the database, has the following format
represented on a higher text line):
| offset of the end of the last field of data, the most significant
- bit is set to 1 if and only if the field is SQL-null |
+ bit is set to 1 if and only if the field is SQL-null,
+ if the offset is 2-byte, then the second most significant
+ bit is set to 1 if the field is stored on another page:
+ mostly this will occur in the case of big BLOB fields |
...
| offset of the end of the first field of data + the SQL-null bit |
| 4 bits used to delete mark a record, and mark a predefined
@@ -122,7 +128,8 @@ rec_get_nth_field(
return(rec + os);
}
- next_os = next_os & ~REC_2BYTE_SQL_NULL_MASK;
+ next_os = next_os & ~(REC_2BYTE_SQL_NULL_MASK
+ | REC_2BYTE_EXTERN_MASK);
}
*len = next_os - os;
@@ -170,6 +177,60 @@ rec_set_nth_field_null_bit(
rec_2_set_field_end_info(rec, i, info);
}
+/***************************************************************
+Sets or clears the extern storage bit of the ith field of a 2-byte offset record. */
+
+void
+rec_set_nth_field_extern_bit(
+/*=========================*/
+ rec_t* rec, /* in: record; must not use the 1-byte offset format */
+ ulint i, /* in: ith field; must be < rec_get_n_fields(rec) */
+ ibool val, /* in: value to set: TRUE sets the bit, FALSE clears it */
+ mtr_t* mtr) /* in: mtr holding an X-latch to the page where
+ rec is, or NULL; in the NULL case we do not
+ write to log about the change */
+{
+ ulint info;
+
+ ut_a(!rec_get_1byte_offs_flag(rec)); /* the extern bit exists only in 2-byte offsets */
+ ut_a(i < rec_get_n_fields(rec));
+
+ info = rec_2_get_field_end_info(rec, i); /* current end info of field i */
+
+ if (val) {
+ info = info | REC_2BYTE_EXTERN_MASK;
+ } else {
+ info = info & ~REC_2BYTE_EXTERN_MASK;
+ }
+
+ if (mtr) { /* write via the mtr; presumably the address is field i's end info - TODO confirm */
+ mlog_write_ulint(rec - REC_N_EXTRA_BYTES - 2 * (i + 1), info,
+ MLOG_2BYTES, mtr);
+ } else { /* no mtr: update the record directly */
+ rec_2_set_field_end_info(rec, i, info);
+ }
+}
+}
+
+/***************************************************************
+Sets TRUE the extern storage bits of fields mentioned in an array. */
+
+void
+rec_set_field_extern_bits(
+/*======================*/
+ rec_t* rec, /* in: record */
+ ulint* vec, /* in: array of field numbers */
+ ulint n_fields, /* in: number of field numbers in vec */
+ mtr_t* mtr) /* in: mtr holding an X-latch to the page
+ where rec is, or NULL; in the NULL case we
+ do not write to log about the change */
+{
+ ulint i;
+
+ for (i = 0; i < n_fields; i++) { /* one call per listed field; does nothing if n_fields is 0 */
+ rec_set_nth_field_extern_bit(rec, vec[i], TRUE, mtr);
+ }
+}
+
/***************************************************************
Sets a record field to SQL null. The physical size of the field is not
changed. */
diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c
index e57622fd1c5..8542dcae326 100644
--- a/innobase/row/row0ins.c
+++ b/innobase/row/row0ins.c
@@ -234,7 +234,13 @@ row_ins_clust_index_entry_by_modify(
depending on whether mtr holds just a leaf
latch or also a tree latch */
btr_cur_t* cursor, /* in: B-tree cursor */
+ big_rec_t** big_rec,/* out: possible big rec vector of fields
+ which have to be stored externally by the
+ caller */
dtuple_t* entry, /* in: index entry to insert */
+ ulint* ext_vec,/* in: array containing field numbers of
+ externally stored fields in entry, or NULL */
+ ulint n_ext_vec,/* in: number of fields in ext_vec */
que_thr_t* thr, /* in: query thread */
mtr_t* mtr) /* in: mtr */
{
@@ -243,8 +249,10 @@ row_ins_clust_index_entry_by_modify(
upd_t* update;
ulint err;
- ut_ad((cursor->index)->type & DICT_CLUSTERED);
+ ut_ad(cursor->index->type & DICT_CLUSTERED);
+ *big_rec = NULL;
+
rec = btr_cur_get_rec(cursor);
ut_ad(rec_get_deleted_flag(rec));
@@ -254,21 +262,21 @@ row_ins_clust_index_entry_by_modify(
/* Build an update vector containing all the fields to be modified;
NOTE that this vector may contain also system columns! */
- update = row_upd_build_difference(cursor->index, entry, rec, heap);
-
+ update = row_upd_build_difference(cursor->index, entry, ext_vec,
+ n_ext_vec, rec, heap);
if (mode == BTR_MODIFY_LEAF) {
/* Try optimistic updating of the record, keeping changes
within the page */
- err = btr_cur_optimistic_update(0, cursor, update, 0, thr,
- mtr);
- if ((err == DB_OVERFLOW) || (err == DB_UNDERFLOW)) {
+ err = btr_cur_optimistic_update(0, cursor, update, 0, thr, mtr);
+
+ if (err == DB_OVERFLOW || err == DB_UNDERFLOW) {
err = DB_FAIL;
}
} else {
- ut_ad(mode == BTR_MODIFY_TREE);
- err = btr_cur_pessimistic_update(0, cursor, update, 0, thr,
- mtr);
+ ut_a(mode == BTR_MODIFY_TREE);
+ err = btr_cur_pessimistic_update(0, cursor, big_rec, update,
+ 0, thr, mtr);
}
mem_heap_free(heap);
@@ -597,14 +605,18 @@ row_ins_index_entry_low(
pessimistic descent down the index tree */
dict_index_t* index, /* in: index */
dtuple_t* entry, /* in: index entry to insert */
+ ulint* ext_vec,/* in: array containing field numbers of
+ externally stored fields in entry, or NULL */
+ ulint n_ext_vec,/* in: number of fields in ext_vec */
que_thr_t* thr) /* in: query thread */
{
btr_cur_t cursor;
ulint modify;
- rec_t* dummy_rec;
+ rec_t* insert_rec;
rec_t* rec;
ulint err;
ulint n_unique;
+ big_rec_t* big_rec = NULL;
mtr_t mtr;
log_free_check();
@@ -682,24 +694,54 @@ row_ins_index_entry_low(
if (index->type & DICT_CLUSTERED) {
err = row_ins_clust_index_entry_by_modify(mode,
- &cursor, entry,
- thr, &mtr);
+ &cursor, &big_rec,
+ entry,
+ ext_vec, n_ext_vec,
+ thr, &mtr);
} else {
err = row_ins_sec_index_entry_by_modify(&cursor,
thr, &mtr);
}
- } else if (mode == BTR_MODIFY_LEAF) {
- err = btr_cur_optimistic_insert(0, &cursor, entry,
- &dummy_rec, thr, &mtr);
} else {
- ut_ad(mode == BTR_MODIFY_TREE);
- err = btr_cur_pessimistic_insert(0, &cursor, entry,
- &dummy_rec, thr, &mtr);
+ if (mode == BTR_MODIFY_LEAF) {
+ err = btr_cur_optimistic_insert(0, &cursor, entry,
+ &insert_rec, &big_rec, thr, &mtr);
+ } else {
+ ut_a(mode == BTR_MODIFY_TREE);
+ err = btr_cur_pessimistic_insert(0, &cursor, entry,
+ &insert_rec, &big_rec, thr, &mtr);
+ }
+
+ if (err == DB_SUCCESS) {
+ if (ext_vec) {
+ rec_set_field_extern_bits(insert_rec,
+ ext_vec, n_ext_vec, &mtr);
+ }
+ }
}
+
function_exit:
mtr_commit(&mtr);
+ if (big_rec) {
+ mtr_start(&mtr);
+
+ btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+ BTR_MODIFY_TREE, &cursor, 0, &mtr);
+
+ err = btr_store_big_rec_extern_fields(index,
+ btr_cur_get_rec(&cursor),
+ big_rec, &mtr);
+ if (modify) {
+ dtuple_big_rec_free(big_rec);
+ } else {
+ dtuple_convert_back_big_rec(index, entry, big_rec);
+ }
+
+ mtr_commit(&mtr);
+ }
+
return(err);
}
@@ -716,14 +758,17 @@ row_ins_index_entry(
DB_DUPLICATE_KEY, or some other error code */
dict_index_t* index, /* in: index */
dtuple_t* entry, /* in: index entry to insert */
+ ulint* ext_vec,/* in: array containing field numbers of
+ externally stored fields in entry, or NULL */
+ ulint n_ext_vec,/* in: number of fields in ext_vec */
que_thr_t* thr) /* in: query thread */
{
ulint err;
/* Try first optimistic descent to the B-tree */
- err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry, thr);
-
+ err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry,
+ ext_vec, n_ext_vec, thr);
if (err != DB_FAIL) {
return(err);
@@ -731,8 +776,8 @@ row_ins_index_entry(
/* Try then pessimistic descent to the B-tree */
- err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry, thr);
-
+ err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry,
+ ext_vec, n_ext_vec, thr);
return(err);
}
@@ -784,7 +829,7 @@ row_ins_index_entry_step(
ut_ad(dtuple_check_typed(node->entry));
- err = row_ins_index_entry(node->index, node->entry, thr);
+ err = row_ins_index_entry(node->index, node->entry, NULL, 0, thr);
return(err);
}
diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c
index 8e1a584f667..9bbc45a5c9a 100644
--- a/innobase/row/row0mysql.c
+++ b/innobase/row/row0mysql.c
@@ -625,7 +625,8 @@ row_update_for_mysql(
ut_ad(prebuilt && trx);
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
-
+ UT_NOT_USED(mysql_rec);
+
node = prebuilt->upd_node;
clust_index = dict_table_get_first_index(table);
@@ -777,7 +778,9 @@ row_get_mysql_key_number_for_index(
}
/*************************************************************************
-Does a table creation operation for MySQL. */
+Does a table creation operation for MySQL. If the name of the created
+table ends in the characters innodb_monitor, then this also starts
+printing of monitor output by the master thread. */
int
row_create_table_for_mysql(
@@ -789,6 +792,8 @@ row_create_table_for_mysql(
tab_node_t* node;
mem_heap_t* heap;
que_thr_t* thr;
+ ulint namelen;
+ ulint keywordlen;
ulint err;
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
@@ -833,6 +838,20 @@ row_create_table_for_mysql(
}
trx->error_state = DB_SUCCESS;
+ } else {
+ namelen = ut_strlen(table->name);
+
+ keywordlen = ut_strlen("innodb_monitor");
+
+ if (namelen >= keywordlen
+ && 0 == ut_memcmp(table->name + namelen - keywordlen,
+ "innodb_monitor", keywordlen)) {
+
+ /* Table name ends to characters innodb_monitor:
+ start monitor prints */
+
+ srv_print_innodb_monitor = TRUE;
+ }
}
mutex_exit(&(dict_sys->mutex));
@@ -900,7 +919,9 @@ row_create_index_for_mysql(
}
/*************************************************************************
-Drops a table for MySQL. */
+Drops a table for MySQL. If the name of the dropped table ends in the
+characters innodb_monitor, then this also stops printing of monitor
+output by the master thread. */
int
row_drop_table_for_mysql(
@@ -918,11 +939,26 @@ row_drop_table_for_mysql(
char* str1;
char* str2;
ulint len;
+ ulint namelen;
+ ulint keywordlen;
char buf[10000];
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
ut_a(name != NULL);
+ namelen = ut_strlen(name);
+ keywordlen = ut_strlen("innodb_monitor");
+
+ if (namelen >= keywordlen
+ && 0 == ut_memcmp(name + namelen - keywordlen,
+ "innodb_monitor", keywordlen)) {
+
+ /* Table name ends to characters innodb_monitor:
+ stop monitor prints */
+
+ srv_print_innodb_monitor = FALSE;
+ }
+
/* We use the private SQL parser of Innobase to generate the
query graphs needed in deleting the dictionary data from system
tables in Innobase. Deleting a row from SYS_INDEXES table also
diff --git a/innobase/row/row0purge.c b/innobase/row/row0purge.c
index 0a6fabe584c..ec880d3fe04 100644
--- a/innobase/row/row0purge.c
+++ b/innobase/row/row0purge.c
@@ -347,20 +347,36 @@ row_purge_del_mark(
}
/***************************************************************
-Purges an update of an existing record. */
+Purges an update of an existing record. Also purges an update of a delete
+marked record if that record contained an externally stored field. */
static
void
-row_purge_upd_exist(
-/*================*/
+row_purge_upd_exist_or_extern(
+/*==========================*/
purge_node_t* node, /* in: row purge node */
que_thr_t* thr) /* in: query thread */
{
mem_heap_t* heap;
dtuple_t* entry;
dict_index_t* index;
+ upd_field_t* ufield;
+ ibool is_insert;
+ ulint rseg_id;
+ ulint page_no;
+ ulint offset;
+ ulint internal_offset;
+ byte* data_field;
+ ulint data_field_len;
+ ulint i;
+ mtr_t mtr;
ut_ad(node && thr);
+ if (node->rec_type == TRX_UNDO_UPD_DEL_REC) {
+
+ goto skip_secondaries;
+ }
+
heap = mem_heap_create(1024);
while (node->index != NULL) {
@@ -378,6 +394,53 @@ row_purge_upd_exist(
}
mem_heap_free(heap);
+
+skip_secondaries:
+ /* Free possible externally stored fields */
+ for (i = 0; i < upd_get_n_fields(node->update); i++) {
+
+ ufield = upd_get_nth_field(node->update, i);
+
+ if (ufield->extern_storage) {
+ /* We use the fact that new_val points to
+ node->undo_rec and get thus the offset of
+ dfield data inside the undo record. Then we
+ can calculate from node->roll_ptr the file
+ address of the new_val data */
+
+ internal_offset = ((byte*)ufield->new_val.data)
+ - node->undo_rec;
+
+ ut_a(internal_offset < UNIV_PAGE_SIZE);
+
+ trx_undo_decode_roll_ptr(node->roll_ptr,
+ &is_insert, &rseg_id,
+ &page_no, &offset);
+ mtr_start(&mtr);
+
+ /* We have to acquire an X-latch to the clustered
+ index tree */
+
+ index = dict_table_get_first_index(node->table);
+
+ mtr_x_lock(dict_tree_get_lock(index->tree), &mtr);
+
+ /* We assume in purge of externally stored fields
+ that the space id of the undo log record is 0! */
+
+ data_field = buf_page_get(0, page_no, RW_X_LATCH, &mtr)
+ + offset + internal_offset;
+
+ buf_page_dbg_add_level(buf_frame_align(data_field),
+ SYNC_TRX_UNDO_PAGE);
+
+ data_field_len = ufield->new_val.len;
+
+ btr_free_externally_stored_field(index, data_field,
+ data_field_len, &mtr);
+ mtr_commit(&mtr);
+ }
+ }
}
/***************************************************************
@@ -388,6 +451,9 @@ row_purge_parse_undo_rec(
/*=====================*/
/* out: TRUE if purge operation required */
purge_node_t* node, /* in: row undo node */
+ ibool* updated_extern,
+ /* out: TRUE if an externally stored field
+ was updated */
que_thr_t* thr) /* in: query thread */
{
dict_index_t* clust_index;
@@ -403,10 +469,10 @@ row_purge_parse_undo_rec(
ut_ad(node && thr);
ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
- &undo_no, &table_id);
+ updated_extern, &undo_no, &table_id);
node->rec_type = type;
- if (type == TRX_UNDO_UPD_DEL_REC) {
+ if (type == TRX_UNDO_UPD_DEL_REC && !(*updated_extern)) {
return(FALSE);
}
@@ -416,7 +482,7 @@ row_purge_parse_undo_rec(
node->table = NULL;
if (type == TRX_UNDO_UPD_EXIST_REC
- && cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+ && cmpl_info & UPD_NODE_NO_ORD_CHANGE && !(*updated_extern)) {
/* Purge requires no changes to indexes: we may return */
@@ -455,8 +521,11 @@ row_purge_parse_undo_rec(
/* Read to the partial row the fields that occur in indexes */
- ptr = trx_undo_rec_get_partial_row(ptr, clust_index, &(node->row),
- node->heap);
+ if (!(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ ptr = trx_undo_rec_get_partial_row(ptr, clust_index,
+ &(node->row), node->heap);
+ }
+
return(TRUE);
}
@@ -475,6 +544,7 @@ row_purge(
{
dulint roll_ptr;
ibool purge_needed;
+ ibool updated_extern;
ut_ad(node && thr);
@@ -494,7 +564,8 @@ row_purge(
if (node->undo_rec == &trx_purge_dummy_rec) {
purge_needed = FALSE;
} else {
- purge_needed = row_purge_parse_undo_rec(node, thr);
+ purge_needed = row_purge_parse_undo_rec(node, &updated_extern,
+ thr);
}
if (purge_needed) {
@@ -503,11 +574,13 @@ row_purge(
node->index = dict_table_get_next_index(
dict_table_get_first_index(node->table));
- if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
- row_purge_upd_exist(node, thr);
- } else {
- ut_ad(node->rec_type == TRX_UNDO_DEL_MARK_REC);
+ if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
row_purge_del_mark(node, thr);
+
+ } else if (updated_extern
+ || node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+
+ row_purge_upd_exist_or_extern(node, thr);
}
if (node->found_clust) {
diff --git a/innobase/row/row0row.c b/innobase/row/row0row.c
index f85789fa0d6..59169ef2a98 100644
--- a/innobase/row/row0row.c
+++ b/innobase/row/row0row.c
@@ -146,15 +146,17 @@ row_build_index_entry(
/***********************************************************************
An inverse function to dict_row_build_index_entry. Builds a row from a
-record in a clustered index. */
+record in a clustered index. NOTE that externally stored (often big)
+fields are copied to heap only if type is ROW_COPY_ALSO_EXTERNALS. */
dtuple_t*
row_build(
/*======*/
/* out, own: row built; see the NOTE below! */
- ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
- the former copies also the data fields to
- heap as the latter only places pointers to
+ ulint type, /* in: ROW_COPY_POINTERS, ROW_COPY_DATA, or
+ ROW_COPY_ALSO_EXTERNALS,
+ the two last copy also the data fields to
+ heap as the first only places pointers to
data fields on the index page, and thus is
more efficient */
dict_index_t* index, /* in: clustered index */
@@ -170,19 +172,19 @@ row_build(
{
dtuple_t* row;
dict_table_t* table;
- ulint n_fields;
- ulint i;
+ dict_col_t* col;
dfield_t* dfield;
+ ulint n_fields;
byte* field;
ulint len;
ulint row_len;
- dict_col_t* col;
byte* buf;
+ ulint i;
ut_ad(index && rec && heap);
ut_ad(index->type & DICT_CLUSTERED);
- if (type == ROW_COPY_DATA) {
+ if (type != ROW_COPY_POINTERS) {
/* Take a copy of rec to heap */
buf = mem_heap_alloc(heap, rec_get_size(rec));
rec = rec_copy(buf, rec);
@@ -207,6 +209,13 @@ row_build(
dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
field = rec_get_nth_field(rec, i, &len);
+ if (type == ROW_COPY_ALSO_EXTERNALS
+ && rec_get_nth_field_extern_bit(rec, i)) {
+
+ field = btr_rec_copy_externally_stored_field(rec,
+ i, &len, heap);
+ }
+
dfield_set_data(dfield, field, len);
}
@@ -215,6 +224,7 @@ row_build(
return(row);
}
+#ifdef notdefined
/***********************************************************************
An inverse function to dict_row_build_index_entry. Builds a row from a
record in a clustered index. */
@@ -229,7 +239,9 @@ row_build_to_tuple(
directly into this record, therefore,
the buffer page of this record must be
at least s-latched and the latch held
- as long as the row dtuple is used! */
+ as long as the row dtuple is used!
+ NOTE 2: does not work with externally
+ stored fields! */
{
dict_table_t* table;
ulint n_fields;
@@ -265,9 +277,11 @@ row_build_to_tuple(
ut_ad(dtuple_check_typed(row));
}
+#endif
/***********************************************************************
-Converts an index record to a typed data tuple. */
+Converts an index record to a typed data tuple. NOTE that externally
+stored (often big) fields are NOT copied to heap. */
dtuple_t*
row_rec_to_index_entry(
diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c
index 5a77fc5daaa..b74bd29a89e 100644
--- a/innobase/row/row0sel.c
+++ b/innobase/row/row0sel.c
@@ -2036,7 +2036,8 @@ row_sel_store_mysql_rec(
which was described in prebuilt's
template */
{
- mysql_row_templ_t* templ;
+ mysql_row_templ_t* templ;
+ mem_heap_t* extern_field_heap = NULL;
byte* data;
ulint len;
byte* blob_buf;
@@ -2059,6 +2060,24 @@ row_sel_store_mysql_rec(
data = rec_get_nth_field(rec, templ->rec_field_no, &len);
+ if (rec_get_nth_field_extern_bit(rec, templ->rec_field_no)) {
+ /* Copy an externally stored field to the temporary
+ heap */
+
+ if (prebuilt->trx->has_search_latch) {
+ rw_lock_s_unlock(&btr_search_latch);
+ prebuilt->trx->has_search_latch = FALSE;
+ }
+
+ extern_field_heap = mem_heap_create(UNIV_PAGE_SIZE);
+
+ data = btr_rec_copy_externally_stored_field(rec,
+ templ->rec_field_no, &len,
+ extern_field_heap);
+
+ ut_a(len != UNIV_SQL_NULL);
+ }
+
if (len != UNIV_SQL_NULL) {
if (templ->type == DATA_BLOB) {
@@ -2081,6 +2100,10 @@ row_sel_store_mysql_rec(
mysql_rec + templ->mysql_col_offset,
templ->mysql_col_len, data, len,
templ->type, templ->is_unsigned);
+ if (extern_field_heap) {
+ mem_heap_free(extern_field_heap);
+ extern_field_heap = NULL;
+ }
} else {
mysql_rec[templ->mysql_null_byte_offset] |=
(byte) (templ->mysql_null_bit_mask);
@@ -2450,6 +2473,7 @@ row_search_for_mysql(
ibool unique_search_from_clust_index = FALSE;
ibool mtr_has_extra_clust_latch = FALSE;
ibool moves_up = FALSE;
+ ulint cnt = 0;
mtr_t mtr;
ut_ad(index && pcur && search_tuple);
@@ -2457,6 +2481,11 @@ row_search_for_mysql(
ut_ad(sync_thread_levels_empty_gen(FALSE));
+/* printf("Match mode %lu\n search tuple ", match_mode);
+ dtuple_print(search_tuple);
+
+ printf("N tables locked %lu\n", trx->mysql_n_tables_locked);
+*/
if (direction == 0) {
prebuilt->n_rows_fetched = 0;
prebuilt->n_fetch_cached = 0;
@@ -2528,6 +2557,8 @@ row_search_for_mysql(
mtr_commit(&mtr);
+ /* printf("%s record not found 1\n", index->name); */
+
return(DB_RECORD_NOT_FOUND);
}
@@ -2565,17 +2596,18 @@ row_search_for_mysql(
mtr_commit(&mtr);
+ /* printf("%s shortcut\n", index->name); */
+
return(DB_SUCCESS);
} else if (shortcut == SEL_EXHAUSTED) {
mtr_commit(&mtr);
+ /* printf("%s record not found 2\n",
+ index->name); */
return(DB_RECORD_NOT_FOUND);
}
-
- /* Commit the mini-transaction since it can
- hold latches */
mtr_commit(&mtr);
mtr_start(&mtr);
@@ -2659,7 +2691,12 @@ rec_loop:
cons_read_requires_clust_rec = FALSE;
rec = btr_pcur_get_rec(pcur);
-
+/*
+ printf("Using index %s cnt %lu ", index->name, cnt);
+ printf("; Page no %lu\n",
+ buf_frame_get_page_no(buf_frame_align(rec)));
+ rec_print(rec);
+*/
if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
/* The infimum record on a page cannot be in the result set,
@@ -2700,12 +2737,15 @@ rec_loop:
/* Test if the index record matches completely to search_tuple
in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
+ /* printf("Comparing rec and search tuple\n"); */
+
if (0 != cmp_dtuple_rec(search_tuple, rec)) {
btr_pcur_store_position(pcur, &mtr);
ret = DB_RECORD_NOT_FOUND;
-
+ /* printf("%s record not found 3\n", index->name); */
+
goto normal_return;
}
@@ -2716,6 +2756,7 @@ rec_loop:
btr_pcur_store_position(pcur, &mtr);
ret = DB_RECORD_NOT_FOUND;
+ /* printf("%s record not found 4\n", index->name); */
goto normal_return;
}
@@ -2884,6 +2925,8 @@ next_rec:
moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur,
moves_up, &mtr);
if (moved) {
+ cnt++;
+
goto rec_loop;
}
}
@@ -2906,6 +2949,8 @@ next_rec:
goto normal_return;
}
+ cnt++;
+
goto rec_loop;
/*-------------------------------------------------------------*/
lock_wait_or_error:
@@ -2931,7 +2976,9 @@ lock_wait_or_error:
goto rec_loop;
}
-
+
+ /* printf("Using index %s cnt %lu ret value %lu err\n", index->name,
+ cnt, err); */
return(err);
normal_return:
@@ -2945,5 +2992,7 @@ normal_return:
ret = DB_SUCCESS;
}
+ /* printf("Using index %s cnt %lu ret value %lu\n", index->name,
+ cnt, ret); */
return(ret);
}
diff --git a/innobase/row/row0uins.c b/innobase/row/row0uins.c
index c9330318ac0..47807877779 100644
--- a/innobase/row/row0uins.c
+++ b/innobase/row/row0uins.c
@@ -242,11 +242,12 @@ row_undo_ins_parse_undo_rec(
dulint table_id;
ulint type;
ulint dummy;
+ ibool dummy_extern;
ut_ad(node && thr);
- ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy, &undo_no,
- &table_id);
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy,
+ &dummy_extern, &undo_no, &table_id);
ut_ad(type == TRX_UNDO_INSERT_REC);
node->rec_type = type;
@@ -284,9 +285,9 @@ row_undo_ins(
row_undo_ins_parse_undo_rec(node, thr);
if (node->table == NULL) {
- found = FALSE;
+ found = FALSE;
} else {
- found = row_undo_search_clust_to_pcur(node, thr);
+ found = row_undo_search_clust_to_pcur(node, thr);
}
if (!found) {
diff --git a/innobase/row/row0umod.c b/innobase/row/row0umod.c
index c8db428bade..0221c51b985 100644
--- a/innobase/row/row0umod.c
+++ b/innobase/row/row0umod.c
@@ -94,12 +94,12 @@ row_undo_mod_clust_low(
mtr_t* mtr, /* in: mtr */
ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
{
+ big_rec_t* dummy_big_rec;
dict_index_t* index;
btr_pcur_t* pcur;
btr_cur_t* btr_cur;
ulint err;
ibool success;
- ibool do_remove;
index = dict_table_get_first_index(node->table);
@@ -110,49 +110,80 @@ row_undo_mod_clust_low(
ut_ad(success);
+ if (mode == BTR_MODIFY_LEAF) {
+
+ err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, &dummy_big_rec, node->update,
+ node->cmpl_info, thr, mtr);
+ }
+
+ return(err);
+}
+
+/***************************************************************
+Removes a clustered index record after undo if possible. */
+static
+ulint
+row_undo_mod_remove_clust_low(
+/*==========================*/
+ /* out: DB_SUCCESS, DB_FAIL, or error code:
+ we may run out of file space */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr, /* in: mtr */
+ ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+ ibool success;
+
+ pcur = &(node->pcur);
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ success = btr_pcur_restore_position(mode, pcur, mtr);
+
+ if (!success) {
+
+ return(DB_SUCCESS);
+ }
+
/* Find out if we can remove the whole clustered index record */
if (node->rec_type == TRX_UNDO_UPD_DEL_REC
&& !row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) {
- do_remove = TRUE;
+ /* Ok, we can remove */
} else {
- do_remove = FALSE;
+ return(DB_SUCCESS);
}
if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, mtr);
- if (do_remove) {
- success = btr_cur_optimistic_delete(btr_cur, mtr);
-
- if (success) {
- err = DB_SUCCESS;
- } else {
- err = DB_FAIL;
- }
+ if (success) {
+ err = DB_SUCCESS;
} else {
- err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG
- | BTR_NO_UNDO_LOG_FLAG
- | BTR_KEEP_SYS_FLAG,
- btr_cur, node->update,
- node->cmpl_info, thr, mtr);
+ err = DB_FAIL;
}
} else {
ut_ad(mode == BTR_MODIFY_TREE);
- if (do_remove) {
- btr_cur_pessimistic_delete(&err, FALSE, btr_cur, mtr);
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, mtr);
- /* The delete operation may fail if we have little
- file space left: TODO: easiest to crash the database
- and restart with more file space */
- } else {
- err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG
- | BTR_NO_UNDO_LOG_FLAG
- | BTR_KEEP_SYS_FLAG,
- btr_cur, node->update,
- node->cmpl_info, thr, mtr);
- }
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
}
return(err);
@@ -204,10 +235,31 @@ row_undo_mod_clust(
err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_TREE);
}
- node->state = UNDO_NODE_FETCH_NEXT;
-
btr_pcur_commit_specify_mtr(pcur, &mtr);
+ if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_UPD_DEL_REC) {
+
+ mtr_start(&mtr);
+
+ err = row_undo_mod_remove_clust_low(node, thr, &mtr,
+ BTR_MODIFY_LEAF);
+ if (err != DB_SUCCESS) {
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ /* We may have to modify tree structure: do a
+ pessimistic descent down the index tree */
+
+ mtr_start(&mtr);
+
+ err = row_undo_mod_remove_clust_low(node, thr, &mtr,
+ BTR_MODIFY_TREE);
+ }
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+ }
+
+ node->state = UNDO_NODE_FETCH_NEXT;
+
trx_undo_rec_release(node->trx, node->undo_no);
if (more_vers && err == DB_SUCCESS) {
@@ -388,7 +440,6 @@ row_undo_mod_del_unmark_sec(
mem_free(err_buf);
} else {
-
btr_cur = btr_pcur_get_btr_cur(&pcur);
err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
@@ -546,11 +597,12 @@ row_undo_mod_parse_undo_rec(
ulint info_bits;
ulint type;
ulint cmpl_info;
+ ibool dummy_extern;
ut_ad(node && thr);
ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
- &undo_no, &table_id);
+ &dummy_extern, &undo_no, &table_id);
node->rec_type = type;
node->table = dict_table_get_on_id(table_id, thr_get_trx(thr));
@@ -598,10 +650,9 @@ row_undo_mod(
row_undo_mod_parse_undo_rec(node, thr);
if (node->table == NULL) {
- found = FALSE;
+ found = FALSE;
} else {
-
- found = row_undo_search_clust_to_pcur(node, thr);
+ found = row_undo_search_clust_to_pcur(node, thr);
}
if (!found) {
diff --git a/innobase/row/row0undo.c b/innobase/row/row0undo.c
index 10ac3af6de9..5119254f405 100644
--- a/innobase/row/row0undo.c
+++ b/innobase/row/row0undo.c
@@ -124,6 +124,8 @@ row_undo_node_create(
undo->state = UNDO_NODE_FETCH_NEXT;
undo->trx = trx;
+ btr_pcur_init(&(undo->pcur));
+
undo->heap = mem_heap_create(256);
return(undo);
@@ -303,6 +305,16 @@ row_undo_step(
if (err != DB_SUCCESS) {
/* SQL error detected */
+ fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n", err);
+
+ if (err == DB_OUT_OF_FILE_SPACE) {
+ fprintf(stderr,
+ "InnoDB: Error 13 means out of tablespace.\n"
+ "InnoDB: Consider increasing your tablespace.\n");
+
+ exit(1);
+ }
+
ut_a(0);
return(NULL);
diff --git a/innobase/row/row0upd.c b/innobase/row/row0upd.c
index 5bca2a24c01..d339474df61 100644
--- a/innobase/row/row0upd.c
+++ b/innobase/row/row0upd.c
@@ -90,8 +90,10 @@ upd_node_create(
node->in_mysql_interface = FALSE;
node->row = NULL;
+ node->ext_vec = NULL;
node->index = NULL;
-
+ node->update = NULL;
+
node->select = NULL;
node->heap = mem_heap_create(128);
@@ -160,7 +162,8 @@ row_upd_index_entry_sys_field(
}
/***************************************************************
-Returns TRUE if row update changes size of some field in index. */
+Returns TRUE if row update changes size of some field in index
+or if some field to be updated is stored externally in rec or update. */
ibool
row_upd_changes_field_size(
@@ -199,6 +202,16 @@ row_upd_changes_field_size(
return(TRUE);
}
+
+ if (rec_get_nth_field_extern_bit(rec, upd_field->field_no)) {
+
+ return(TRUE);
+ }
+
+ if (upd_field->extern_storage) {
+
+ return(TRUE);
+ }
}
return(FALSE);
@@ -441,6 +454,34 @@ row_upd_index_parse(
return(ptr);
}
+
+/*******************************************************************
+Returns TRUE if ext_vec contains i. */
+UNIV_INLINE
+ibool
+upd_ext_vec_contains(
+/*=================*/
+ /* out: TRUE if i is in ext_vec */
+ ulint* ext_vec, /* in: array of indexes or NULL */
+ ulint n_ext_vec, /* in: number of numbers in ext_vec */
+ ulint i) /* in: a number */
+{
+ ulint j;
+
+ if (ext_vec == NULL) {
+
+ return(FALSE);
+ }
+
+ for (j = 0; j < n_ext_vec; j++) {
+ if (ext_vec[j] == i) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
/*******************************************************************
Builds an update vector from those fields, excluding the roll ptr and
@@ -454,6 +495,9 @@ row_upd_build_difference(
fields, excluding roll ptr and trx id */
dict_index_t* index, /* in: clustered index */
dtuple_t* entry, /* in: entry to insert */
+ ulint* ext_vec,/* in: array containing field numbers of
+ externally stored fields in entry, or NULL */
+ ulint n_ext_vec,/* in: number of fields in ext_vec */
rec_t* rec, /* in: clustered index record */
mem_heap_t* heap) /* in: memory heap from which allocated */
{
@@ -480,16 +524,25 @@ row_upd_build_difference(
for (i = 0; i < dtuple_get_n_fields(entry); i++) {
data = rec_get_nth_field(rec, i, &len);
+
dfield = dtuple_get_nth_field(entry, i);
- if ((i != trx_id_pos) && (i != roll_ptr_pos)
- && !dfield_data_is_equal(dfield, len, data)) {
+ if ((rec_get_nth_field_extern_bit(rec, i)
+ != upd_ext_vec_contains(ext_vec, n_ext_vec, i))
+ || ((i != trx_id_pos) && (i != roll_ptr_pos)
+ && !dfield_data_is_equal(dfield, len, data))) {
upd_field = upd_get_nth_field(update, n_diff);
dfield_copy(&(upd_field->new_val), dfield);
upd_field_set_field_no(upd_field, i, index);
+
+ if (upd_ext_vec_contains(ext_vec, n_ext_vec, i)) {
+ upd_field->extern_storage = TRUE;
+ } else {
+ upd_field->extern_storage = FALSE;
+ }
n_diff++;
}
@@ -630,9 +683,7 @@ row_upd_changes_ord_field(
}
/***************************************************************
-Checks if an update vector changes an ordering field of an index record.
-This function is fast if the update vector is short or the number of ordering
-fields in the index is small. Otherwise, this can be quadratic. */
+Checks if an update vector changes an ordering field of an index record. */
ibool
row_upd_changes_some_index_ord_field(
@@ -642,19 +693,24 @@ row_upd_changes_some_index_ord_field(
dict_table_t* table, /* in: table */
upd_t* update) /* in: update vector for the row */
{
+ upd_field_t* upd_field;
dict_index_t* index;
-
+ ulint i;
+
index = dict_table_get_first_index(table);
- while (index) {
- if (row_upd_changes_ord_field(NULL, index, update)) {
+ for (i = 0; i < upd_get_n_fields(update); i++) {
- return(TRUE);
- }
+ upd_field = upd_get_nth_field(update, i);
- index = dict_table_get_next_index(index);
- }
+ if (dict_field_get_col(dict_index_get_nth_field(index,
+ upd_field->field_no))
+ ->ord_part) {
+ return(TRUE);
+ }
+ }
+
return(FALSE);
}
@@ -710,15 +766,17 @@ row_upd_eval_new_vals(
/***************************************************************
Stores to the heap the row on which the node->pcur is positioned. */
-UNIV_INLINE
+static
void
row_upd_store_row(
/*==============*/
upd_node_t* node) /* in: row update node */
{
dict_index_t* clust_index;
+ upd_t* update;
+ rec_t* rec;
- ut_ad((node->pcur)->latch_mode != BTR_NO_LATCHES);
+ ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES);
if (node->row != NULL) {
mem_heap_empty(node->heap);
@@ -727,8 +785,20 @@ row_upd_store_row(
clust_index = dict_table_get_first_index(node->table);
- node->row = row_build(ROW_COPY_DATA, clust_index,
- btr_pcur_get_rec(node->pcur), node->heap);
+ rec = btr_pcur_get_rec(node->pcur);
+
+ node->row = row_build(ROW_COPY_DATA, clust_index, rec, node->heap);
+
+ node->ext_vec = mem_heap_alloc(node->heap,
+ sizeof(ulint) * rec_get_n_fields(rec));
+ if (node->is_delete) {
+ update = NULL;
+ } else {
+ update = node->update;
+ }
+
+ node->n_ext_vec = btr_push_update_extern_fields(node->ext_vec,
+ rec, update);
}
/***************************************************************
@@ -812,7 +882,7 @@ row_upd_sec_index_entry(
row_upd_index_replace_new_col_vals(entry, index, node->update);
/* Insert new index entry */
- err = row_ins_index_entry(index, entry, thr);
+ err = row_ins_index_entry(index, entry, NULL, 0, thr);
mem_heap_free(heap);
@@ -870,6 +940,8 @@ row_upd_clust_rec_by_insert(
dict_table_t* table;
mem_heap_t* heap;
dtuple_t* entry;
+ ulint* ext_vec;
+ ulint n_ext_vec;
ulint err;
ut_ad(node);
@@ -897,14 +969,18 @@ row_upd_clust_rec_by_insert(
heap = mem_heap_create(1024);
+ ext_vec = mem_heap_alloc(heap,
+ sizeof(ulint) * dtuple_get_n_fields(node->row));
+ n_ext_vec = 0;
+
entry = row_build_index_entry(node->row, index, heap);
row_upd_clust_index_replace_new_col_vals(entry, node->update);
-
+
row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id);
- err = row_ins_index_entry(index, entry, thr);
-
+ err = row_ins_index_entry(index, entry, node->ext_vec,
+ node->n_ext_vec, thr);
mem_heap_free(heap);
return(err);
@@ -924,6 +1000,7 @@ row_upd_clust_rec(
que_thr_t* thr, /* in: query thread */
mtr_t* mtr) /* in: mtr; gets committed here */
{
+ big_rec_t* big_rec = NULL;
btr_pcur_t* pcur;
btr_cur_t* btr_cur;
ulint err;
@@ -973,9 +1050,24 @@ row_upd_clust_rec(
ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur)));
err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur,
- node->update, node->cmpl_info, thr, mtr);
+ &big_rec, node->update,
+ node->cmpl_info, thr, mtr);
mtr_commit(mtr);
+ if (err == DB_SUCCESS && big_rec) {
+ mtr_start(mtr);
+ ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
+
+ err = btr_store_big_rec_extern_fields(index,
+ btr_cur_get_rec(btr_cur),
+ big_rec, mtr);
+ mtr_commit(mtr);
+ }
+
+ if (big_rec) {
+ dtuple_big_rec_free(big_rec);
+ }
+
return(err);
}
@@ -1194,10 +1286,12 @@ row_upd(
ut_ad(node && thr);
if (node->in_mysql_interface) {
+
/* We do not get the cmpl_info value from the MySQL
interpreter: we must calculate it on the fly: */
- if (row_upd_changes_some_index_ord_field(node->table,
+ if (node->is_delete ||
+ row_upd_changes_some_index_ord_field(node->table,
node->update)) {
node->cmpl_info = 0;
} else {
@@ -1239,6 +1333,7 @@ function_exit:
if (node->row != NULL) {
mem_heap_empty(node->heap);
node->row = NULL;
+ node->n_ext_vec = 0;
}
node->state = UPD_NODE_UPDATE_CLUSTERED;
diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c
index 028fae010d5..8dd9c9f3feb 100644
--- a/innobase/srv/srv0srv.c
+++ b/innobase/srv/srv0srv.c
@@ -93,6 +93,8 @@ ulint srv_lock_wait_timeout = 1024 * 1024 * 1024;
char* srv_unix_file_flush_method_str = NULL;
ulint srv_unix_file_flush_method = 0;
+ibool srv_use_doublewrite_buf = TRUE;
+
ibool srv_set_thread_priorities = TRUE;
int srv_query_thread_priority = 0;
/*-------------------------------------------*/
@@ -109,6 +111,8 @@ ibool srv_print_buf_io = FALSE;
ibool srv_print_log_io = FALSE;
ibool srv_print_latch_waits = FALSE;
+ibool srv_print_innodb_monitor = FALSE;
+
/* The parameters below are obsolete: */
ibool srv_print_parsed_sql = FALSE;
@@ -1492,7 +1496,6 @@ srv_init(void)
slot = srv_mysql_table + i;
slot->in_use = FALSE;
slot->event = os_event_create(NULL);
- slot->suspended = FALSE;
ut_a(slot->event);
}
@@ -1661,7 +1664,6 @@ srv_suspend_mysql_thread(
slot->thr = thr;
os_event_reset(event);
- slot->suspended = TRUE;
slot->suspend_time = ut_time();
@@ -1693,27 +1695,6 @@ srv_suspend_mysql_thread(
return(FALSE);
}
-os_event_t
-srv_mysql_thread_event_get(void)
-{
- srv_slot_t* slot;
- os_event_t event;
-
- mutex_enter(&kernel_mutex);
-
- slot = srv_table_reserve_slot_for_mysql();
-
- event = slot->event;
-
- os_event_reset(event);
-
- slot->suspended = TRUE;
-
- mutex_exit(&kernel_mutex);
-
- return(event);
-}
-
/************************************************************************
Releases a MySQL OS thread waiting for a lock to be released, if the
thread is already suspended. */
@@ -1737,7 +1718,6 @@ srv_release_mysql_thread_if_suspended(
/* Found */
os_event_set(slot->event);
- slot->suspended = FALSE;
return;
}
@@ -1746,59 +1726,6 @@ srv_release_mysql_thread_if_suspended(
/* not found */
}
-void
-srv_mysql_thread_release(void)
-/*==========================*/
-{
- srv_slot_t* slot;
- ulint i;
-
- mutex_enter(&kernel_mutex);
-
- for (i = 0; i < OS_THREAD_MAX_N; i++) {
-
- slot = srv_mysql_table + i;
-
- if (slot->in_use && slot->suspended) {
- /* Found */
- slot->suspended = FALSE;
- mutex_exit(&kernel_mutex);
-
- os_event_set(slot->event);
-
- return;
- }
- }
-
- ut_a(0);
-}
-
-void
-srv_mysql_thread_slot_free(
-/*==========================*/
- os_event_t event)
-{
- srv_slot_t* slot;
- ulint i;
-
- mutex_enter(&kernel_mutex);
-
- for (i = 0; i < OS_THREAD_MAX_N; i++) {
-
- slot = srv_mysql_table + i;
-
- if (slot->in_use && slot->event == event) {
- /* Found */
- slot->in_use = FALSE;
- mutex_exit(&kernel_mutex);
-
- return;
- }
- }
-
- ut_a(0);
-}
-
/*************************************************************************
A thread which wakes up threads whose lock wait may have lasted too long. */
@@ -1924,6 +1851,7 @@ srv_master_thread(
ulint i;
time_t last_flush_time;
time_t current_time;
+ time_t last_monitor_time;
UT_NOT_USED(arg);
@@ -1936,6 +1864,8 @@ srv_master_thread(
mutex_exit(&kernel_mutex);
os_event_set(srv_sys->operational);
+
+ last_monitor_time = time(NULL);
loop:
mutex_enter(&kernel_mutex);
@@ -1975,8 +1905,18 @@ loop:
while (n_pages_purged) {
/* TODO: replace this by a check if we are running
out of file space! */
+ if (srv_print_innodb_monitor) {
+ ut_print_timestamp(stdout);
+ printf(" InnoDB starts purge\n");
+ }
+
n_pages_purged = trx_purge();
+ if (srv_print_innodb_monitor) {
+ ut_print_timestamp(stdout);
+ printf(" InnoDB purged %lu pages\n", n_pages_purged);
+ }
+
current_time = time(NULL);
if (difftime(current_time, last_flush_time) > 1) {
@@ -1986,14 +1926,40 @@ loop:
}
background_loop:
- /*
- sync_array_print_info(sync_primary_wait_array);
- os_aio_print();
- buf_print_io();
- */
/* In this loop we run background operations while the server
is quiet */
+ current_time = time(NULL);
+
+ if (srv_print_innodb_monitor
+ && difftime(current_time, last_monitor_time) > 8) {
+
+ printf("================================\n");
+ last_monitor_time = time(NULL);
+ ut_print_timestamp(stdout);
+
+ printf(" INNODB MONITOR OUTPUT\n"
+ "================================\n");
+ printf("--------------------------\n"
+ "LOCKS HELD BY TRANSACTIONS\n"
+ "--------------------------\n");
+ lock_print_info();
+ printf("-----------------------------------------------\n"
+ "CURRENT SEMAPHORES RESERVED AND SEMAPHORE WAITS\n"
+ "-----------------------------------------------\n");
+ sync_print();
+ printf("CURRENT PENDING FILE I/O'S\n"
+ "--------------------------\n");
+ os_aio_print();
+ printf("-----------\n"
+ "BUFFER POOL\n"
+ "-----------\n");
+ buf_print_io();
+ printf("----------------------------\n"
+ "END OF INNODB MONITOR OUTPUT\n"
+ "============================\n");
+ }
+
mutex_enter(&kernel_mutex);
if (srv_activity_count != old_activity_count) {
mutex_exit(&kernel_mutex);
@@ -2005,8 +1971,18 @@ background_loop:
/* The server has been quiet for a while: start running background
operations */
+ if (srv_print_innodb_monitor) {
+ ut_print_timestamp(stdout);
+ printf(" InnoDB starts purge\n");
+ }
+
n_pages_purged = trx_purge();
+ if (srv_print_innodb_monitor) {
+ ut_print_timestamp(stdout);
+ printf(" InnoDB purged %lu pages\n", n_pages_purged);
+ }
+
mutex_enter(&kernel_mutex);
if (srv_activity_count != old_activity_count) {
mutex_exit(&kernel_mutex);
@@ -2014,8 +1990,18 @@ background_loop:
}
mutex_exit(&kernel_mutex);
+ if (srv_print_innodb_monitor) {
+ ut_print_timestamp(stdout);
+ printf(" InnoDB starts insert buffer merge\n");
+ }
+
n_bytes_merged = ibuf_contract(TRUE);
+ if (srv_print_innodb_monitor) {
+ ut_print_timestamp(stdout);
+ printf(" InnoDB merged %lu bytes\n", n_bytes_merged);
+ }
+
mutex_enter(&kernel_mutex);
if (srv_activity_count != old_activity_count) {
mutex_exit(&kernel_mutex);
@@ -2023,7 +2009,7 @@ background_loop:
}
mutex_exit(&kernel_mutex);
- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 20, ut_dulint_max);
+ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
mutex_enter(&kernel_mutex);
if (srv_activity_count != old_activity_count) {
@@ -2052,14 +2038,12 @@ background_loop:
/* mem_print_new_info();
*/
-
-/* fsp_print(0); */
-
-/* fprintf(stderr, "Validating tablespace\n");
+/*
+ fsp_print(0);
+ fprintf(stderr, "Validating tablespace\n");
fsp_validate(0);
fprintf(stderr, "Validation ok\n");
*/
-
#ifdef UNIV_SEARCH_PERF_STAT
/* btr_search_print_info(); */
#endif
diff --git a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c
index a343f2115e7..a79a808ba2e 100644
--- a/innobase/srv/srv0start.c
+++ b/innobase/srv/srv0start.c
@@ -1,7 +1,7 @@
/************************************************************************
Starts the InnoDB database server
-(c) 1996-2000 InnoDB Oy
+(c) 1996-2000 Innobase Oy
Created 2/16/1996 Heikki Tuuri
*************************************************************************/
@@ -203,8 +203,8 @@ open_or_create_log_file(
sprintf(name, "%s%s%lu", srv_log_group_home_dirs[k], "ib_logfile", i);
- files[i] = os_file_create(name, OS_FILE_CREATE, OS_FILE_NORMAL, &ret);
-
+ files[i] = os_file_create(name, OS_FILE_CREATE, OS_FILE_NORMAL,
+ OS_LOG_FILE, &ret);
if (ret == FALSE) {
if (os_file_get_last_error() != OS_FILE_ALREADY_EXISTS) {
fprintf(stderr,
@@ -214,7 +214,8 @@ open_or_create_log_file(
}
files[i] = os_file_create(
- name, OS_FILE_OPEN, OS_FILE_AIO, &ret);
+ name, OS_FILE_OPEN, OS_FILE_AIO,
+ OS_LOG_FILE, &ret);
if (!ret) {
fprintf(stderr,
"InnoDB: Error in opening %s\n", name);
@@ -239,7 +240,7 @@ open_or_create_log_file(
fprintf(stderr,
"InnoDB: Log file %s did not exist: new to be created\n",
name);
- printf("InnoDB: Setting log file %s size to %lu\n",
+ fprintf(stderr, "InnoDB: Setting log file %s size to %lu\n",
name, UNIV_PAGE_SIZE * srv_log_file_size);
ret = os_file_set_size(name, files[i],
@@ -330,27 +331,28 @@ open_or_create_data_files(
sprintf(name, "%s%s", srv_data_home, srv_data_file_names[i]);
- if (srv_data_file_is_raw_partition[i] == 0) {
-
- files[i] = os_file_create(name, OS_FILE_CREATE,
- OS_FILE_NORMAL, &ret);
- } else if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
- ret = FALSE;
- } else if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) {
+ files[i] = os_file_create(name, OS_FILE_CREATE,
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret);
- files[i] = os_file_create(
- name, OS_FILE_OPEN, OS_FILE_NORMAL, &ret);
+ if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) {
+ /* The partition is opened, not created; then it is
+ written over */
- if (!ret) {
+ files[i] = os_file_create(
+ name, OS_FILE_OPEN, OS_FILE_NORMAL,
+ OS_DATA_FILE, &ret);
+ if (!ret) {
fprintf(stderr,
"InnoDB: Error in opening %s\n", name);
return(DB_ERROR);
- }
+ }
+ } else if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
+ ret = FALSE;
}
if (ret == FALSE) {
- if (srv_data_file_is_raw_partition[i] == 0
+ if (srv_data_file_is_raw_partition[i] != SRV_OLD_RAW
&& os_file_get_last_error() !=
OS_FILE_ALREADY_EXISTS) {
fprintf(stderr,
@@ -370,8 +372,8 @@ open_or_create_data_files(
}
files[i] = os_file_create(
- name, OS_FILE_OPEN, OS_FILE_NORMAL, &ret);
-
+ name, OS_FILE_OPEN, OS_FILE_NORMAL,
+ OS_DATA_FILE, &ret);
if (!ret) {
fprintf(stderr,
"InnoDB: Error in opening %s\n", name);
@@ -379,18 +381,21 @@ open_or_create_data_files(
return(DB_ERROR);
}
- ret = os_file_get_size(files[i], &size, &size_high);
- ut_a(ret);
+ if (srv_data_file_is_raw_partition[i] != SRV_OLD_RAW) {
+
+ ret = os_file_get_size(files[i], &size,
+ &size_high);
+ ut_a(ret);
- if (srv_data_file_is_raw_partition[i] == 0
- && (size != UNIV_PAGE_SIZE * srv_data_file_sizes[i]
- || size_high != 0)) {
-
- fprintf(stderr,
+ if (size !=
+ UNIV_PAGE_SIZE * srv_data_file_sizes[i]
+ || size_high != 0) {
+ fprintf(stderr,
"InnoDB: Error: data file %s is of different size\n"
"InnoDB: than specified in the .cnf file!\n", name);
- return(DB_ERROR);
+ return(DB_ERROR);
+ }
}
fil_read_flushed_lsn_and_arch_log_no(files[i],
@@ -403,7 +408,8 @@ open_or_create_data_files(
if (i > 0) {
fprintf(stderr,
- "InnoDB: Data file %s did not exist: new to be created\n", name);
+ "InnoDB: Data file %s did not exist: new to be created\n",
+ name);
} else {
fprintf(stderr,
"InnoDB: The first specified data file %s did not exist:\n"
@@ -411,10 +417,10 @@ open_or_create_data_files(
*create_new_db = TRUE;
}
- printf("InnoDB: Setting file %s size to %lu\n",
+ fprintf(stderr, "InnoDB: Setting file %s size to %lu\n",
name, UNIV_PAGE_SIZE * srv_data_file_sizes[i]);
- printf(
+ fprintf(stderr,
"InnoDB: Database physically writes the file full: wait...\n");
ret = os_file_set_size(name, files[i],
@@ -555,19 +561,22 @@ innobase_start_or_create_for_mysql(void)
srv_startup_is_before_trx_rollback_phase = TRUE;
if (0 == ut_strcmp(srv_unix_file_flush_method_str, "fdatasync")) {
- srv_unix_file_flush_method = SRV_UNIX_FDATASYNC;
+ srv_unix_file_flush_method = SRV_UNIX_FDATASYNC;
+
} else if (0 == ut_strcmp(srv_unix_file_flush_method_str, "O_DSYNC")) {
- srv_unix_file_flush_method = SRV_UNIX_O_DSYNC;
+ srv_unix_file_flush_method = SRV_UNIX_O_DSYNC;
+
} else if (0 == ut_strcmp(srv_unix_file_flush_method_str,
"littlesync")) {
- srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC;
+ srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC;
+
} else if (0 == ut_strcmp(srv_unix_file_flush_method_str, "nosync")) {
- srv_unix_file_flush_method = SRV_UNIX_NOSYNC;
+ srv_unix_file_flush_method = SRV_UNIX_NOSYNC;
} else {
- fprintf(stderr,
- "InnoDB: Unrecognized value for innodb_unix_file_flush_method\n");
-
- return(DB_ERROR);
+ fprintf(stderr,
+ "InnoDB: Unrecognized value %s for innodb_flush_method\n",
+ srv_unix_file_flush_method_str);
+ return(DB_ERROR);
}
/*
@@ -593,14 +602,15 @@ innobase_start_or_create_for_mysql(void)
#ifdef __WIN__
if (os_get_os_version() == OS_WIN95
|| os_get_os_version() == OS_WIN31) {
- /* On Win 95, 98, ME, and Win32 subsystem for Windows 3.1 use
- simulated aio */
- os_aio_use_native_aio = FALSE;
- srv_n_file_io_threads = 4;
+ /* On Win 95, 98, ME, and Win32 subsystem for Windows 3.1 use
+ simulated aio */
+
+ os_aio_use_native_aio = FALSE;
+ srv_n_file_io_threads = 4;
} else {
- /* On NT and Win 2000 always use aio */
- os_aio_use_native_aio = TRUE;
+ /* On NT and Win 2000 always use aio */
+ os_aio_use_native_aio = TRUE;
}
#endif
if (!os_aio_use_native_aio) {
@@ -652,14 +662,21 @@ innobase_start_or_create_for_mysql(void)
sum_of_new_sizes = 0;
for (i = 0; i < srv_n_data_files; i++) {
- sum_of_new_sizes += srv_data_file_sizes[i];
+ if (srv_data_file_sizes[i] >= 262144) {
+ fprintf(stderr,
+ "InnoDB: Error: file size must be < 4 GB, or on some OS's < 2 GB\n");
+
+ return(DB_ERROR);
+ }
+
+ sum_of_new_sizes += srv_data_file_sizes[i];
}
if (sum_of_new_sizes < 640) {
- fprintf(stderr,
+ fprintf(stderr,
"InnoDB: Error: tablespace size must be at least 10 MB\n");
- return(DB_ERROR);
+ return(DB_ERROR);
}
err = open_or_create_data_files(&create_new_db,
@@ -673,6 +690,15 @@ innobase_start_or_create_for_mysql(void)
return((int) err);
}
+ if (!create_new_db) {
+ /* If we are using the doublewrite method, we will
+ check if there are half-written pages in data files,
+ and restore them from the doublewrite buffer if
+ possible */
+
+ trx_sys_doublewrite_restore_corrupt_pages();
+ }
+
srv_normalize_path_for_win(srv_arch_dir);
srv_arch_dir = srv_add_path_separator_if_needed(srv_arch_dir);
@@ -742,7 +768,6 @@ innobase_start_or_create_for_mysql(void)
mutex_exit(&(log_sys->mutex));
}
- /* mutex_create(&row_mysql_thread_mutex); */
sess_sys_init_at_db_start();
if (create_new_db) {
@@ -834,7 +859,7 @@ innobase_start_or_create_for_mysql(void)
}
if (srv_measure_contention) {
- /* os_thread_create(&test_measure_cont, NULL, thread_ids +
+ /* os_thread_create(&test_measure_cont, NULL, thread_ids +
SRV_MAX_N_IO_THREADS); */
}
@@ -849,16 +874,20 @@ innobase_start_or_create_for_mysql(void)
/* Create the thread which watches the timeouts for lock waits */
os_thread_create(&srv_lock_timeout_monitor_thread, NULL,
thread_ids + 2 + SRV_MAX_N_IO_THREADS);
- ut_print_timestamp(stderr);
- fprintf(stderr, " InnoDB: Started\n");
-
srv_was_started = TRUE;
srv_is_being_started = FALSE;
sync_order_checks_on = TRUE;
+ if (srv_use_doublewrite_buf && trx_doublewrite == NULL) {
+ trx_sys_create_doublewrite_buf();
+ }
+
/* buf_debug_prints = TRUE; */
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Started\n");
+
return((int) DB_SUCCESS);
}
diff --git a/innobase/sync/sync0rw.c b/innobase/sync/sync0rw.c
index a77cc76ed37..dc49ce2197e 100644
--- a/innobase/sync/sync0rw.c
+++ b/innobase/sync/sync0rw.c
@@ -810,11 +810,10 @@ rw_lock_print(
ulint count = 0;
rw_lock_debug_t* info;
- printf("----------------------------------------------\n");
+ printf("-------------------------------------------------\n");
printf("RW-LOCK INFO\n");
printf("RW-LOCK: %lx ", (ulint)lock);
- mutex_enter(&(lock->mutex));
if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED)
|| (rw_lock_get_reader_count(lock) != 0)
|| (rw_lock_get_waiters(lock) != 0)) {
@@ -831,8 +830,6 @@ rw_lock_print(
info = UT_LIST_GET_NEXT(list, info);
}
}
-
- mutex_exit(&(lock->mutex));
#endif
}
diff --git a/innobase/sync/sync0sync.c b/innobase/sync/sync0sync.c
index c3a1ac3b47f..a125f65be41 100644
--- a/innobase/sync/sync0sync.c
+++ b/innobase/sync/sync0sync.c
@@ -158,7 +158,7 @@ struct sync_thread_struct{
};
/* Number of slots reserved for each OS thread in the sync level array */
-#define SYNC_THREAD_N_LEVELS 256
+#define SYNC_THREAD_N_LEVELS 10000
struct sync_level_struct{
void* latch; /* pointer to a mutex or an rw-lock; NULL means that
@@ -768,6 +768,9 @@ sync_thread_levels_g(
thread */
ulint limit) /* in: level limit */
{
+ char* file_name;
+ ulint line;
+ ulint thread_id;
sync_level_t* slot;
rw_lock_t* lock;
mutex_t* mutex;
@@ -783,8 +786,29 @@ sync_thread_levels_g(
lock = slot->latch;
mutex = slot->latch;
- ut_error;
-
+ printf(
+ "InnoDB error: sync levels should be > %lu but a level is %lu\n",
+ limit, slot->level);
+
+ if (mutex->magic_n == MUTEX_MAGIC_N) {
+ printf("Mutex created at %s %lu\n", &(mutex->cfile_name),
+ mutex->cline);
+
+ if (mutex_get_lock_word(mutex) != 0) {
+
+ mutex_get_debug_info(mutex,
+ &file_name, &line, &thread_id);
+
+ printf("InnoDB: Locked mutex: addr %lx thread %ld file %s line %ld\n",
+ (ulint)mutex, thread_id,
+ file_name, line);
+ } else {
+ printf("Not locked\n");
+ }
+ } else {
+ rw_lock_print(lock);
+ }
+
return(FALSE);
}
}
@@ -973,6 +997,8 @@ sync_thread_add_level(
ut_a(sync_thread_levels_g(array, SYNC_ANY_LATCH));
} else if (level == SYNC_TRX_SYS_HEADER) {
ut_a(sync_thread_levels_contain(array, SYNC_KERNEL));
+ } else if (level == SYNC_DOUBLEWRITE) {
+ ut_a(sync_thread_levels_g(array, SYNC_DOUBLEWRITE));
} else if (level == SYNC_BUF_BLOCK) {
ut_a((sync_thread_levels_contain(array, SYNC_BUF_POOL)
&& sync_thread_levels_g(array, SYNC_BUF_BLOCK - 1))
@@ -1000,6 +1026,8 @@ sync_thread_add_level(
} else if (level == SYNC_FSP) {
ut_a(sync_thread_levels_contain(array, SYNC_FSP)
|| sync_thread_levels_g(array, SYNC_FSP));
+ } else if (level == SYNC_EXTERN_STORAGE) {
+ ut_a(TRUE);
} else if (level == SYNC_TRX_UNDO_PAGE) {
ut_a(sync_thread_levels_contain(array, SYNC_TRX_UNDO)
|| sync_thread_levels_contain(array, SYNC_RSEG)
@@ -1221,10 +1249,10 @@ void
sync_print(void)
/*============*/
{
- printf("SYNC INFO:------------------------------------------\n");
+ printf("SYNC INFO:\n");
mutex_list_print_info();
rw_lock_list_print_info();
sync_array_print_info(sync_primary_wait_array);
sync_print_wait_info();
- printf("----------------------------------------------------\n");
+ printf("-----------------------------------------------------\n");
}
diff --git a/innobase/trx/trx0purge.c b/innobase/trx/trx0purge.c
index f65943f27e3..032b3ffcf3b 100644
--- a/innobase/trx/trx0purge.c
+++ b/innobase/trx/trx0purge.c
@@ -692,6 +692,9 @@ trx_purge_choose_next_log(void)
min_rseg = rseg;
min_trx_no = rseg->last_trx_no;
space = rseg->space;
+ ut_a(space == 0); /* We assume in purge of
+ externally stored fields
+ that space id == 0 */
page_no = rseg->last_page_no;
offset = rseg->last_offset;
}
@@ -820,6 +823,10 @@ trx_purge_get_next_rec(
}
cmpl_info = trx_undo_rec_get_cmpl_info(rec2);
+
+ if (trx_undo_rec_get_extern_storage(rec2)) {
+ break;
+ }
if ((type == TRX_UNDO_UPD_EXIST_REC)
&& !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
diff --git a/innobase/trx/trx0rec.c b/innobase/trx/trx0rec.c
index c31d786011d..64febb8f523 100644
--- a/innobase/trx/trx0rec.c
+++ b/innobase/trx/trx0rec.c
@@ -292,6 +292,8 @@ trx_undo_rec_get_pars(
TRX_UNDO_INSERT_REC, ... */
ulint* cmpl_info, /* out: compiler info, relevant only
for update type records */
+ ibool* updated_extern, /* out: TRUE if we updated an
+ externally stored field */
dulint* undo_no, /* out: undo log record number */
dulint* table_id) /* out: table id */
{
@@ -303,7 +305,14 @@ trx_undo_rec_get_pars(
type_cmpl = mach_read_from_1(ptr);
ptr++;
-
+
+ if (type_cmpl & TRX_UNDO_UPD_EXTERN) {
+ *updated_extern = TRUE;
+ type_cmpl -= TRX_UNDO_UPD_EXTERN;
+ } else {
+ *updated_extern = FALSE;
+ }
+
*type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1);
*cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT;
@@ -336,7 +345,11 @@ trx_undo_rec_get_col_val(
*field = ptr;
if (*len != UNIV_SQL_NULL) {
- ptr += *len;
+ if (*len >= UNIV_EXTERN_STORAGE_FIELD) {
+ ptr += (*len - UNIV_EXTERN_STORAGE_FIELD);
+ } else {
+ ptr += *len;
+ }
}
return(ptr);
@@ -452,6 +465,7 @@ trx_undo_page_report_modify(
ulint col_no;
byte* old_ptr;
ulint type_cmpl;
+ byte* type_cmpl_ptr;
ulint i;
ut_ad(index->type & DICT_CLUSTERED);
@@ -491,6 +505,8 @@ trx_undo_page_report_modify(
mach_write_to_1(ptr, type_cmpl);
+ type_cmpl_ptr = ptr;
+
ptr++;
len = mach_dulint_write_much_compressed(ptr, trx->undo_no);
ptr += len;
@@ -577,7 +593,23 @@ trx_undo_page_report_modify(
return(0);
}
- len = mach_write_compressed(ptr, flen);
+ if (rec_get_nth_field_extern_bit(rec, pos)) {
+ /* If a field has external storage, we add to
+ flen the flag */
+
+ len = mach_write_compressed(ptr,
+ UNIV_EXTERN_STORAGE_FIELD + flen);
+
+ /* Notify purge that it eventually has to free the old
+ externally stored field */
+
+ (trx->update_undo)->del_marks = TRUE;
+
+ *type_cmpl_ptr = *type_cmpl_ptr | TRX_UNDO_UPD_EXTERN;
+ } else {
+ len = mach_write_compressed(ptr, flen);
+ }
+
ptr += len;
if (flen != UNIV_SQL_NULL) {
@@ -825,6 +857,13 @@ trx_undo_update_rec_get_update(
upd_field_set_field_no(upd_field, field_no, index);
+ if (len != UNIV_SQL_NULL && len >= UNIV_EXTERN_STORAGE_FIELD) {
+
+ upd_field->extern_storage = TRUE;
+
+ len -= UNIV_EXTERN_STORAGE_FIELD;
+ }
+
dfield_set_data(&(upd_field->new_val), field, len);
}
@@ -1222,8 +1261,10 @@ trx_undo_prev_version_build(
byte* ptr;
ulint info_bits;
ulint cmpl_info;
+ ibool dummy_extern;
byte* buf;
ulint err;
+ ulint i;
ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
ut_ad(mtr_memo_contains(index_mtr, buf_block_align(index_rec),
@@ -1252,8 +1293,9 @@ trx_undo_prev_version_build(
return(err);
}
- ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info, &undo_no,
- &table_id);
+ ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
+
ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
&info_bits);
ptr = trx_undo_rec_skip_row_ref(ptr, index);
@@ -1278,5 +1320,15 @@ trx_undo_prev_version_build(
row_upd_rec_in_place(*old_vers, update);
}
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ if (upd_get_nth_field(update, i)->extern_storage) {
+
+ rec_set_nth_field_extern_bit(*old_vers,
+ upd_get_nth_field(update, i)->field_no,
+ TRUE, NULL);
+ }
+ }
+
return(DB_SUCCESS);
}
diff --git a/innobase/trx/trx0sys.c b/innobase/trx/trx0sys.c
index 99ec5b50237..b056975d28a 100644
--- a/innobase/trx/trx0sys.c
+++ b/innobase/trx/trx0sys.c
@@ -19,9 +19,326 @@ Created 3/26/1996 Heikki Tuuri
#include "trx0undo.h"
#include "srv0srv.h"
#include "trx0purge.h"
+#include "log0log.h"
/* The transaction system */
-trx_sys_t* trx_sys = NULL;
+trx_sys_t* trx_sys = NULL;
+trx_doublewrite_t* trx_doublewrite = NULL;
+
+/********************************************************************
+Sets up the in-memory doublewrite buffer descriptor at a database start:
+allocates the global trx_doublewrite struct, creates its mutex at level
+SYNC_DOUBLEWRITE, reads the start page numbers of the two doublewrite
+blocks from the header on the trx sys page, and allocates the write
+buffer (aligned to UNIV_PAGE_SIZE) and the buffer block pointer array. */
+static
+void
+trx_doublewrite_init(
+/*=================*/
+	byte*	doublewrite)	/* in: pointer to the doublewrite buf
+				header on trx sys page */
+{
+	trx_doublewrite_t*	dblwr;
+
+	dblwr = mem_alloc(sizeof(trx_doublewrite_t));
+
+	/* Publish the struct through the global pointer right away; the
+	rest of the function fills it in through the local alias */
+	trx_doublewrite = dblwr;
+
+	mutex_create(&(dblwr->mutex));
+	mutex_set_level(&(dblwr->mutex), SYNC_DOUBLEWRITE);
+
+	/* Nothing is buffered yet */
+	dblwr->first_free = 0;
+
+	/* Start page numbers of the two blocks, as stored in the header */
+	dblwr->block1 = mach_read_from_4(doublewrite
+					+ TRX_SYS_DOUBLEWRITE_BLOCK1);
+	dblwr->block2 = mach_read_from_4(doublewrite
+					+ TRX_SYS_DOUBLEWRITE_BLOCK2);
+
+	/* One page more than the two blocks need, so that the buffer can
+	be aligned to a page boundary below */
+	dblwr->write_buf_unaligned = ut_malloc(
+				(1 + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+				* UNIV_PAGE_SIZE);
+	dblwr->write_buf = ut_align(dblwr->write_buf_unaligned,
+							UNIV_PAGE_SIZE);
+
+	/* One pointer slot per page of the two blocks */
+	dblwr->buf_block_arr = mem_alloc(2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+							* sizeof(void*));
+}
+
+/********************************************************************
+Creates the doublewrite buffer at a database start. The header of the
+doublewrite buffer is placed on the trx system header page.
+
+Does nothing if trx_doublewrite is already set. Otherwise the trx sys
+page is inspected: if the doublewrite magic number is present, the
+in-memory descriptor is initialized from the header. If it is not, a
+file segment is created, 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
++ FSP_EXTENT_SIZE / 2 pages are allocated from it, the start pages of
+the two blocks and the magic number are written (each twice, the second
+copy at offset TRX_SYS_DOUBLEWRITE_REPEAT) to the header, a checkpoint
+is made, and the function starts over so that the freshly written
+header is read in. Calls exit(1) if the buffer pool or the tablespace
+is too small. */
+
+void
+trx_sys_create_doublewrite_buf(void)
+/*================================*/
+{
+	page_t*	page;
+	page_t*	page2;
+	page_t*	new_page;
+	byte*	doublewrite;
+	byte*	fseg_header;
+	ulint	page_no;
+	ulint	prev_page_no;
+	ulint	i;
+	mtr_t	mtr;
+
+	if (trx_doublewrite) {
+		/* Already inited */
+
+		return;
+	}
+
+start_again:
+	mtr_start(&mtr);
+
+	page = buf_page_get(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
+	buf_page_dbg_add_level(page, SYNC_NO_ORDER_CHECK);
+
+	doublewrite = page + TRX_SYS_DOUBLEWRITE;
+
+	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+					== TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+
+		/* The doublewrite buffer has already been created:
+		just read in some numbers */
+
+		trx_doublewrite_init(doublewrite);
+
+		mtr_commit(&mtr);
+	} else {
+		fprintf(stderr,
+		"InnoDB: Doublewrite buffer not found: creating new\n");
+
+		if (buf_pool_get_curr_size() <
+					(2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+						+ FSP_EXTENT_SIZE / 2 + 100)
+					* UNIV_PAGE_SIZE) {
+			fprintf(stderr,
+			"InnoDB: Cannot create doublewrite buffer: you must\n"
+			"InnoDB: increase your buffer pool size.\n"
+			"InnoDB: Cannot continue operation.\n");
+
+			exit(1);
+		}
+
+		page2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
+			TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
+
+		/* Check the result before touching the page: on failure
+		there is no frame whose latch could be declared */
+
+		if (page2 == NULL) {
+			fprintf(stderr,
+			"InnoDB: Cannot create doublewrite buffer: you must\n"
+			"InnoDB: increase your tablespace size.\n"
+			"InnoDB: Cannot continue operation.\n");
+
+			/* We exit without committing the mtr to prevent
+			its modifications to the database getting to disk */
+
+			exit(1);
+		}
+
+		/* fseg_create acquires a second latch on the page,
+		therefore we must declare it: */
+
+		buf_page_dbg_add_level(page2, SYNC_NO_ORDER_CHECK);
+
+		fseg_header = page + TRX_SYS_DOUBLEWRITE
+						+ TRX_SYS_DOUBLEWRITE_FSEG;
+		prev_page_no = 0;
+
+		for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+						+ FSP_EXTENT_SIZE / 2; i++) {
+			page_no = fseg_alloc_free_page(fseg_header,
+							prev_page_no + 1,
+							FSP_UP, &mtr);
+			if (page_no == FIL_NULL) {
+				fprintf(stderr,
+			"InnoDB: Cannot create doublewrite buffer: you must\n"
+			"InnoDB: increase your tablespace size.\n"
+			"InnoDB: Cannot continue operation.\n");
+
+				exit(1);
+			}
+
+			/* We read the allocated pages to the buffer pool;
+			when they are written to disk in a flush, the space
+			id and page number fields are also written to the
+			pages. When we at database startup read pages
+			from the doublewrite buffer, we know that if the
+			space id and page number in them are the same as
+			the page position in the tablespace, then the page
+			has not been written to in doublewrite. */
+
+			new_page = buf_page_get(TRX_SYS_SPACE, page_no,
+							RW_X_LATCH, &mtr);
+			buf_page_dbg_add_level(new_page, SYNC_NO_ORDER_CHECK);
+
+			/* Make a dummy change to the page to ensure it will
+			be written to disk in a flush */
+
+			mlog_write_ulint(new_page + FIL_PAGE_DATA,
+					TRX_SYS_DOUBLEWRITE_MAGIC_N,
+					MLOG_4BYTES, &mtr);
+
+			/* The first FSP_EXTENT_SIZE / 2 allocated pages
+			belong to neither block. Block 1 starts at the page
+			allocated after them and block 2 starts
+			TRX_SYS_DOUBLEWRITE_BLOCK_SIZE allocations later.
+			Inside a block the pages must be consecutive; the
+			first page of block 2 is exempt from that check, so
+			the two blocks need not be adjacent. */
+
+			if (i == FSP_EXTENT_SIZE / 2) {
+				mlog_write_ulint(doublewrite
+						+ TRX_SYS_DOUBLEWRITE_BLOCK1,
+						page_no, MLOG_4BYTES, &mtr);
+				mlog_write_ulint(doublewrite
+						+ TRX_SYS_DOUBLEWRITE_REPEAT
+						+ TRX_SYS_DOUBLEWRITE_BLOCK1,
+						page_no, MLOG_4BYTES, &mtr);
+			} else if (i == FSP_EXTENT_SIZE / 2
+					+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+				mlog_write_ulint(doublewrite
+						+ TRX_SYS_DOUBLEWRITE_BLOCK2,
+						page_no, MLOG_4BYTES, &mtr);
+				mlog_write_ulint(doublewrite
+						+ TRX_SYS_DOUBLEWRITE_REPEAT
+						+ TRX_SYS_DOUBLEWRITE_BLOCK2,
+						page_no, MLOG_4BYTES, &mtr);
+			} else if (i > FSP_EXTENT_SIZE / 2) {
+				ut_a(page_no == prev_page_no + 1);
+			}
+
+			prev_page_no = page_no;
+		}
+
+		/* The magic number is written last, after the block
+		positions, and like them in two copies */
+
+		mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
+			TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr);
+		mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
+						+ TRX_SYS_DOUBLEWRITE_REPEAT,
+			TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr);
+		mtr_commit(&mtr);
+
+		/* Flush the modified pages to disk and make a checkpoint */
+		log_make_checkpoint_at(ut_dulint_max, TRUE);
+
+		fprintf(stderr, "InnoDB: Doublewrite buffer created\n");
+
+		/* Start over: this time the magic number is found and the
+		in-memory descriptor is initialized from the header */
+
+		goto start_again;
+	}
+}
+
+/********************************************************************
+At a database startup uses a possible doublewrite buffer to restore
+half-written pages in the data files. Reads the trx sys header page
+directly with fil_io; if it does not carry the doublewrite magic number,
+the function does nothing. Otherwise it initializes trx_doublewrite,
+reads both doublewrite blocks into its write buffer and, for each page
+in them, reads the page at the space id and page number stored in the
+page header from the data files. If buf_page_is_corrupted reports that
+copy as corrupt, it is overwritten with the copy from the doublewrite
+buffer. Calls exit(1) if the doublewrite copy is corrupt as well.
+Finally flushes the tablespace files. */
+
+void
+trx_sys_doublewrite_restore_corrupt_pages(void)
+/*===========================================*/
+{
+ byte* buf; /* both doublewrite blocks read into memory */
+ byte* read_buf; /* aligned one-page buffer for single page reads */
+ byte* unaligned_read_buf; /* the allocation read_buf points into */
+ ulint block1; /* start page number of doublewrite block 1 */
+ ulint block2; /* start page number of doublewrite block 2 */
+ byte* page; /* current page within buf */
+ byte* doublewrite; /* doublewrite header within read_buf */
+ ulint space_id;
+ ulint page_no;
+ ulint i;
+
+ /* We do the file i/o past the buffer pool: read_buf is a private
+ page-sized buffer aligned to UNIV_PAGE_SIZE, carved out of a
+ two-page allocation */
+
+ unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
+ read_buf = ut_align(unaligned_read_buf, UNIV_PAGE_SIZE);
+
+ /* Read the trx sys header to check if we are using the
+ doublewrite buffer */
+
+ fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, TRX_SYS_PAGE_NO, 0,
+ UNIV_PAGE_SIZE, read_buf, NULL);
+
+ doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+ == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+ /* The doublewrite buffer has been created: set up the
+ in-memory descriptor and borrow its write buffer as the
+ place to read the two blocks into */
+
+ trx_doublewrite_init(doublewrite);
+
+ block1 = trx_doublewrite->block1;
+ block2 = trx_doublewrite->block2;
+
+ buf = trx_doublewrite->write_buf;
+ } else {
+ goto leave_func; /* no doublewrite buffer: block1, block2 and buf stay unset and are not used */
+ }
+
+ /* Read the pages from the doublewrite buffer to memory: block 1
+ goes to the start of buf, block 2 right after it */
+
+ fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, block1, 0,
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+ buf, NULL);
+ fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, block2, 0,
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+ buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+ NULL);
+ /* Check if any of these pages is half-written in data files, in the
+ intended position */
+
+ page = buf;
+
+ for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
+
+ space_id = mach_read_from_4(page + FIL_PAGE_SPACE);
+ page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
+
+ if (!fil_check_adress_in_tablespace(space_id, page_no)) {
+ fprintf(stderr,
+ "InnoDB: Warning: an inconsistent page in the doublewrite buffer\n"
+ "InnoDB: space id %lu page number %lu, %lu'th page in dblwr buf.\n",
+ space_id, page_no, i);
+
+ } else if (space_id == TRX_SYS_SPACE
+ && ( (page_no >= block1
+ && page_no
+ < block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+ || (page_no >= block2
+ && page_no
+ < block2 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
+
+ /* It is an unwritten doublewrite buffer page: its header
+ names a position inside one of the two doublewrite blocks
+ themselves, so there is nothing to restore:
+ do nothing */
+
+ } else {
+ /* Read in the actual page from the data files */
+
+ fil_io(OS_FILE_READ, TRUE, space_id, page_no, 0,
+ UNIV_PAGE_SIZE, read_buf, NULL);
+ /* Check if the page is corrupt */
+
+ if (buf_page_is_corrupted(read_buf)) {
+
+ fprintf(stderr,
+ "InnoDB: Warning: database page corruption or a failed\n"
+ "InnoDB: file read of page %lu.\n", page_no);
+ fprintf(stderr,
+ "InnoDB: Trying to recover it from the doublewrite buffer.\n");
+
+ if (buf_page_is_corrupted(page)) {
+ fprintf(stderr,
+ "InnoDB: Also the page in the doublewrite buffer is corrupt.\n"
+ "InnoDB: Cannot continue operation.\n");
+ exit(1);
+ }
+
+ /* Write the good page from the
+ doublewrite buffer to the intended
+ position */
+
+ fil_io(OS_FILE_WRITE, TRUE, space_id,
+ page_no, 0,
+ UNIV_PAGE_SIZE, page, NULL);
+ fprintf(stderr,
+ "InnoDB: Recovered the page from the doublewrite buffer.\n");
+ }
+ }
+
+ page += UNIV_PAGE_SIZE; /* advance to the next page held in buf */
+ }
+
+ fil_flush_file_spaces(FIL_TABLESPACE);
+
+leave_func:
+ ut_free(unaligned_read_buf);
+}
/********************************************************************
Checks that trx is in the trx list. */