summary | refs | log | tree | commit | diff
path: root/innobase
diff options
context:
space:
mode:
Diffstat (limited to 'innobase')
-rw-r--r--innobase/btr/btr0btr.c48
-rw-r--r--innobase/btr/btr0cur.c61
-rw-r--r--innobase/btr/btr0pcur.c43
-rw-r--r--innobase/btr/btr0sea.c32
-rw-r--r--innobase/buf/buf0buf.c561
-rw-r--r--innobase/buf/buf0flu.c89
-rw-r--r--innobase/buf/buf0lru.c256
-rw-r--r--innobase/buf/buf0rea.c194
-rw-r--r--innobase/configure.in2
-rw-r--r--innobase/data/data0data.c18
-rw-r--r--innobase/data/data0type.c95
-rw-r--r--innobase/dict/dict0boot.c10
-rw-r--r--innobase/dict/dict0crea.c67
-rw-r--r--innobase/dict/dict0dict.c99
-rw-r--r--innobase/dict/dict0load.c152
-rw-r--r--innobase/dict/dict0mem.c2
-rw-r--r--innobase/fil/fil0fil.c3341
-rw-r--r--innobase/fsp/fsp0fsp.c539
-rw-r--r--innobase/fut/fut0lst.c5
-rw-r--r--innobase/ha/ha0ha.c74
-rw-r--r--innobase/ha/hash0hash.c1
-rw-r--r--innobase/ibuf/ibuf0ibuf.c918
-rw-r--r--innobase/include/btr0btr.ic12
-rw-r--r--innobase/include/btr0pcur.h3
-rw-r--r--innobase/include/btr0pcur.ic2
-rw-r--r--innobase/include/buf0buf.h156
-rw-r--r--innobase/include/buf0buf.ic132
-rw-r--r--innobase/include/buf0lru.h14
-rw-r--r--innobase/include/buf0rea.h13
-rw-r--r--innobase/include/data0type.h169
-rw-r--r--innobase/include/data0type.ic94
-rw-r--r--innobase/include/db0err.h5
-rw-r--r--innobase/include/dict0boot.h2
-rw-r--r--innobase/include/dict0dict.h24
-rw-r--r--innobase/include/dict0load.h16
-rw-r--r--innobase/include/dict0mem.h7
-rw-r--r--innobase/include/dyn0dyn.ic2
-rw-r--r--innobase/include/fil0fil.h505
-rw-r--r--innobase/include/fsp0fsp.h59
-rw-r--r--innobase/include/fut0lst.ic2
-rw-r--r--innobase/include/ha0ha.h2
-rw-r--r--innobase/include/ha0ha.ic43
-rw-r--r--innobase/include/hash0hash.h4
-rw-r--r--innobase/include/ibuf0ibuf.h34
-rw-r--r--innobase/include/ibuf0ibuf.ic2
-rw-r--r--innobase/include/lock0lock.h4
-rw-r--r--innobase/include/log0log.h33
-rw-r--r--innobase/include/log0log.ic8
-rw-r--r--innobase/include/log0recv.h25
-rw-r--r--innobase/include/mach0data.ic55
-rw-r--r--innobase/include/mem0pool.h2
-rw-r--r--innobase/include/mtr0log.h13
-rw-r--r--innobase/include/mtr0log.ic35
-rw-r--r--innobase/include/mtr0mtr.h8
-rw-r--r--innobase/include/os0file.h168
-rw-r--r--innobase/include/os0proc.h70
-rw-r--r--innobase/include/page0page.h3
-rw-r--r--innobase/include/que0types.h3
-rw-r--r--innobase/include/rem0rec.h6
-rw-r--r--innobase/include/rem0rec.ic30
-rw-r--r--innobase/include/row0mysql.h47
-rw-r--r--innobase/include/row0sel.h3
-rw-r--r--innobase/include/row0sel.ic2
-rw-r--r--innobase/include/row0upd.ic4
-rw-r--r--innobase/include/srv0srv.h9
-rw-r--r--innobase/include/srv0start.h9
-rw-r--r--innobase/include/sync0sync.h6
-rw-r--r--innobase/include/trx0rseg.ic4
-rw-r--r--innobase/include/trx0sys.h45
-rw-r--r--innobase/include/univ.i39
-rw-r--r--innobase/include/ut0byte.h2
-rw-r--r--innobase/include/ut0byte.ic8
-rw-r--r--innobase/include/ut0dbg.h15
-rw-r--r--innobase/include/ut0mem.h10
-rw-r--r--innobase/include/ut0ut.h53
-rw-r--r--innobase/include/ut0ut.ic4
-rw-r--r--innobase/lock/lock0lock.c58
-rw-r--r--innobase/log/log0log.c209
-rw-r--r--innobase/log/log0recv.c458
-rw-r--r--innobase/mach/mach0data.c16
-rw-r--r--innobase/mem/mem0dbg.c63
-rw-r--r--innobase/mem/mem0pool.c21
-rw-r--r--innobase/mtr/mtr0log.c12
-rw-r--r--innobase/mtr/mtr0mtr.c10
-rw-r--r--innobase/os/os0file.c784
-rw-r--r--innobase/os/os0proc.c462
-rw-r--r--innobase/os/os0sync.c6
-rw-r--r--innobase/page/page0cur.c11
-rw-r--r--innobase/page/page0page.c121
-rw-r--r--innobase/pars/lexyy.c2
-rw-r--r--innobase/pars/pars0opt.c5
-rw-r--r--innobase/que/que0que.c17
-rw-r--r--innobase/read/read0read.c16
-rw-r--r--innobase/rem/rem0cmp.c58
-rw-r--r--innobase/rem/rem0rec.c22
-rw-r--r--innobase/row/row0ins.c9
-rw-r--r--innobase/row/row0mysql.c440
-rw-r--r--innobase/row/row0purge.c10
-rw-r--r--innobase/row/row0sel.c172
-rw-r--r--innobase/row/row0uins.c7
-rw-r--r--innobase/row/row0umod.c7
-rw-r--r--innobase/row/row0undo.c3
-rw-r--r--innobase/row/row0upd.c3
-rw-r--r--innobase/srv/srv0srv.c145
-rw-r--r--innobase/srv/srv0start.c456
-rw-r--r--innobase/sync/sync0arr.c36
-rw-r--r--innobase/sync/sync0rw.c35
-rw-r--r--innobase/sync/sync0sync.c45
-rw-r--r--innobase/trx/trx0purge.c25
-rw-r--r--innobase/trx/trx0rec.c36
-rw-r--r--innobase/trx/trx0roll.c26
-rw-r--r--innobase/trx/trx0sys.c262
-rw-r--r--innobase/trx/trx0trx.c34
-rw-r--r--innobase/trx/trx0undo.c29
-rw-r--r--innobase/ut/ut0byte.c2
-rw-r--r--innobase/ut/ut0mem.c49
-rw-r--r--innobase/ut/ut0rnd.c3
-rw-r--r--innobase/ut/ut0ut.c272
118 files changed, 10370 insertions, 2719 deletions
diff --git a/innobase/btr/btr0btr.c b/innobase/btr/btr0btr.c
index 71be6d81d7c..ee27a171fa5 100644
--- a/innobase/btr/btr0btr.c
+++ b/innobase/btr/btr0btr.c
@@ -430,7 +430,8 @@ btr_page_free_for_ibuf(
flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
- ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr));
+ ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ mtr));
}
/******************************************************************
@@ -603,8 +604,8 @@ btr_page_get_father_for_rec(
"InnoDB: father ptr page no %lu, child page no %lu\n",
(UT_LIST_GET_FIRST(tree->tree_indexes))->table_name,
(UT_LIST_GET_FIRST(tree->tree_indexes))->name,
- btr_node_ptr_get_child_page_no(node_ptr),
- buf_frame_get_page_no(page));
+ (unsigned long) btr_node_ptr_get_child_page_no(node_ptr),
+ (unsigned long) buf_frame_get_page_no(page));
page_rec_print(page_rec_get_next(page_get_infimum_rec(page)));
page_rec_print(node_ptr);
@@ -885,7 +886,9 @@ btr_page_reorganize_low(
"InnoDB: Error: page old data size %lu new data size %lu\n"
"InnoDB: Error: page old max ins size %lu new max ins size %lu\n"
"InnoDB: Make a detailed bug report and send it to mysql@lists.mysql.com\n",
- data_size1, data_size2, max_ins_size1, max_ins_size2);
+ (unsigned long) data_size1, (unsigned long) data_size2,
+ (unsigned long) max_ins_size1,
+ (unsigned long) max_ins_size2);
}
buf_frame_free(new_page);
@@ -2225,7 +2228,8 @@ btr_print_recursive(
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
MTR_MEMO_PAGE_X_FIX));
printf("NODE ON LEVEL %lu page number %lu\n",
- btr_page_get_level(page, mtr), buf_frame_get_page_no(page));
+ (ulong) btr_page_get_level(page, mtr),
+ (ulong) buf_frame_get_page_no(page));
page_print(page, width, width);
@@ -2366,8 +2370,10 @@ btr_index_rec_validate(
"InnoDB: Record in index %s in table %s, page %lu, at offset %lu\n"
"InnoDB: has %lu fields, should have %lu\n",
index->name, index->table_name,
- buf_frame_get_page_no(page), (ulint)(rec - page),
- rec_get_n_fields(rec), n);
+ (unsigned long) buf_frame_get_page_no(page),
+ (unsigned long) (rec - page),
+ (unsigned long) rec_get_n_fields(rec),
+ (unsigned long) n);
if (!dump_on_error) {
@@ -2400,9 +2406,11 @@ btr_index_rec_validate(
"InnoDB: Record in index %s in table %s, page %lu, at offset %lu\n"
"InnoDB: field %lu len is %lu, should be %lu\n",
index->name, index->table_name,
- buf_frame_get_page_no(page),
- (ulint)(rec - page),
- i, len, dtype_get_fixed_size(type));
+ (unsigned long) buf_frame_get_page_no(page),
+ (unsigned long) (rec - page),
+ (unsigned long) i,
+ (unsigned long) len,
+ (unsigned long) dtype_get_fixed_size(type));
if (!dump_on_error) {
@@ -2517,8 +2525,8 @@ loop:
if (!page_validate(page, index)) {
fprintf(stderr,
"InnoDB: Error in page %lu in index %s table %s, index tree level %lu\n",
- buf_frame_get_page_no(page), index->name,
- index->table_name, level);
+ (ulong) buf_frame_get_page_no(page), index->name,
+ index->table_name, (ulong) level);
ret = FALSE;
} else if (level == 0) {
@@ -2550,8 +2558,8 @@ loop:
fprintf(stderr,
"InnoDB: Error on pages %lu and %lu in index %s table %s\n",
- buf_frame_get_page_no(page),
- right_page_no,
+ (ulong) buf_frame_get_page_no(page),
+ (ulong) right_page_no,
index->name, index->table_name);
fprintf(stderr,
@@ -2591,7 +2599,7 @@ loop:
&mtr)) {
fprintf(stderr,
"InnoDB: Error on page %lu in index %s table %s\n",
- buf_frame_get_page_no(page),
+ (unsigned long) buf_frame_get_page_no(page),
index->name, index->table_name);
fprintf(stderr,
@@ -2606,7 +2614,7 @@ loop:
fprintf(stderr,
"InnoDB: node ptr child page n:o %lu\n",
- btr_node_ptr_get_child_page_no(node_ptr));
+ (unsigned long) btr_node_ptr_get_child_page_no(node_ptr));
rec_sprintf(err_buf, 900,
btr_page_get_father_for_rec(tree, page,
@@ -2634,7 +2642,7 @@ loop:
fprintf(stderr,
"InnoDB: Error on page %lu in index %s table %s\n",
- buf_frame_get_page_no(page),
+ (ulong) buf_frame_get_page_no(page),
index->name, index->table_name);
buf_page_print(father_page);
@@ -2689,7 +2697,7 @@ loop:
fprintf(stderr,
"InnoDB: Error on page %lu in index %s table %s\n",
- buf_frame_get_page_no(page),
+ (unsigned long) buf_frame_get_page_no(page),
index->name, index->table_name);
buf_page_print(father_page);
@@ -2709,7 +2717,7 @@ loop:
fprintf(stderr,
"InnoDB: Error on page %lu in index %s table %s\n",
- buf_frame_get_page_no(page),
+ (unsigned long) buf_frame_get_page_no(page),
index->name, index->table_name);
buf_page_print(father_page);
@@ -2727,7 +2735,7 @@ loop:
fprintf(stderr,
"InnoDB: Error on page %lu in index %s table %s\n",
- buf_frame_get_page_no(page),
+ (unsigned long) buf_frame_get_page_no(page),
index->name, index->table_name);
buf_page_print(father_page);
diff --git a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c
index 6e1794c2ff7..af3a61041cb 100644
--- a/innobase/btr/btr0cur.c
+++ b/innobase/btr/btr0cur.c
@@ -291,6 +291,7 @@ btr_cur_search_to_nth_level(
&& latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ
&& !estimate
&& mode != PAGE_CUR_LE_OR_EXTENDS
+ && srv_use_adaptive_hash_indexes
&& btr_search_guess_on_hash(index, info, tuple, mode,
latch_mode, cursor,
has_search_latch, mtr)) {
@@ -495,9 +496,11 @@ retry_page_get:
cursor->up_bytes = up_bytes;
#ifdef BTR_CUR_ADAPT
- btr_search_info_update(index, cursor);
-#endif
+ if (srv_use_adaptive_hash_indexes) {
+ btr_search_info_update(index, cursor);
+ }
+#endif
ut_ad(cursor->up_match != ULINT_UNDEFINED
|| mode != PAGE_CUR_GE);
ut_ad(cursor->up_match != ULINT_UNDEFINED
@@ -871,8 +874,8 @@ btr_cur_optimistic_insert(
if (btr_cur_print_record_ops && thr) {
printf(
"Trx with id %lu %lu going to insert to table %s index %s\n",
- ut_dulint_get_high(thr_get_trx(thr)->id),
- ut_dulint_get_low(thr_get_trx(thr)->id),
+ (unsigned long) ut_dulint_get_high(thr_get_trx(thr)->id),
+ (unsigned long) ut_dulint_get_low(thr_get_trx(thr)->id),
index->table_name, index->name);
dtuple_print(entry);
}
@@ -954,7 +957,7 @@ calculate_sizes_again:
/* Now, try the insert */
*rec = page_cur_insert_rec_low(page_cursor, entry, data_size,
- NULL, mtr);
+ NULL, mtr);
if (!(*rec)) {
/* If the record did not fit, reorganize */
btr_page_reorganize(page, mtr);
@@ -975,7 +978,8 @@ calculate_sizes_again:
fprintf(stderr,
"InnoDB: Error: cannot insert tuple %s to index %s of table %s\n"
"InnoDB: max insert size %lu\n",
- err_buf, index->name, index->table->name, max_size);
+ err_buf, index->name, index->table->name,
+ (unsigned long) max_size);
mem_free(err_buf);
}
@@ -1045,6 +1049,7 @@ btr_cur_pessimistic_insert(
ibool dummy_inh;
ibool success;
ulint n_extents = 0;
+ ulint n_reserved;
ut_ad(dtuple_check_typed(entry));
@@ -1064,7 +1069,7 @@ btr_cur_pessimistic_insert(
cursor->flag = BTR_CUR_BINARY;
err = btr_cur_optimistic_insert(flags, cursor, entry, rec, big_rec,
- thr, mtr);
+ thr, mtr);
if (err != DB_FAIL) {
return(err);
@@ -1087,7 +1092,7 @@ btr_cur_pessimistic_insert(
n_extents = cursor->tree_height / 16 + 3;
- success = fsp_reserve_free_extents(index->space,
+ success = fsp_reserve_free_extents(&n_reserved, index->space,
n_extents, FSP_NORMAL, mtr);
if (!success) {
err = DB_OUT_OF_FILE_SPACE;
@@ -1109,7 +1114,7 @@ btr_cur_pessimistic_insert(
if (n_extents > 0) {
fil_space_release_free_extents(index->space,
- n_extents);
+ n_reserved);
}
return(DB_TOO_BIG_RECORD);
}
@@ -1137,7 +1142,7 @@ btr_cur_pessimistic_insert(
err = DB_SUCCESS;
if (n_extents > 0) {
- fil_space_release_free_extents(index->space, n_extents);
+ fil_space_release_free_extents(index->space, n_reserved);
}
*big_rec = big_rec_vec;
@@ -1318,7 +1323,8 @@ btr_cur_parse_update_in_place(
}
/*****************************************************************
-Updates a record when the update causes no size changes in its fields. */
+Updates a record when the update causes no size changes in its fields.
+We assume here that the ordering fields of the record do not change. */
ulint
btr_cur_update_in_place(
@@ -1349,8 +1355,8 @@ btr_cur_update_in_place(
if (btr_cur_print_record_ops && thr) {
printf(
"Trx with id %lu %lu going to update table %s index %s\n",
- ut_dulint_get_high(thr_get_trx(thr)->id),
- ut_dulint_get_low(thr_get_trx(thr)->id),
+ (unsigned long) ut_dulint_get_high(thr_get_trx(thr)->id),
+ (unsigned long) ut_dulint_get_low(thr_get_trx(thr)->id),
index->table_name, index->name);
rec_print(rec);
}
@@ -1453,8 +1459,8 @@ btr_cur_optimistic_update(
if (btr_cur_print_record_ops && thr) {
printf(
"Trx with id %lu %lu going to update table %s index %s\n",
- ut_dulint_get_high(thr_get_trx(thr)->id),
- ut_dulint_get_low(thr_get_trx(thr)->id),
+ (unsigned long) ut_dulint_get_high(thr_get_trx(thr)->id),
+ (unsigned long) ut_dulint_get_low(thr_get_trx(thr)->id),
index->table_name, index->name);
rec_print(rec);
}
@@ -1666,6 +1672,7 @@ btr_cur_pessimistic_update(
ibool was_first;
ibool success;
ulint n_extents = 0;
+ ulint n_reserved;
ulint* ext_vect;
ulint n_ext_vect;
ulint reserve_flag;
@@ -1711,7 +1718,8 @@ btr_cur_pessimistic_update(
reserve_flag = FSP_NORMAL;
}
- success = fsp_reserve_free_extents(cursor->index->space,
+ success = fsp_reserve_free_extents(&n_reserved,
+ cursor->index->space,
n_extents, reserve_flag, mtr);
if (!success) {
err = DB_OUT_OF_FILE_SPACE;
@@ -1860,7 +1868,7 @@ return_after_reservations:
if (n_extents > 0) {
fil_space_release_free_extents(cursor->index->space,
- n_extents);
+ n_reserved);
}
*big_rec = big_rec_vec;
@@ -2000,8 +2008,8 @@ btr_cur_del_mark_set_clust_rec(
if (btr_cur_print_record_ops && thr) {
printf(
"Trx with id %lu %lu going to del mark table %s index %s\n",
- ut_dulint_get_high(thr_get_trx(thr)->id),
- ut_dulint_get_low(thr_get_trx(thr)->id),
+ (unsigned long) ut_dulint_get_high(thr_get_trx(thr)->id),
+ (unsigned long) ut_dulint_get_low(thr_get_trx(thr)->id),
index->table_name, index->name);
rec_print(rec);
}
@@ -2140,8 +2148,8 @@ btr_cur_del_mark_set_sec_rec(
if (btr_cur_print_record_ops && thr) {
printf(
"Trx with id %lu %lu going to del mark table %s index %s\n",
- ut_dulint_get_high(thr_get_trx(thr)->id),
- ut_dulint_get_low(thr_get_trx(thr)->id),
+ (unsigned long) ut_dulint_get_high(thr_get_trx(thr)->id),
+ (unsigned long) ut_dulint_get_low(thr_get_trx(thr)->id),
cursor->index->table_name, cursor->index->name);
rec_print(rec);
}
@@ -2331,6 +2339,7 @@ btr_cur_pessimistic_delete(
rec_t* rec;
dtuple_t* node_ptr;
ulint n_extents = 0;
+ ulint n_reserved;
ibool success;
ibool ret = FALSE;
mem_heap_t* heap;
@@ -2349,7 +2358,8 @@ btr_cur_pessimistic_delete(
n_extents = cursor->tree_height / 32 + 1;
- success = fsp_reserve_free_extents(cursor->index->space,
+ success = fsp_reserve_free_extents(&n_reserved,
+ cursor->index->space,
n_extents, FSP_CLEANING, mtr);
if (!success) {
*err = DB_OUT_OF_FILE_SPACE;
@@ -2428,7 +2438,8 @@ return_after_reservations:
}
if (n_extents > 0) {
- fil_space_release_free_extents(cursor->index->space, n_extents);
+ fil_space_release_free_extents(cursor->index->space,
+ n_reserved);
}
return(ret);
@@ -3101,7 +3112,7 @@ btr_store_big_rec_extern_fields(
ut_ad(mtr_memo_contains(local_mtr, dict_tree_get_lock(index->tree),
MTR_MEMO_X_LOCK));
ut_ad(mtr_memo_contains(local_mtr, buf_block_align(rec),
- MTR_MEMO_PAGE_X_FIX));
+ MTR_MEMO_PAGE_X_FIX));
ut_a(index->type & DICT_CLUSTERED);
space_id = buf_frame_get_space_id(rec);
@@ -3269,7 +3280,7 @@ btr_free_externally_stored_field(
ut_ad(mtr_memo_contains(local_mtr, dict_tree_get_lock(index->tree),
MTR_MEMO_X_LOCK));
ut_ad(mtr_memo_contains(local_mtr, buf_block_align(data),
- MTR_MEMO_PAGE_X_FIX));
+ MTR_MEMO_PAGE_X_FIX));
ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
local_len -= BTR_EXTERN_FIELD_REF_SIZE;
diff --git a/innobase/btr/btr0pcur.c b/innobase/btr/btr0pcur.c
index 4725551d4d7..cf8a612ef28 100644
--- a/innobase/btr/btr0pcur.c
+++ b/innobase/btr/btr0pcur.c
@@ -95,7 +95,9 @@ btr_pcur_store_position(
ut_a(cursor->latch_mode != BTR_NO_LATCHES);
if (page_get_n_recs(page) == 0) {
- /* It must be an empty index tree */
+ /* It must be an empty index tree; NOTE that in this case
+ we do not store the modify_clock, but always do a search
+ if we restore the cursor position */
ut_a(btr_page_get_next(page, mtr) == FIL_NULL
&& btr_page_get_prev(page, mtr) == FIL_NULL);
@@ -128,12 +130,13 @@ btr_pcur_store_position(
} else {
cursor->rel_pos = BTR_PCUR_ON;
}
-
+
cursor->old_stored = BTR_PCUR_OLD_STORED;
cursor->old_rec = dict_tree_copy_rec_order_prefix(tree, rec,
&(cursor->old_rec_buf),
&(cursor->buf_size));
+ cursor->block_when_stored = buf_block_align(page);
cursor->modify_clock = buf_frame_get_modify_clock(page);
}
@@ -205,6 +208,9 @@ btr_pcur_restore_position(
if (cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE
|| cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
+ /* In these cases we do not try an optimistic restoration,
+ but always do a search */
+
if (cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
from_left = TRUE;
} else {
@@ -214,6 +220,10 @@ btr_pcur_restore_position(
btr_cur_open_at_index_side(from_left,
btr_pcur_get_btr_cur(cursor)->index, latch_mode,
btr_pcur_get_btr_cur(cursor), mtr);
+
+ cursor->block_when_stored =
+ buf_block_align(btr_pcur_get_page(cursor));
+
return(FALSE);
}
@@ -224,8 +234,9 @@ btr_pcur_restore_position(
if (latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF) {
/* Try optimistic restoration */
- if (buf_page_optimistic_get(latch_mode, page,
- cursor->modify_clock, mtr)) {
+ if (buf_page_optimistic_get(latch_mode,
+ cursor->block_when_stored, page,
+ cursor->modify_clock, mtr)) {
cursor->pos_state = BTR_PCUR_IS_POSITIONED;
#ifdef UNIV_SYNC_DEBUG
buf_page_dbg_add_level(page, SYNC_TREE_NODE);
@@ -270,8 +281,6 @@ btr_pcur_restore_position(
btr_pcur_open_with_no_init(btr_pcur_get_btr_cur(cursor)->index, tuple,
mode, latch_mode, cursor, 0, mtr);
-
- cursor->old_stored = BTR_PCUR_OLD_STORED;
/* Restore the old search mode */
cursor->search_mode = old_mode;
@@ -280,11 +289,18 @@ btr_pcur_restore_position(
&& btr_pcur_is_on_user_rec(cursor, mtr)
&& 0 == cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor))) {
- /* We have to store the NEW value for the modify clock, since
- the cursor can now be on a different page! */
+ /* We have to store the NEW value for the modify clock, since
+ the cursor can now be on a different page! But we can retain
+ the value of old_rec */
+
+ cursor->modify_clock =
+ buf_frame_get_modify_clock(btr_pcur_get_page(cursor));
+
+ cursor->block_when_stored =
+ buf_block_align(btr_pcur_get_page(cursor));
+
+ cursor->old_stored = BTR_PCUR_OLD_STORED;
- cursor->modify_clock = buf_frame_get_modify_clock(
- buf_frame_align(btr_pcur_get_rec(cursor)));
mem_heap_free(heap);
return(TRUE);
@@ -292,9 +308,10 @@ btr_pcur_restore_position(
mem_heap_free(heap);
- /* We have to store position information, modify clock value, etc.
- because the cursor may now be on a different page */
-
+ /* We have to store new position information, modify_clock etc.,
+ to the cursor because it can now be on a different page, the record
+ under it may have been removed, etc. */
+
btr_pcur_store_position(cursor, mtr);
return(FALSE);
diff --git a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c
index 9421ca48718..238f118e260 100644
--- a/innobase/btr/btr0sea.c
+++ b/innobase/btr/btr0sea.c
@@ -790,8 +790,8 @@ btr_search_guess_on_hash(
goto failure;
}
- ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
- ut_ad(page_rec_is_user_rec(rec));
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_a(page_rec_is_user_rec(rec));
btr_cur_position(index, rec, cursor);
@@ -1040,12 +1040,14 @@ btr_search_drop_page_hash_when_freed(
mtr_start(&mtr);
- /* We assume that if the caller has a latch on the page,
- then the caller has already dropped the hash index for the page,
- and we never get here. Therefore we can acquire the s-latch to
- the page without fearing a deadlock. */
+ /* We assume that if the caller has a latch on the page, then the
+ caller has already dropped the hash index for the page, and we never
+ get here. Therefore we can acquire the s-latch to the page without
+ having to fear a deadlock. */
- page = buf_page_get(space, page_no, RW_S_LATCH, &mtr);
+ page = buf_page_get_gen(space, page_no, RW_S_LATCH, NULL,
+ BUF_GET_IF_IN_POOL, IB__FILE__, __LINE__,
+ &mtr);
#ifdef UNIV_SYNC_DEBUG
buf_page_dbg_add_level(page, SYNC_TREE_NODE_FROM_HASH);
@@ -1563,11 +1565,12 @@ btr_search_validate(void)
fprintf(stderr,
" InnoDB: Error in an adaptive hash index pointer to page %lu\n"
"ptr mem address %lu index id %lu %lu, node fold %lu, rec fold %lu\n",
- buf_frame_get_page_no(page),
- (ulint)(node->data),
- ut_dulint_get_high(btr_page_get_index_id(page)),
- ut_dulint_get_low(btr_page_get_index_id(page)),
- node->fold, rec_fold((rec_t*)(node->data),
+ (ulong) buf_frame_get_page_no(page),
+ (ulong)(node->data),
+ (ulong) ut_dulint_get_high(btr_page_get_index_id(page)),
+ (ulong) ut_dulint_get_low(btr_page_get_index_id(page)),
+ (ulong) node->fold,
+ (ulong) rec_fold((rec_t*)(node->data),
block->curr_n_fields,
block->curr_n_bytes,
btr_page_get_index_id(page)));
@@ -1581,8 +1584,9 @@ btr_search_validate(void)
fprintf(stderr,
"Page mem address %lu, is hashed %lu, n fields %lu, n bytes %lu\n"
"side %lu\n",
- (ulint)page, block->is_hashed, block->curr_n_fields,
- block->curr_n_bytes, block->curr_side);
+ (ulong) page, (ulong) block->is_hashed,
+ (ulong) block->curr_n_fields,
+ (ulong) block->curr_n_bytes, (ulong) block->curr_side);
if (n_page_dumps < 20) {
buf_page_print(page);
diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c
index e2661725912..844880238fa 100644
--- a/innobase/buf/buf0buf.c
+++ b/innobase/buf/buf0buf.c
@@ -197,7 +197,29 @@ If a new page is referenced in the buf_pool, and several pages
of its random access area (for instance, 32 consecutive pages
in a tablespace) have recently been referenced, we may predict
that the whole area may be needed in the near future, and issue
-the read requests for the whole area. */
+the read requests for the whole area.
+
+ AWE implementation
+ ------------------
+
+By a 'block' we mean the buffer header of type buf_block_t. By a 'page'
+we mean the physical 16 kB memory area allocated from RAM for that block.
+By a 'frame' we mean a 16 kB area in the virtual address space of the
+process, in the frame_mem of buf_pool.
+
+We can map pages to the frames of the buffer pool.
+
+1) A buffer block allocated to use as a non-data page, e.g., to the lock
+table, is always mapped to a frame.
+2) A bufferfixed or io-fixed data page is always mapped to a frame.
+3) When we need to map a block to a frame, we look in the list
+awe_LRU_free_mapped and try to unmap its last block, but note that
+bufferfixed or io-fixed pages cannot be unmapped.
+4) For every frame in the buffer pool there is always a block whose page is
+mapped to it. When we create the buffer pool, we map the first elements
+in the free list to the frames.
+5) When we have AWE enabled, we disable adaptive hash indexes.
+*/
buf_pool_t* buf_pool = NULL; /* The buffer buf_pool of the database */
@@ -221,9 +243,10 @@ buf_calc_page_new_checksum(
{
ulint checksum;
- /* Since the fields FIL_PAGE_FILE_FLUSH_LSN and ..._ARCH_LOG_NO
- are written outside the buffer pool to the first pages of data
- files, we have to skip them in the page checksum calculation.
+ /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
+ ..._ARCH_LOG_NO, are written outside the buffer pool to the first
+ pages of data files, we have to skip them in the page checksum
+ calculation.
We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
checksum is stored, and also the last 8 bytes of page because
there we store the old formula checksum. */
@@ -233,7 +256,7 @@ buf_calc_page_new_checksum(
+ ut_fold_binary(page + FIL_PAGE_DATA,
UNIV_PAGE_SIZE - FIL_PAGE_DATA
- FIL_PAGE_END_LSN_OLD_CHKSUM);
- checksum = checksum & 0xFFFFFFFF;
+ checksum = checksum & 0xFFFFFFFFUL;
return(checksum);
}
@@ -256,7 +279,7 @@ buf_calc_page_old_checksum(
checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
- checksum = checksum & 0xFFFFFFFF;
+ checksum = checksum & 0xFFFFFFFFUL;
return(checksum);
}
@@ -274,8 +297,9 @@ buf_page_is_corrupted(
ulint old_checksum;
ulint checksum_field;
ulint old_checksum_field;
+#ifndef UNIV_HOTBACKUP
dulint current_lsn;
-
+#endif
if (mach_read_from_4(read_buf + FIL_PAGE_LSN + 4)
!= mach_read_from_4(read_buf + UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
@@ -297,13 +321,13 @@ buf_page_is_corrupted(
" InnoDB: Error: page %lu log sequence number %lu %lu\n"
"InnoDB: is in the future! Current system log sequence number %lu %lu.\n"
"InnoDB: Your database may be corrupt.\n",
- mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
- ut_dulint_get_high(
+ (ulong) mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
+ (ulong) ut_dulint_get_high(
mach_read_from_8(read_buf + FIL_PAGE_LSN)),
- ut_dulint_get_low(
+ (ulong) ut_dulint_get_low(
mach_read_from_8(read_buf + FIL_PAGE_LSN)),
- ut_dulint_get_high(current_lsn),
- ut_dulint_get_low(current_lsn));
+ (ulong) ut_dulint_get_high(current_lsn),
+ (ulong) ut_dulint_get_low(current_lsn));
}
}
#endif
@@ -356,8 +380,8 @@ buf_page_print(
ut_print_timestamp(stderr);
fprintf(stderr,
- " InnoDB: Page dump in ascii and hex (%lu bytes):\n%s",
- (ulint)UNIV_PAGE_SIZE, buf);
+" InnoDB: Page dump in ascii and hex (%lu bytes):\n%s",
+ (ulong) UNIV_PAGE_SIZE, buf);
fprintf(stderr, "InnoDB: End of page dump\n");
mem_free(buf);
@@ -369,16 +393,21 @@ buf_page_print(
fprintf(stderr,
" InnoDB: Page checksum %lu, prior-to-4.0.14-form checksum %lu\n"
"InnoDB: stored checksum %lu, prior-to-4.0.14-form stored checksum %lu\n",
- checksum, old_checksum,
- mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
- mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+ (ulong) checksum, (ulong) old_checksum,
+ (ulong) mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
+ (ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN_OLD_CHKSUM));
fprintf(stderr,
- "InnoDB: Page lsn %lu %lu, low 4 bytes of lsn at page end %lu\n",
- mach_read_from_4(read_buf + FIL_PAGE_LSN),
- mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
- mach_read_from_4(read_buf + UNIV_PAGE_SIZE
- - FIL_PAGE_END_LSN_OLD_CHKSUM + 4));
+"InnoDB: Page lsn %lu %lu, low 4 bytes of lsn at page end %lu\n"
+"InnoDB: Page number (if stored to page already) %lu,\n"
+"InnoDB: space id (if created with >= MySQL-4.1.1 and stored already) %lu\n",
+ (ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN),
+ (ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
+ (ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
+ (ulong) mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
+ (ulong) mach_read_from_4(read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+
if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE)
== TRX_UNDO_INSERT) {
fprintf(stderr,
@@ -392,12 +421,9 @@ buf_page_print(
if (fil_page_get_type(read_buf) == FIL_PAGE_INDEX) {
fprintf(stderr,
- "InnoDB: Page may be an index page ");
-
- fprintf(stderr,
- "where index id is %lu %lu\n",
- ut_dulint_get_high(btr_page_get_index_id(read_buf)),
- ut_dulint_get_low(btr_page_get_index_id(read_buf)));
+"InnoDB: Page may be an index page where index id is %lu %lu\n",
+ (ulong) ut_dulint_get_high(btr_page_get_index_id(read_buf)),
+ (ulong) ut_dulint_get_low(btr_page_get_index_id(read_buf)));
/* If the code is in ibbackup, dict_sys may be uninitialized,
i.e., NULL */
@@ -413,7 +439,6 @@ buf_page_print(
index->name);
}
}
-
} else if (fil_page_get_type(read_buf) == FIL_PAGE_INODE) {
fprintf(stderr, "InnoDB: Page may be an 'inode' page\n");
} else if (fil_page_get_type(read_buf) == FIL_PAGE_IBUF_FREE_LIST) {
@@ -429,23 +454,29 @@ void
buf_block_init(
/*===========*/
buf_block_t* block, /* in: pointer to control block */
- byte* frame) /* in: pointer to buffer frame */
+ byte* frame) /* in: pointer to buffer frame, or NULL if in
+ the case of AWE there is no frame */
{
block->state = BUF_BLOCK_NOT_USED;
block->frame = frame;
+ block->awe_info = NULL;
+
block->modify_clock = ut_dulint_zero;
block->file_page_was_freed = FALSE;
block->check_index_page_at_flush = FALSE;
+ block->in_free_list = FALSE;
+ block->in_LRU_list = FALSE;
+
+ block->n_pointers = 0;
+
rw_lock_create(&(block->lock));
ut_ad(rw_lock_validate(&(block->lock)));
- rw_lock_create(&(block->read_lock));
- rw_lock_set_level(&(block->read_lock), SYNC_NO_ORDER_CHECK);
#ifdef UNIV_SYNC_DEBUG
rw_lock_create(&(block->debug_latch));
rw_lock_set_level(&(block->debug_latch), SYNC_NO_ORDER_CHECK);
@@ -453,25 +484,40 @@ buf_block_init(
}
/************************************************************************
-Creates a buffer buf_pool object. */
-static
+Creates the buffer pool. */
+
buf_pool_t*
-buf_pool_create(
-/*============*/
+buf_pool_init(
+/*==========*/
/* out, own: buf_pool object, NULL if not
- enough memory */
+ enough memory or error */
ulint max_size, /* in: maximum size of the buf_pool in
blocks */
- ulint curr_size) /* in: current size to use, must be <=
+ ulint curr_size, /* in: current size to use, must be <=
max_size, currently must be equal to
max_size */
+ ulint n_frames) /* in: number of frames; if AWE is used,
+ this is the size of the address space window
+ where physical memory pages are mapped; if
+ AWE is not used then this must be the same
+ as max_size */
{
byte* frame;
ulint i;
buf_block_t* block;
ut_a(max_size == curr_size);
+ ut_a(srv_use_awe || n_frames == max_size);
+ if (n_frames > curr_size) {
+ fprintf(stderr,
+"InnoDB: AWE: Error: you must specify in my.cnf .._awe_mem_mb larger\n"
+"InnoDB: than .._buffer_pool_size. Now the former is %lu pages,\n"
+"InnoDB: the latter %lu pages.\n", (ulong) curr_size, (ulong) n_frames);
+
+ return(NULL);
+ }
+
buf_pool = mem_alloc(sizeof(buf_pool_t));
/* 1. Initialize general fields
@@ -480,8 +526,38 @@ buf_pool_create(
mutex_set_level(&(buf_pool->mutex), SYNC_BUF_POOL);
mutex_enter(&(buf_pool->mutex));
-
- buf_pool->frame_mem = ut_malloc(UNIV_PAGE_SIZE * (max_size + 1));
+
+ if (srv_use_awe) {
+ /*----------------------------------------*/
+ /* Allocate the virtual address space window, i.e., the
+ buffer pool frames */
+
+ buf_pool->frame_mem = os_awe_allocate_virtual_mem_window(
+ UNIV_PAGE_SIZE * (n_frames + 1));
+
+ /* Allocate the physical memory for AWE and the AWE info array
+ for buf_pool */
+
+ if ((curr_size % ((1024 * 1024) / UNIV_PAGE_SIZE)) != 0) {
+
+ fprintf(stderr,
+"InnoDB: AWE: Error: physical memory must be allocated in full megabytes.\n"
+"InnoDB: Trying to allocate %lu database pages.\n",
+ (ulong) curr_size);
+
+ return(NULL);
+ }
+
+ if (!os_awe_allocate_physical_mem(&(buf_pool->awe_info),
+ curr_size / ((1024 * 1024) / UNIV_PAGE_SIZE))) {
+
+ return(NULL);
+ }
+ /*----------------------------------------*/
+ } else {
+ buf_pool->frame_mem = ut_malloc(
+ UNIV_PAGE_SIZE * (n_frames + 1));
+ }
if (buf_pool->frame_mem == NULL) {
@@ -498,21 +574,60 @@ buf_pool_create(
buf_pool->max_size = max_size;
buf_pool->curr_size = curr_size;
+ buf_pool->n_frames = n_frames;
+
/* Align pointer to the first frame */
frame = ut_align(buf_pool->frame_mem, UNIV_PAGE_SIZE);
+
buf_pool->frame_zero = frame;
+ buf_pool->high_end = frame + UNIV_PAGE_SIZE * n_frames;
+
+ if (srv_use_awe) {
+ /*----------------------------------------*/
+ /* Map an initial part of the allocated physical memory to
+ the window */
+
+ os_awe_map_physical_mem_to_window(buf_pool->frame_zero,
+ n_frames *
+ (UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE),
+ buf_pool->awe_info);
+ /*----------------------------------------*/
+ }
- buf_pool->high_end = frame + UNIV_PAGE_SIZE * curr_size;
+ buf_pool->blocks_of_frames = ut_malloc(sizeof(void*) * n_frames);
+
+ if (buf_pool->blocks_of_frames == NULL) {
+
+ return(NULL);
+ }
+
+ /* Init block structs and assign frames for them; in the case of
+ AWE there are fewer frames than blocks. Then we assign the frames
+ to the first blocks (we already mapped the memory above). We also
+ init the awe_info for every block. */
- /* Init block structs and assign frames for them */
for (i = 0; i < max_size; i++) {
block = buf_pool_get_nth_block(buf_pool, i);
+
+ if (i < n_frames) {
+ frame = buf_pool->frame_zero + i * UNIV_PAGE_SIZE;
+ *(buf_pool->blocks_of_frames + i) = block;
+ } else {
+ frame = NULL;
+ }
+
buf_block_init(block, frame);
- frame = frame + UNIV_PAGE_SIZE;
+
+ if (srv_use_awe) {
+ /*----------------------------------------*/
+ block->awe_info = buf_pool->awe_info
+ + i * (UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE);
+ /*----------------------------------------*/
+ }
}
-
+
buf_pool->page_hash = hash_create(2 * max_size);
buf_pool->n_pend_reads = 0;
@@ -522,12 +637,14 @@ buf_pool_create(
buf_pool->n_pages_read = 0;
buf_pool->n_pages_written = 0;
buf_pool->n_pages_created = 0;
-
+ buf_pool->n_pages_awe_remapped = 0;
+
buf_pool->n_page_gets = 0;
buf_pool->n_page_gets_old = 0;
buf_pool->n_pages_read_old = 0;
buf_pool->n_pages_written_old = 0;
buf_pool->n_pages_created_old = 0;
+ buf_pool->n_pages_awe_remapped_old = 0;
/* 2. Initialize flushing fields
---------------------------- */
@@ -550,37 +667,122 @@ buf_pool_create(
buf_pool->LRU_old = NULL;
+ UT_LIST_INIT(buf_pool->awe_LRU_free_mapped);
+
/* Add control blocks to the free list */
UT_LIST_INIT(buf_pool->free);
+
for (i = 0; i < curr_size; i++) {
block = buf_pool_get_nth_block(buf_pool, i);
- UT_LIST_ADD_FIRST(free, buf_pool->free, block);
+ if (block->frame) {
+ /* Wipe contents of frame to eliminate a Purify
+ warning */
+
+#ifdef HAVE_purify
+ memset(block->frame, '\0', UNIV_PAGE_SIZE);
+#endif
+ if (srv_use_awe) {
+ /* Add to the list of blocks mapped to
+ frames */
+
+ UT_LIST_ADD_LAST(awe_LRU_free_mapped,
+ buf_pool->awe_LRU_free_mapped, block);
+ }
+ }
+
+ UT_LIST_ADD_LAST(free, buf_pool->free, block);
+ block->in_free_list = TRUE;
}
mutex_exit(&(buf_pool->mutex));
- btr_search_sys_create(curr_size * UNIV_PAGE_SIZE / sizeof(void*) / 64);
+ if (srv_use_adaptive_hash_indexes) {
+ btr_search_sys_create(
+ curr_size * UNIV_PAGE_SIZE / sizeof(void*) / 64);
+ } else {
+ /* Create only a small dummy system */
+ btr_search_sys_create(1000);
+ }
return(buf_pool);
}
/************************************************************************
-Initializes the buffer buf_pool of the database. */
+Maps the page of block to a frame, if not mapped yet. Unmaps some page
+from the end of the awe_LRU_free_mapped. */
void
-buf_pool_init(
-/*==========*/
- ulint max_size, /* in: maximum size of the buf_pool in blocks */
- ulint curr_size) /* in: current size to use, must be <=
- max_size */
+buf_awe_map_page_to_frame(
+/*======================*/
+ buf_block_t* block, /* in: block whose page should be
+ mapped to a frame */
+ ibool add_to_mapped_list) /* in: TRUE if, in the case
+ we need to map the page, we should
+ also add the block to the
+ awe_LRU_free_mapped list */
{
- ut_a(buf_pool == NULL);
+ buf_block_t* bck;
- buf_pool_create(max_size, curr_size);
+ ut_ad(mutex_own(&(buf_pool->mutex)));
+ ut_ad(block);
+
+ if (block->frame) {
+
+ return;
+ }
- ut_ad(buf_validate());
+ /* Scan awe_LRU_free_mapped from the end and try to find a block
+ which is not bufferfixed or io-fixed */
+
+ bck = UT_LIST_GET_LAST(buf_pool->awe_LRU_free_mapped);
+
+ while (bck) {
+ if (bck->state == BUF_BLOCK_FILE_PAGE
+ && (bck->buf_fix_count != 0 || bck->io_fix != 0)) {
+
+ /* We have to skip this */
+ bck = UT_LIST_GET_PREV(awe_LRU_free_mapped, bck);
+ } else {
+ /* We can map block to the frame of bck */
+
+ os_awe_map_physical_mem_to_window(
+ bck->frame,
+ UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE,
+ block->awe_info);
+
+ block->frame = bck->frame;
+
+ *(buf_pool->blocks_of_frames
+ + (((ulint)(block->frame
+ - buf_pool->frame_zero))
+ >> UNIV_PAGE_SIZE_SHIFT))
+ = block;
+
+ bck->frame = NULL;
+ UT_LIST_REMOVE(awe_LRU_free_mapped,
+ buf_pool->awe_LRU_free_mapped,
+ bck);
+
+ if (add_to_mapped_list) {
+ UT_LIST_ADD_FIRST(awe_LRU_free_mapped,
+ buf_pool->awe_LRU_free_mapped,
+ block);
+ }
+
+ buf_pool->n_pages_awe_remapped++;
+
+ return;
+ }
+ }
+
+ fprintf(stderr,
+"InnoDB: AWE: Fatal error: cannot find a page to unmap\n"
+"InnoDB: awe_LRU_free_mapped list length %lu\n",
+ (ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped));
+
+ ut_a(0);
}
/************************************************************************
@@ -589,7 +791,9 @@ UNIV_INLINE
buf_block_t*
buf_block_alloc(void)
/*=================*/
- /* out, own: the allocated block */
+ /* out, own: the allocated block; also if AWE
+ is used it is guaranteed that the page is
+ mapped to a frame */
{
buf_block_t* block;
@@ -633,7 +837,7 @@ buf_page_make_young(
block = buf_block_align(frame);
- ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
buf_LRU_make_block_young(block);
@@ -648,7 +852,7 @@ buf_block_free(
/*===========*/
buf_block_t* block) /* in, own: block to be freed */
{
- ut_ad(block->state != BUF_BLOCK_FILE_PAGE);
+ ut_a(block->state != BUF_BLOCK_FILE_PAGE);
mutex_enter(&(buf_pool->mutex));
@@ -912,6 +1116,8 @@ loop:
goto loop;
}
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
must_read = FALSE;
if (block->io_fix == BUF_IO_READ) {
@@ -927,6 +1133,19 @@ loop:
}
}
+ /* If AWE is enabled and the page is not mapped to a frame, then
+ map it */
+
+ if (block->frame == NULL) {
+ ut_a(srv_use_awe);
+
+ /* We set second parameter TRUE because the block is in the
+ LRU list and we must put it to awe_LRU_free_mapped list once
+ mapped to a frame */
+
+ buf_awe_map_page_to_frame(block, TRUE);
+ }
+
#ifdef UNIV_SYNC_DEBUG
buf_block_buf_fix_inc_debug(block, file, line);
#else
@@ -981,8 +1200,26 @@ loop:
} else if (rw_latch == RW_NO_LATCH) {
if (must_read) {
- rw_lock_x_lock(&(block->read_lock));
- rw_lock_x_unlock(&(block->read_lock));
+ /* Let us wait until the read operation
+ completes */
+
+ for (;;) {
+ mutex_enter(&(buf_pool->mutex));
+
+ if (block->io_fix == BUF_IO_READ) {
+
+ mutex_exit(&(buf_pool->mutex));
+
+ /* Sleep 20 milliseconds */
+
+ os_thread_sleep(20000);
+ } else {
+
+ mutex_exit(&(buf_pool->mutex));
+
+ break;
+ }
+ }
}
fix_type = MTR_MEMO_BUF_FIX;
@@ -1021,28 +1258,27 @@ buf_page_optimistic_get_func(
/*=========================*/
/* out: TRUE if success */
ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */
- buf_frame_t* guess, /* in: guessed frame */
+ buf_block_t* block, /* in: guessed buffer block */
+ buf_frame_t* guess, /* in: guessed frame; note that AWE may move
+ frames */
dulint modify_clock,/* in: modify clock value if mode is
..._GUESS_ON_CLOCK */
char* file, /* in: file name */
ulint line, /* in: line where called */
mtr_t* mtr) /* in: mini-transaction */
{
- buf_block_t* block;
ibool accessed;
ibool success;
ulint fix_type;
- ut_ad(mtr && guess);
+ ut_ad(mtr && block);
ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
-
- buf_pool->n_page_gets++;
-
- block = buf_block_align(guess);
mutex_enter(&(buf_pool->mutex));
- if (block->state != BUF_BLOCK_FILE_PAGE) {
+ /* If AWE is used, block may have a different frame now, e.g., NULL */
+
+ if (block->state != BUF_BLOCK_FILE_PAGE || block->frame != guess) {
mutex_exit(&(buf_pool->mutex));
@@ -1135,12 +1371,15 @@ buf_page_optimistic_get_func(
#ifdef UNIV_IBUF_DEBUG
ut_a(ibuf_count_get(block->space, block->offset) == 0);
#endif
+ buf_pool->n_page_gets++;
+
return(TRUE);
}
/************************************************************************
This is used to get access to a known database page, when no waiting can be
-done. */
+done. For example, if a search in an adaptive hash index leads us to this
+frame. */
ibool
buf_page_get_known_nowait(
@@ -1159,13 +1398,11 @@ buf_page_get_known_nowait(
ut_ad(mtr);
ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
-
- buf_pool->n_page_gets++;
-
- block = buf_block_align(guess);
mutex_enter(&(buf_pool->mutex));
+ block = buf_block_align(guess);
+
if (block->state == BUF_BLOCK_REMOVE_HASH) {
/* Another thread is just freeing the block from the LRU list
of the buffer pool: do not try to access this page; this
@@ -1179,6 +1416,8 @@ buf_page_get_known_nowait(
return(FALSE);
}
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
#ifdef UNIV_SYNC_DEBUG
buf_block_buf_fix_inc_debug(block, file, line);
#else
@@ -1233,6 +1472,8 @@ buf_page_get_known_nowait(
ut_a((mode == BUF_KEEP_OLD)
|| (ibuf_count_get(block->space, block->offset) == 0));
#endif
+ buf_pool->n_page_gets++;
+
return(TRUE);
}
@@ -1289,7 +1530,7 @@ buf_page_init(
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
- ut_ad(block->state == BUF_BLOCK_READY_FOR_USE);
+ ut_a(block->state != BUF_BLOCK_FILE_PAGE);
/* Set the state of the block */
block->magic_n = BUF_BLOCK_MAGIC_N;
@@ -1305,6 +1546,19 @@ buf_page_init(
/* Insert into the hash table of file pages */
+ if (buf_page_hash_get(space, offset)) {
+ fprintf(stderr,
+"InnoDB: Error: page %lu %lu already found from the hash table\n",
+ (ulong) space,
+ (ulong) offset);
+ buf_print();
+ buf_LRU_print();
+ buf_validate();
+ buf_LRU_validate();
+
+ ut_a(0);
+ }
+
HASH_INSERT(buf_block_t, hash, buf_pool->page_hash,
buf_page_address_fold(space, offset), block);
@@ -1328,25 +1582,35 @@ buf_page_init(
/************************************************************************
Function which inits a page for read to the buffer buf_pool. If the page is
-already in buf_pool, does nothing. Sets the io_fix flag to BUF_IO_READ and
-sets a non-recursive exclusive lock on the buffer frame. The io-handler must
-take care that the flag is cleared and the lock released later. This is one
-of the functions which perform the state transition NOT_USED => FILE_PAGE to
-a block (the other is buf_page_create). */
+(1) already in buf_pool, or
+(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
+(3) if the space is deleted or being deleted,
+then this function does nothing.
+Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
+on the buffer frame. The io-handler must take care that the flag is cleared
+and the lock released later. This is one of the functions which perform the
+state transition NOT_USED => FILE_PAGE to a block (the other is
+buf_page_create). */
buf_block_t*
buf_page_init_for_read(
/*===================*/
- /* out: pointer to the block or NULL */
- ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... */
- ulint space, /* in: space id */
- ulint offset) /* in: page number */
+ /* out: pointer to the block or NULL */
+ ulint* err, /* out: DB_SUCCESS or DB_TABLESPACE_DELETED */
+ ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... */
+ ulint space, /* in: space id */
+ ib_longlong tablespace_version,/* in: prevents reading from a wrong
+ version of the tablespace in case we have done
+ DISCARD + IMPORT */
+ ulint offset) /* in: page number */
{
buf_block_t* block;
mtr_t mtr;
-
+
ut_ad(buf_pool);
+ *err = DB_SUCCESS;
+
if (mode == BUF_READ_IBUF_PAGES_ONLY) {
/* It is a read-ahead within an ibuf routine */
@@ -1367,13 +1631,20 @@ buf_page_init_for_read(
block = buf_block_alloc();
- ut_ad(block);
+ ut_a(block);
mutex_enter(&(buf_pool->mutex));
-
- if (NULL != buf_page_hash_get(space, offset)) {
- /* The page is already in buf_pool, return */
+ if (fil_tablespace_deleted_or_being_deleted_in_mem(space,
+ tablespace_version)) {
+ *err = DB_TABLESPACE_DELETED;
+ }
+
+ if (*err == DB_TABLESPACE_DELETED
+ || NULL != buf_page_hash_get(space, offset)) {
+
+ /* The page belongs to a space which has been deleted or is
+ being deleted, or the page is already in buf_pool, return */
mutex_exit(&(buf_pool->mutex));
buf_block_free(block);
@@ -1405,8 +1676,6 @@ buf_page_init_for_read(
is completed. The x-lock is cleared by the io-handler thread. */
rw_lock_x_lock_gen(&(block->lock), BUF_IO_READ);
-
- rw_lock_x_lock_gen(&(block->read_lock), BUF_IO_READ);
mutex_exit(&(buf_pool->mutex));
@@ -1464,8 +1733,9 @@ buf_page_create(
/* If we get here, the page was not in buf_pool: init it there */
if (buf_debug_prints) {
- printf("Creating space %lu page %lu to buffer\n", space,
- offset);
+ printf("Creating space %lu page %lu to buffer\n",
+ (ulong) space,
+ (ulong) offset);
}
block = free_block;
@@ -1491,7 +1761,7 @@ buf_page_create(
/* Delete possible entries for the page from the insert buffer:
such can exist if the page belonged to an index which was dropped */
- ibuf_merge_or_delete_for_page(NULL, space, offset);
+ ibuf_merge_or_delete_for_page(NULL, space, offset, TRUE);
/* Flush pages from the end of the LRU list if necessary */
buf_flush_free_margin();
@@ -1526,6 +1796,8 @@ buf_page_io_complete(
ut_ad(block);
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
io_type = block->io_fix;
if (io_type == BUF_IO_READ) {
@@ -1541,7 +1813,7 @@ buf_page_io_complete(
fprintf(stderr,
"InnoDB: Error: page n:o stored in the page read in is %lu, should be %lu!\n",
- read_page_no, block->offset);
+ (ulong) read_page_no, (ulong) block->offset);
}
/* From version 3.23.38 up we store the page checksum
to the 4 first bytes of the page end lsn field */
@@ -1549,7 +1821,7 @@ buf_page_io_complete(
if (buf_page_is_corrupted(block->frame)) {
fprintf(stderr,
"InnoDB: Database page corruption on disk or a failed\n"
- "InnoDB: file read of page %lu.\n", block->offset);
+ "InnoDB: file read of page %lu.\n", (ulong) block->offset);
fprintf(stderr,
"InnoDB: You may have to recover from a backup.\n");
@@ -1558,7 +1830,7 @@ buf_page_io_complete(
fprintf(stderr,
"InnoDB: Database page corruption on disk or a failed\n"
- "InnoDB: file read of page %lu.\n", block->offset);
+ "InnoDB: file read of page %lu.\n", (ulong) block->offset);
fprintf(stderr,
"InnoDB: You may have to recover from a backup.\n");
fprintf(stderr,
@@ -1589,7 +1861,7 @@ buf_page_io_complete(
if (!recv_no_ibuf_operations) {
ibuf_merge_or_delete_for_page(block->frame,
- block->space, block->offset);
+ block->space, block->offset, TRUE);
}
}
@@ -1614,9 +1886,7 @@ buf_page_io_complete(
buf_pool->n_pend_reads--;
buf_pool->n_pages_read++;
-
rw_lock_x_unlock_gen(&(block->lock), BUF_IO_READ);
- rw_lock_x_unlock_gen(&(block->read_lock), BUF_IO_READ);
if (buf_debug_prints) {
printf("Has read ");
@@ -1641,8 +1911,8 @@ buf_page_io_complete(
mutex_exit(&(buf_pool->mutex));
if (buf_debug_prints) {
- printf("page space %lu page no %lu", block->space,
- block->offset);
+ printf("page space %lu page no %lu", (ulong) block->space,
+ (ulong) block->offset);
id = btr_page_get_index_id(block->frame);
index = NULL;
@@ -1757,14 +2027,16 @@ buf_validate(void)
}
if (n_lru + n_free > buf_pool->curr_size) {
- printf("n LRU %lu, n free %lu\n", n_lru, n_free);
+ printf("n LRU %lu, n free %lu\n", (ulong) n_lru,
+ (ulong) n_free);
ut_error;
}
ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru);
if (UT_LIST_GET_LEN(buf_pool->free) != n_free) {
printf("Free list len %lu, free blocks %lu\n",
- UT_LIST_GET_LEN(buf_pool->free), n_free);
+ (ulong) UT_LIST_GET_LEN(buf_pool->free),
+ (ulong) n_free);
ut_error;
}
ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
@@ -1800,29 +2072,30 @@ buf_print(void)
ut_ad(buf_pool);
- size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE;
+ size = buf_pool->curr_size;
index_ids = mem_alloc(sizeof(dulint) * size);
counts = mem_alloc(sizeof(ulint) * size);
mutex_enter(&(buf_pool->mutex));
- printf("buf_pool size %lu \n", size);
- printf("database pages %lu \n", UT_LIST_GET_LEN(buf_pool->LRU));
- printf("free pages %lu \n", UT_LIST_GET_LEN(buf_pool->free));
+ printf("buf_pool size %lu \n", (ulong) size);
+ printf("database pages %lu \n", (ulong) UT_LIST_GET_LEN(buf_pool->LRU));
+ printf("free pages %lu \n", (ulong) UT_LIST_GET_LEN(buf_pool->free));
printf("modified database pages %lu \n",
- UT_LIST_GET_LEN(buf_pool->flush_list));
+ (ulong) UT_LIST_GET_LEN(buf_pool->flush_list));
- printf("n pending reads %lu \n", buf_pool->n_pend_reads);
+ printf("n pending reads %lu \n", (ulong) buf_pool->n_pend_reads);
printf("n pending flush LRU %lu list %lu single page %lu\n",
- buf_pool->n_flush[BUF_FLUSH_LRU],
- buf_pool->n_flush[BUF_FLUSH_LIST],
- buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
+ (ulong) buf_pool->n_flush[BUF_FLUSH_LRU],
+ (ulong) buf_pool->n_flush[BUF_FLUSH_LIST],
+ (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
printf("pages read %lu, created %lu, written %lu\n",
- buf_pool->n_pages_read, buf_pool->n_pages_created,
- buf_pool->n_pages_written);
+ (ulong) buf_pool->n_pages_read,
+ (ulong) buf_pool->n_pages_created,
+ (ulong) buf_pool->n_pages_written);
/* Count the number of blocks belonging to each index in the buffer */
@@ -1866,7 +2139,8 @@ buf_print(void)
index = dict_index_get_if_in_cache(index_ids[i]);
printf("Block count for index %lu in buffer is about %lu",
- ut_dulint_get_low(index_ids[i]), counts[i]);
+ (ulong) ut_dulint_get_low(index_ids[i]),
+ (ulong) counts[i]);
if (index) {
printf(" index name %s table %s", index->name,
@@ -1938,35 +2212,44 @@ buf_print_io(
return;
}
- size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE;
+ size = buf_pool->curr_size;
mutex_enter(&(buf_pool->mutex));
buf += sprintf(buf,
- "Buffer pool size %lu\n", size);
+ "Buffer pool size %lu\n", (ulong) size);
buf += sprintf(buf,
- "Free buffers %lu\n", UT_LIST_GET_LEN(buf_pool->free));
+ "Free buffers %lu\n", (ulong) UT_LIST_GET_LEN(buf_pool->free));
buf += sprintf(buf,
- "Database pages %lu\n", UT_LIST_GET_LEN(buf_pool->LRU));
+ "Database pages %lu\n", (ulong) UT_LIST_GET_LEN(buf_pool->LRU));
/*
buf += sprintf(buf,
- "Lock heap buffers %lu\n", buf_pool->n_lock_heap_pages);
+ "Lock heap buffers %lu\n", (ulong) buf_pool->n_lock_heap_pages);
buf += sprintf(buf,
- "Hash index buffers %lu\n", buf_pool->n_adaptive_hash_pages);
+ "Hash index buffers %lu\n", (ulong) buf_pool->n_adaptive_hash_pages);
*/
buf += sprintf(buf,
"Modified db pages %lu\n",
- UT_LIST_GET_LEN(buf_pool->flush_list));
+ (ulong) UT_LIST_GET_LEN(buf_pool->flush_list));
+ if (srv_use_awe) {
+ buf += sprintf(buf,
+ "AWE: Buffer pool memory frames %lu\n",
+ (ulong) buf_pool->n_frames);
+
+ buf += sprintf(buf,
+ "AWE: Database pages and free buffers mapped in frames %lu\n",
+ (ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped));
+ }
- buf += sprintf(buf, "Pending reads %lu \n", buf_pool->n_pend_reads);
+ buf += sprintf(buf, "Pending reads %lu \n", (ulong) buf_pool->n_pend_reads);
buf += sprintf(buf,
"Pending writes: LRU %lu, flush list %lu, single page %lu\n",
- buf_pool->n_flush[BUF_FLUSH_LRU]
- + buf_pool->init_flush[BUF_FLUSH_LRU],
- buf_pool->n_flush[BUF_FLUSH_LIST]
- + buf_pool->init_flush[BUF_FLUSH_LIST],
- buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
+ (ulong) (buf_pool->n_flush[BUF_FLUSH_LRU]
+ + buf_pool->init_flush[BUF_FLUSH_LRU]),
+ (ulong) (buf_pool->n_flush[BUF_FLUSH_LIST]
+ + buf_pool->init_flush[BUF_FLUSH_LIST]),
+ (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
current_time = time(NULL);
time_elapsed = 0.001 + difftime(current_time,
@@ -1974,8 +2257,9 @@ buf_print_io(
buf_pool->last_printout_time = current_time;
buf += sprintf(buf, "Pages read %lu, created %lu, written %lu\n",
- buf_pool->n_pages_read, buf_pool->n_pages_created,
- buf_pool->n_pages_written);
+ (ulong) buf_pool->n_pages_read,
+ (ulong) buf_pool->n_pages_created,
+ (ulong) buf_pool->n_pages_written);
buf += sprintf(buf, "%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
(buf_pool->n_pages_read - buf_pool->n_pages_read_old)
/ time_elapsed,
@@ -1984,12 +2268,19 @@ buf_print_io(
(buf_pool->n_pages_written - buf_pool->n_pages_written_old)
/ time_elapsed);
+ if (srv_use_awe) {
+ buf += sprintf(buf, "AWE: %.2f page remaps/s\n",
+ (buf_pool->n_pages_awe_remapped
+ - buf_pool->n_pages_awe_remapped_old)
+ / time_elapsed);
+ }
+
if (buf_pool->n_page_gets > buf_pool->n_page_gets_old) {
buf += sprintf(buf, "Buffer pool hit rate %lu / 1000\n",
- 1000
+ (ulong) (1000
- ((1000 *
(buf_pool->n_pages_read - buf_pool->n_pages_read_old))
- / (buf_pool->n_page_gets - buf_pool->n_page_gets_old)));
+ / (buf_pool->n_page_gets - buf_pool->n_page_gets_old))));
} else {
buf += sprintf(buf,
"No buffer pool page gets since the last printout\n");
@@ -1999,6 +2290,7 @@ buf_print_io(
buf_pool->n_pages_read_old = buf_pool->n_pages_read;
buf_pool->n_pages_created_old = buf_pool->n_pages_created;
buf_pool->n_pages_written_old = buf_pool->n_pages_written;
+ buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped;
mutex_exit(&(buf_pool->mutex));
}
@@ -2015,6 +2307,7 @@ buf_refresh_io_stats(void)
buf_pool->n_pages_read_old = buf_pool->n_pages_read;
buf_pool->n_pages_created_old = buf_pool->n_pages_created;
buf_pool->n_pages_written_old = buf_pool->n_pages_written;
+ buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped;
}
/*************************************************************************
@@ -2039,7 +2332,7 @@ buf_all_freed(void)
if (!buf_flush_ready_for_replace(block)) {
- /* printf("Page %lu %lu still fixed or dirty\n",
+ /* printf("Page %lu %lu still fixed or dirty\n",
block->space, block->offset); */
ut_error;
}
diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c
index 66c9bb605dc..c568d5925fa 100644
--- a/innobase/buf/buf0flu.c
+++ b/innobase/buf/buf0flu.c
@@ -24,6 +24,7 @@ Created 11/11/1995 Heikki Tuuri
#include "log0log.h"
#include "os0file.h"
#include "trx0sys.h"
+#include "srv0srv.h"
/* When flushed, dirty blocks are searched in neigborhoods of this size, and
flushed along with the original page. */
@@ -51,6 +52,8 @@ buf_flush_insert_into_flush_list(
ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
|| (ut_dulint_cmp(
(UT_LIST_GET_FIRST(buf_pool->flush_list))
@@ -107,7 +110,7 @@ buf_flush_ready_for_replace(
/*========================*/
/* out: TRUE if can replace immediately */
buf_block_t* block) /* in: buffer control block, must be in state
- BUF_BLOCK_FILE_PAGE and in the LRU list*/
+ BUF_BLOCK_FILE_PAGE and in the LRU list */
{
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&(buf_pool->mutex)));
@@ -138,11 +141,10 @@ buf_flush_ready_for_flush(
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
- ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0)
&& (block->io_fix == 0)) {
-
if (flush_type != BUF_FLUSH_LRU) {
return(TRUE);
@@ -172,6 +174,8 @@ buf_flush_write_complete(
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
block->oldest_modification = ut_dulint_zero;
UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block);
@@ -251,7 +255,7 @@ buf_flush_buffered_writes(void)
"InnoDB: to be written to data file. We intentionally crash server\n"
"InnoDB: to prevent corrupt data from ending up in data\n"
"InnoDB: files.\n",
- block->offset, block->space);
+ (ulong) block->offset, (ulong) block->space);
ut_error;
}
@@ -291,6 +295,8 @@ buf_flush_buffered_writes(void)
for (i = 0; i < trx_doublewrite->first_free; i++) {
block = trx_doublewrite->buf_block_arr[i];
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
(void*)block->frame, (void*)block);
@@ -330,6 +336,8 @@ buf_flush_post_to_doublewrite_buf(
try_again:
mutex_enter(&(trx_doublewrite->mutex));
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
if (trx_doublewrite->first_free
>= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
mutex_exit(&(trx_doublewrite->mutex));
@@ -370,16 +378,15 @@ buf_flush_init_for_writing(
ulint space, /* in: space id */
ulint page_no) /* in: page number */
{
- UT_NOT_USED(space);
-
/* Write the newest modification lsn to the page header and trailer */
mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
newest_lsn);
- /* Write the page number */
+ /* Write the page number and the space id */
mach_write_to_4(page + FIL_PAGE_OFFSET, page_no);
+ mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space);
/* Store the new formula checksum */
@@ -405,6 +412,8 @@ buf_flush_write_block_low(
/*======================*/
buf_block_t* block) /* in: buffer block to write */
{
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
#ifdef UNIV_IBUF_DEBUG
ut_a(ibuf_count_get(block->space, block->offset) == 0);
#endif
@@ -453,12 +462,26 @@ buf_flush_try_page(
block = buf_page_hash_get(space, offset);
- ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);
if (flush_type == BUF_FLUSH_LIST
&& block && buf_flush_ready_for_flush(block, flush_type)) {
block->io_fix = BUF_IO_WRITE;
+
+ /* If AWE is enabled and the page is not mapped to a frame,
+ then map it */
+
+ if (block->frame == NULL) {
+ ut_a(srv_use_awe);
+
+ /* We set second parameter TRUE because the block is
+ in the LRU list and we must put it to
+ awe_LRU_free_mapped list once mapped to a frame */
+
+ buf_awe_map_page_to_frame(block, TRUE);
+ }
+
block->flush_type = flush_type;
if (buf_pool->n_flush[flush_type] == 0) {
@@ -490,7 +513,8 @@ buf_flush_try_page(
if (buf_debug_prints) {
printf("Flushing page space %lu, page no %lu \n",
- block->space, block->offset);
+ (ulong) block->space,
+ (ulong) block->offset);
}
buf_flush_write_block_low(block);
@@ -509,6 +533,20 @@ buf_flush_try_page(
..._ready_for_flush). */
block->io_fix = BUF_IO_WRITE;
+
+ /* If AWE is enabled and the page is not mapped to a frame,
+ then map it */
+
+ if (block->frame == NULL) {
+ ut_a(srv_use_awe);
+
+ /* We set second parameter TRUE because the block is
+ in the LRU list and we must put it to
+ awe_LRU_free_mapped list once mapped to a frame */
+
+ buf_awe_map_page_to_frame(block, TRUE);
+ }
+
block->flush_type = flush_type;
if (buf_pool->n_flush[flush_type] == 0) {
@@ -534,6 +572,20 @@ buf_flush_try_page(
&& buf_flush_ready_for_flush(block, flush_type)) {
block->io_fix = BUF_IO_WRITE;
+
+ /* If AWE is enabled and the page is not mapped to a frame,
+ then map it */
+
+ if (block->frame == NULL) {
+ ut_a(srv_use_awe);
+
+ /* We set second parameter TRUE because the block is
+ in the LRU list and we must put it to
+ awe_LRU_free_mapped list once mapped to a frame */
+
+ buf_awe_map_page_to_frame(block, TRUE);
+ }
+
block->flush_type = flush_type;
if (buf_pool->n_flush[block->flush_type] == 0) {
@@ -550,7 +602,8 @@ buf_flush_try_page(
if (buf_debug_prints) {
printf(
"Flushing single page space %lu, page no %lu \n",
- block->space, block->offset);
+ (ulong) block->space,
+ (ulong) block->offset);
}
buf_flush_write_block_low(block);
@@ -603,6 +656,7 @@ buf_flush_try_neighbors(
for (i = low; i < high; i++) {
block = buf_page_hash_get(space, i);
+ ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);
if (block && flush_type == BUF_FLUSH_LRU && i != offset
&& !block->old) {
@@ -671,10 +725,10 @@ buf_flush_batch(
ulint offset;
ibool found;
- ut_ad((flush_type == BUF_FLUSH_LRU) || (flush_type == BUF_FLUSH_LIST));
- ut_ad((flush_type != BUF_FLUSH_LIST) ||
- sync_thread_levels_empty_gen(TRUE));
-
+ ut_ad((flush_type == BUF_FLUSH_LRU)
+ || (flush_type == BUF_FLUSH_LIST));
+ ut_ad((flush_type != BUF_FLUSH_LIST)
+ || sync_thread_levels_empty_gen(TRUE));
mutex_enter(&(buf_pool->mutex));
if ((buf_pool->n_flush[flush_type] > 0)
@@ -705,7 +759,6 @@ buf_flush_batch(
ut_ad(flush_type == BUF_FLUSH_LIST);
block = UT_LIST_GET_LAST(buf_pool->flush_list);
-
if (!block
|| (ut_dulint_cmp(block->oldest_modification,
lsn_limit) >= 0)) {
@@ -724,6 +777,7 @@ buf_flush_batch(
function a pointer to a block in the list! */
while ((block != NULL) && !found) {
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
if (buf_flush_ready_for_flush(block, flush_type)) {
@@ -749,7 +803,6 @@ buf_flush_batch(
} else if (flush_type == BUF_FLUSH_LRU) {
block = UT_LIST_GET_PREV(LRU, block);
-
} else {
ut_ad(flush_type == BUF_FLUSH_LIST);
@@ -781,10 +834,10 @@ buf_flush_batch(
if (buf_debug_prints && page_count > 0) {
if (flush_type == BUF_FLUSH_LRU) {
printf("Flushed %lu pages in LRU flush\n",
- page_count);
+ (ulong) page_count);
} else if (flush_type == BUF_FLUSH_LIST) {
printf("Flushed %lu pages in flush list flush\n",
- page_count);
+ (ulong) page_count);
} else {
ut_error;
}
diff --git a/innobase/buf/buf0lru.c b/innobase/buf/buf0lru.c
index 0128ee87871..c5faec17890 100644
--- a/innobase/buf/buf0lru.c
+++ b/innobase/buf/buf0lru.c
@@ -62,6 +62,90 @@ buf_LRU_block_free_hashed_page(
be in a state where it can be freed */
/**********************************************************************
+Invalidates all pages belonging to a given tablespace when we are deleting
+the data file(s) of that tablespace. */
+
+void
+buf_LRU_invalidate_tablespace(
+/*==========================*/
+	ulint	id)	/* in: space id */
+{
+	buf_block_t*	block;
+	buf_block_t*	prev_block;
+	ulint		page_no;
+	ibool		all_freed;
+
+	/* The LRU list is scanned from its tail while holding
+	buf_pool->mutex. Pages of the space which are buffer-fixed or
+	io-fixed are left in place during a pass; the whole scan is then
+	repeated after a short sleep, until a pass finds no such page. */
+
+scan_again:
+	mutex_enter(&(buf_pool->mutex));
+
+	all_freed = TRUE;
+
+	block = UT_LIST_GET_LAST(buf_pool->LRU);
+
+	while (block != NULL) {
+		ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
+		/* Read the predecessor now: the block may be removed from
+		the LRU list below, and the list pointers of a removed block
+		must not be used to continue the scan */
+
+		prev_block = UT_LIST_GET_PREV(LRU, block);
+
+		if (block->space == id
+		    && (block->buf_fix_count > 0 || block->io_fix != 0)) {
+
+			/* We cannot remove this page during this scan yet;
+			maybe the system is currently reading it in, or
+			flushing the modifications to the file */
+
+			all_freed = FALSE;
+
+			goto next_page;
+		}
+
+		if (block->space == id) {
+			if (buf_debug_prints) {
+				printf(
+				"Dropping space %lu page %lu\n",
+					(ulong) block->space,
+					(ulong) block->offset);
+			}
+
+			if (block->is_hashed) {
+				page_no = block->offset;
+
+				mutex_exit(&(buf_pool->mutex));
+
+				/* Note that the following call will acquire
+				an S-latch on the page. Because the mutex was
+				released, the list may have changed, and we
+				restart the scan from the tail. */
+
+				btr_search_drop_page_hash_when_freed(id,
+								page_no);
+				goto scan_again;
+			}
+
+			if (0 != ut_dulint_cmp(block->oldest_modification,
+						ut_dulint_zero)) {
+
+				/* Remove from the flush list of modified
+				blocks */
+				block->oldest_modification = ut_dulint_zero;
+
+				UT_LIST_REMOVE(flush_list,
+					buf_pool->flush_list, block);
+			}
+
+			/* Remove from the LRU list */
+			buf_LRU_block_remove_hashed_page(block);
+			buf_LRU_block_free_hashed_page(block);
+		}
+next_page:
+		block = prev_block;
+	}
+
+	mutex_exit(&(buf_pool->mutex));
+
+	if (!all_freed) {
+		/* Sleep 20 milliseconds to let the pending fixes or i/o
+		on the skipped pages complete, then scan again */
+
+		os_thread_sleep(20000);
+
+		goto scan_again;
+	}
+}
+
+/**********************************************************************
Gets the minimum LRU_position field for the blocks in an initial segment
(determined by BUF_LRU_INITIAL_RATIO) of the LRU list. The limit is not
guaranteed to be precise, because the ulint_clock may wrap around. */
@@ -118,43 +202,43 @@ buf_LRU_search_and_free_block(
mutex_enter(&(buf_pool->mutex));
freed = FALSE;
-
block = UT_LIST_GET_LAST(buf_pool->LRU);
while (block != NULL) {
-
+ ut_a(block->in_LRU_list);
if (buf_flush_ready_for_replace(block)) {
-
if (buf_debug_prints) {
printf(
"Putting space %lu page %lu to free list\n",
- block->space, block->offset);
+ (ulong) block->space,
+ (ulong) block->offset);
}
-
buf_LRU_block_remove_hashed_page(block);
mutex_exit(&(buf_pool->mutex));
- btr_search_drop_page_hash_index(block->frame);
-
+ /* Remove possible adaptive hash index built on the
+ page; in the case of AWE the block may not have a
+ frame at all */
+
+ if (block->frame) {
+ btr_search_drop_page_hash_index(block->frame);
+ }
mutex_enter(&(buf_pool->mutex));
ut_a(block->buf_fix_count == 0);
buf_LRU_block_free_hashed_page(block);
-
freed = TRUE;
break;
}
-
block = UT_LIST_GET_PREV(LRU, block);
distance++;
if (!freed && n_iterations <= 10
&& distance > 100 + (n_iterations * buf_pool->curr_size)
/ 10) {
-
buf_pool->LRU_flush_ended = 0;
mutex_exit(&(buf_pool->mutex));
@@ -162,15 +246,12 @@ buf_LRU_search_and_free_block(
return(FALSE);
}
}
-
if (buf_pool->LRU_flush_ended > 0) {
buf_pool->LRU_flush_ended--;
}
-
- if (!freed) {
+ if (!freed) {
buf_pool->LRU_flush_ended = 0;
}
-
mutex_exit(&(buf_pool->mutex));
return(freed);
@@ -211,7 +292,9 @@ list. */
buf_block_t*
buf_LRU_get_free_block(void)
/*========================*/
- /* out: the free control block */
+ /* out: the free control block; also if AWE is
+ used, it is guaranteed that the block has its
+ page mapped to a frame when we return */
{
buf_block_t* block = NULL;
ibool freed;
@@ -254,7 +337,7 @@ loop:
"InnoDB: the buffer pool bigger?\n"
"InnoDB: Starting the InnoDB Monitor to print diagnostics, including\n"
"InnoDB: lock heap and hash index sizes.\n",
- (ulong)(buf_pool->curr_size / (1024 * 1024 / UNIV_PAGE_SIZE)));
+ (ulong) (buf_pool->curr_size / (1024 * 1024 / UNIV_PAGE_SIZE)));
srv_print_innodb_monitor = TRUE;
os_event_set(srv_lock_timeout_thread_event);
@@ -273,7 +356,27 @@ loop:
if (UT_LIST_GET_LEN(buf_pool->free) > 0) {
block = UT_LIST_GET_FIRST(buf_pool->free);
+ ut_a(block->in_free_list);
UT_LIST_REMOVE(free, buf_pool->free, block);
+ block->in_free_list = FALSE;
+ ut_a(block->state != BUF_BLOCK_FILE_PAGE);
+ ut_a(!block->in_LRU_list);
+
+ if (srv_use_awe) {
+ if (block->frame) {
+ /* Remove from the list of mapped pages */
+
+ UT_LIST_REMOVE(awe_LRU_free_mapped,
+ buf_pool->awe_LRU_free_mapped, block);
+ } else {
+ /* We map the page to a frame; second param
+ FALSE below because we do not want it to be
+ added to the awe_LRU_free_mapped list */
+
+ buf_awe_map_page_to_frame(block, FALSE);
+ }
+ }
+
block->state = BUF_BLOCK_READY_FOR_USE;
mutex_exit(&(buf_pool->mutex));
@@ -302,7 +405,7 @@ loop:
"InnoDB: Warning: difficult to find free blocks from\n"
"InnoDB: the buffer pool (%lu search iterations)! Consider\n"
"InnoDB: increasing the buffer pool size.\n",
- n_iterations);
+ (ulong) n_iterations);
fprintf(stderr,
"InnoDB: It is also possible that in your Unix version\n"
"InnoDB: fsync is very slow, or completely frozen inside\n"
@@ -312,11 +415,13 @@ loop:
fprintf(stderr,
"InnoDB: Pending flushes (fsync) log: %lu; buffer pool: %lu\n",
- fil_n_pending_log_flushes,
- fil_n_pending_tablespace_flushes);
+ (ulong) fil_n_pending_log_flushes,
+ (ulong) fil_n_pending_tablespace_flushes);
fprintf(stderr,
"InnoDB: %lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
- os_n_file_reads, os_n_file_writes, os_n_fsyncs);
+ (ulong) os_n_file_reads,
+ (ulong) os_n_file_writes,
+ (ulong) os_n_fsyncs);
fprintf(stderr,
"InnoDB: Starting InnoDB Monitor to print further\n"
@@ -369,7 +474,7 @@ buf_LRU_old_adjust_len(void)
ulint old_len;
ulint new_len;
- ut_ad(buf_pool->LRU_old);
+ ut_a(buf_pool->LRU_old);
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
@@ -379,6 +484,8 @@ buf_LRU_old_adjust_len(void)
old_len = buf_pool->LRU_old_len;
new_len = 3 * (UT_LIST_GET_LEN(buf_pool->LRU) / 8);
+ ut_a(buf_pool->LRU_old->in_LRU_list);
+
/* Update the LRU_old pointer if necessary */
if (old_len < new_len - BUF_LRU_OLD_TOLERANCE) {
@@ -395,7 +502,7 @@ buf_LRU_old_adjust_len(void)
buf_pool->LRU_old);
buf_pool->LRU_old_len--;
} else {
- ut_ad(buf_pool->LRU_old); /* Check that we did not
+ ut_a(buf_pool->LRU_old); /* Check that we did not
fall out of the LRU list */
return;
}
@@ -403,9 +510,8 @@ buf_LRU_old_adjust_len(void)
}
/***********************************************************************
-Initializes the old blocks pointer in the LRU list.
-This function should be called when the LRU list grows to
-BUF_LRU_OLD_MIN_LEN length. */
+Initializes the old blocks pointer in the LRU list. This function should be
+called when the LRU list grows to BUF_LRU_OLD_MIN_LEN length. */
static
void
buf_LRU_old_init(void)
@@ -413,7 +519,7 @@ buf_LRU_old_init(void)
{
buf_block_t* block;
- ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN);
+ ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN);
/* We first initialize all blocks in the LRU list as old and then use
the adjust function to move the LRU_old pointer to the right
@@ -422,6 +528,8 @@ buf_LRU_old_init(void)
block = UT_LIST_GET_FIRST(buf_pool->LRU);
while (block != NULL) {
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_a(block->in_LRU_list);
block->old = TRUE;
block = UT_LIST_GET_NEXT(LRU, block);
}
@@ -446,6 +554,9 @@ buf_LRU_remove_block(
ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_a(block->in_LRU_list);
+
/* If the LRU_old pointer is defined and points to just this block,
move it backward one step */
@@ -459,11 +570,19 @@ buf_LRU_remove_block(
(buf_pool->LRU_old)->old = TRUE;
buf_pool->LRU_old_len++;
- ut_ad(buf_pool->LRU_old);
+ ut_a(buf_pool->LRU_old);
}
/* Remove the block from the LRU list */
UT_LIST_REMOVE(LRU, buf_pool->LRU, block);
+ block->in_LRU_list = FALSE;
+
+ if (srv_use_awe && block->frame) {
+ /* Remove from the list of mapped pages */
+
+ UT_LIST_REMOVE(awe_LRU_free_mapped,
+ buf_pool->awe_LRU_free_mapped, block);
+ }
/* If the LRU list is so short that LRU_old not defined, return */
if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
@@ -501,6 +620,8 @@ buf_LRU_add_block_to_end_low(
ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
block->old = TRUE;
last_block = UT_LIST_GET_LAST(buf_pool->LRU);
@@ -511,8 +632,17 @@ buf_LRU_add_block_to_end_low(
block->LRU_position = buf_pool_clock_tic();
}
+ ut_a(!block->in_LRU_list);
UT_LIST_ADD_LAST(LRU, buf_pool->LRU, block);
+ block->in_LRU_list = TRUE;
+ if (srv_use_awe && block->frame) {
+ /* Add to the list of mapped pages */
+
+ UT_LIST_ADD_LAST(awe_LRU_free_mapped,
+ buf_pool->awe_LRU_free_mapped, block);
+ }
+
if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) {
buf_pool->LRU_old_len++;
@@ -555,9 +685,21 @@ buf_LRU_add_block_low(
ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_a(!block->in_LRU_list);
+
block->old = old;
cl = buf_pool_clock_tic();
+ if (srv_use_awe && block->frame) {
+ /* Add to the list of mapped pages; for simplicity we always
+ add to the start, even if the user would have set 'old'
+ TRUE */
+
+ UT_LIST_ADD_FIRST(awe_LRU_free_mapped,
+ buf_pool->awe_LRU_free_mapped, block);
+ }
+
if (!old || (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN)) {
UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, block);
@@ -575,6 +717,8 @@ buf_LRU_add_block_low(
block->LRU_position = (buf_pool->LRU_old)->LRU_position;
}
+ block->in_LRU_list = TRUE;
+
if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) {
ut_ad(buf_pool->LRU_old);
@@ -645,9 +789,12 @@ buf_LRU_block_free_non_file_page(
#endif /* UNIV_SYNC_DEBUG */
ut_ad(block);
- ut_ad((block->state == BUF_BLOCK_MEMORY)
+ ut_a((block->state == BUF_BLOCK_MEMORY)
|| (block->state == BUF_BLOCK_READY_FOR_USE));
+ ut_a(block->n_pointers == 0);
+ ut_a(!block->in_free_list);
+
block->state = BUF_BLOCK_NOT_USED;
#ifdef UNIV_DEBUG
@@ -655,6 +802,14 @@ buf_LRU_block_free_non_file_page(
memset(block->frame, '\0', UNIV_PAGE_SIZE);
#endif
UT_LIST_ADD_FIRST(free, buf_pool->free, block);
+ block->in_free_list = TRUE;
+
+ if (srv_use_awe && block->frame) {
+ /* Add to the list of mapped pages */
+
+ UT_LIST_ADD_FIRST(awe_LRU_free_mapped,
+ buf_pool->awe_LRU_free_mapped, block);
+ }
}
/**********************************************************************
@@ -673,8 +828,7 @@ buf_LRU_block_remove_hashed_page(
#endif /* UNIV_SYNC_DEBUG */
ut_ad(block);
- ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
-
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
ut_a(block->io_fix == 0);
ut_a(block->buf_fix_count == 0);
ut_a(ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) == 0);
@@ -683,7 +837,31 @@ buf_LRU_block_remove_hashed_page(
buf_pool->freed_page_clock += 1;
- buf_frame_modify_clock_inc(block->frame);
+ /* Note that if AWE is enabled the block may not have a frame at all */
+
+ buf_block_modify_clock_inc(block);
+
+ if (block != buf_page_hash_get(block->space, block->offset)) {
+ fprintf(stderr,
+"InnoDB: Error: page %lu %lu not found from the hash table\n",
+ (ulong) block->space,
+ (ulong) block->offset);
+ if (buf_page_hash_get(block->space, block->offset)) {
+ fprintf(stderr,
+"InnoDB: From hash table we find block %lx of %lu %lu which is not %lx\n",
+ (ulong) buf_page_hash_get(block->space, block->offset),
+ (ulong) buf_page_hash_get(block->space, block->offset)->space,
+ (ulong) buf_page_hash_get(block->space, block->offset)->offset,
+ (ulong) block);
+ }
+
+ buf_print();
+ buf_LRU_print();
+ buf_validate();
+ buf_LRU_validate();
+
+ ut_a(0);
+ }
HASH_DELETE(buf_block_t, hash, buf_pool->page_hash,
buf_page_address_fold(block->space, block->offset),
@@ -704,7 +882,7 @@ buf_LRU_block_free_hashed_page(
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
- ut_ad(block->state == BUF_BLOCK_REMOVE_HASH);
+ ut_a(block->state == BUF_BLOCK_REMOVE_HASH);
block->state = BUF_BLOCK_MEMORY;
@@ -797,7 +975,7 @@ buf_LRU_print(void)
ut_ad(buf_pool);
mutex_enter(&(buf_pool->mutex));
- printf("Pool ulint clock %lu\n", buf_pool->ulint_clock);
+ printf("Pool ulint clock %lu\n", (ulong) buf_pool->ulint_clock);
block = UT_LIST_GET_FIRST(buf_pool->LRU);
@@ -805,18 +983,18 @@ buf_LRU_print(void)
while (block != NULL) {
- printf("BLOCK %lu ", block->offset);
+ printf("BLOCK %lu ", (ulong) block->offset);
if (block->old) {
printf("old ");
}
if (block->buf_fix_count) {
- printf("buffix count %lu ", block->buf_fix_count);
+ printf("buffix count %lu ", (ulong) block->buf_fix_count);
}
if (block->io_fix) {
- printf("io_fix %lu ", block->io_fix);
+ printf("io_fix %lu ", (ulong) block->io_fix);
}
if (ut_dulint_cmp(block->oldest_modification,
@@ -824,12 +1002,12 @@ buf_LRU_print(void)
printf("modif. ");
}
- printf("LRU pos %lu ", block->LRU_position);
+ printf("LRU pos %lu ", (ulong) block->LRU_position);
frame = buf_block_get_frame(block);
- printf("type %lu ", fil_page_get_type(frame));
- printf("index id %lu ", ut_dulint_get_low(
+ printf("type %lu ", (ulong) fil_page_get_type(frame));
+ printf("index id %lu ", (ulong) ut_dulint_get_low(
btr_page_get_index_id(frame)));
block = UT_LIST_GET_NEXT(LRU, block);
diff --git a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c
index 475a5bd9cbd..5ba27b8fee8 100644
--- a/innobase/buf/buf0rea.c
+++ b/innobase/buf/buf0rea.c
@@ -49,19 +49,30 @@ ulint
buf_read_page_low(
/*==============*/
/* out: 1 if a read request was queued, 0 if the page
- already resided in buf_pool or if the page is in
+ already resided in buf_pool, or if the page is in
the doublewrite buffer blocks in which case it is never
- read into the pool */
+ read into the pool, or if the tablespace does not
+ exist or is being dropped */
+ ulint* err, /* out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
+ trying to read from a non-existent tablespace, or a
+ tablespace which is just now being dropped */
ibool sync, /* in: TRUE if synchronous aio is desired */
ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ...,
ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
at read-ahead functions) */
ulint space, /* in: space id */
+ ib_longlong tablespace_version, /* in: if the space memory object has
+ this timestamp different from what we are giving here,
+ treat the tablespace as dropped; this is a timestamp we
+ use to stop dangling page reads from a tablespace
+ which we have DISCARDed + IMPORTed back */
ulint offset) /* in: page number */
{
buf_block_t* block;
ulint wake_later;
+ *err = DB_SUCCESS;
+
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
@@ -72,6 +83,11 @@ buf_read_page_low(
|| (offset >= trx_doublewrite->block2
&& offset < trx_doublewrite->block2
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Warning: trying to read doublewrite buffer page %lu\n",
+ (ulong) offset);
+
return(0);
}
@@ -97,27 +113,39 @@ buf_read_page_low(
sync = TRUE;
}
- block = buf_page_init_for_read(mode, space, offset);
-
- if (block != NULL) {
- if (buf_debug_prints) {
- printf("Posting read request for page %lu, sync %lu\n",
- offset, sync);
- }
+ /* The following call will also check if the tablespace does not exist
+ or is being dropped; if we succeed in initing the page in the buffer
+ pool for read, then DISCARD cannot proceed until the read has
+ completed */
- fil_io(OS_FILE_READ | wake_later,
- sync, space, offset, 0, UNIV_PAGE_SIZE,
- (void*)block->frame, (void*)block);
- if (sync) {
- /* The i/o is already completed when we arrive from
- fil_read */
- buf_page_io_complete(block);
- }
+ block = buf_page_init_for_read(err, mode, space, tablespace_version,
+ offset);
+ if (block == NULL) {
- return(1);
+ return(0);
}
- return(0);
+ if (buf_debug_prints) {
+ printf("Posting read request for page %lu, sync %lu\n",
+ (ulong) offset,
+ (ulong) sync);
+ }
+
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
+ *err = fil_io(OS_FILE_READ | wake_later,
+ sync, space,
+ offset, 0, UNIV_PAGE_SIZE,
+ (void*)block->frame, (void*)block);
+ ut_a(*err == DB_SUCCESS);
+
+ if (sync) {
+ /* The i/o is already completed when we arrive from
+ fil_read */
+ buf_page_io_complete(block);
+ }
+
+ return(1);
}
/************************************************************************
@@ -142,12 +170,14 @@ buf_read_ahead_random(
ulint offset) /* in: page number of a page which the current thread
wants to access */
{
+ ib_longlong tablespace_version;
buf_block_t* block;
ulint recent_blocks = 0;
ulint count;
ulint LRU_recent_limit;
ulint ibuf_mode;
ulint low, high;
+ ulint err;
ulint i;
if (srv_startup_is_before_trx_rollback_phase) {
@@ -164,11 +194,16 @@ buf_read_ahead_random(
return(0);
}
+ /* Remember the tablespace version before we ask the tablespace size
+ below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
+ do not try to read outside the bounds of the tablespace! */
+
+ tablespace_version = fil_space_get_version(space);
+
low = (offset / BUF_READ_AHEAD_RANDOM_AREA)
* BUF_READ_AHEAD_RANDOM_AREA;
high = (offset / BUF_READ_AHEAD_RANDOM_AREA + 1)
* BUF_READ_AHEAD_RANDOM_AREA;
-
if (high > fil_space_get_size(space)) {
high = fil_space_get_size(space);
@@ -193,7 +228,6 @@ buf_read_ahead_random(
that is, reside near the start of the LRU list. */
for (i = low; i < high; i++) {
-
block = buf_page_hash_get(space, i);
if ((block)
@@ -227,10 +261,17 @@ buf_read_ahead_random(
mode: hence FALSE as the first parameter */
if (!ibuf_bitmap_page(i)) {
-
- count += buf_read_page_low(FALSE, ibuf_mode
+ count += buf_read_page_low(&err, FALSE, ibuf_mode
| OS_AIO_SIMULATED_WAKE_LATER,
- space, i);
+ space, tablespace_version, i);
+ if (err == DB_TABLESPACE_DELETED) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Warning: in random readahead trying to access tablespace\n"
+"InnoDB: %lu page no. %lu,\n"
+"InnoDB: but the tablespace does not exist or is just being dropped.\n",
+ (ulong) space, (ulong) i);
+ }
}
}
@@ -243,7 +284,8 @@ buf_read_ahead_random(
if (buf_debug_prints && (count > 0)) {
printf("Random read-ahead space %lu offset %lu pages %lu\n",
- space, offset, count);
+ (ulong) space, (ulong) offset,
+ (ulong) count);
}
return(count);
@@ -264,15 +306,27 @@ buf_read_page(
ulint space, /* in: space id */
ulint offset) /* in: page number */
{
- ulint count;
- ulint count2;
+ ib_longlong tablespace_version;
+ ulint count;
+ ulint count2;
+ ulint err;
+
+ tablespace_version = fil_space_get_version(space);
count = buf_read_ahead_random(space, offset);
/* We do the i/o in the synchronous aio mode to save thread
switches: hence TRUE */
- count2 = buf_read_page_low(TRUE, BUF_READ_ANY_PAGE, space, offset);
+ count2 = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
+ tablespace_version, offset);
+ if (err == DB_TABLESPACE_DELETED) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: error: trying to access tablespace %lu page no. %lu,\n"
+"InnoDB: but the tablespace does not exist or is just being dropped.\n",
+ (ulong) space, (ulong) offset);
+ }
/* Flush pages from the end of the LRU list if necessary */
buf_flush_free_margin();
@@ -312,6 +366,7 @@ buf_read_ahead_linear(
ulint offset) /* in: page number of a page; NOTE: the current thread
must want access to this page (see NOTE 3 above) */
{
+ ib_longlong tablespace_version;
buf_block_t* block;
buf_frame_t* frame;
buf_block_t* pred_block = NULL;
@@ -323,6 +378,7 @@ buf_read_ahead_linear(
ulint fail_count;
ulint ibuf_mode;
ulint low, high;
+ ulint err;
ulint i;
if (srv_startup_is_before_trx_rollback_phase) {
@@ -350,14 +406,21 @@ buf_read_ahead_linear(
return(0);
}
+ /* Remember the tablespace version before we ask the tablespace size
+ below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
+ do not try to read outside the bounds of the tablespace! */
+
+ tablespace_version = fil_space_get_version(space);
+
+ mutex_enter(&(buf_pool->mutex));
+
if (high > fil_space_get_size(space)) {
+ mutex_exit(&(buf_pool->mutex));
/* The area is not whole, return */
return(0);
}
- mutex_enter(&(buf_pool->mutex));
-
if (buf_pool->n_pend_reads >
buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
mutex_exit(&(buf_pool->mutex));
@@ -378,18 +441,15 @@ buf_read_ahead_linear(
fail_count = 0;
for (i = low; i < high; i++) {
-
block = buf_page_hash_get(space, i);
if ((block == NULL) || !block->accessed) {
-
/* Not accessed */
fail_count++;
} else if (pred_block && (ut_ulint_cmp(block->LRU_position,
pred_block->LRU_position)
!= asc_or_desc)) {
-
/* Accesses not in the right order */
fail_count++;
@@ -462,7 +522,7 @@ buf_read_ahead_linear(
return(0);
}
- /* If we got this far, read-ahead can be sensible: do it */
+ /* If we got this far, read-ahead can be sensible: do it */
if (ibuf_inside()) {
ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
@@ -483,9 +543,17 @@ buf_read_ahead_linear(
aio mode: hence FALSE as the first parameter */
if (!ibuf_bitmap_page(i)) {
- count += buf_read_page_low(FALSE, ibuf_mode
+ count += buf_read_page_low(&err, FALSE, ibuf_mode
| OS_AIO_SIMULATED_WAKE_LATER,
- space, i);
+ space, tablespace_version, i);
+ if (err == DB_TABLESPACE_DELETED) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Warning: in linear readahead trying to access tablespace\n"
+"InnoDB: %lu page no. %lu,\n"
+"InnoDB: but the tablespace does not exist or is just being dropped.\n",
+ (ulong) space, (ulong) i);
+ }
}
}
@@ -501,7 +569,7 @@ buf_read_ahead_linear(
if (buf_debug_prints && (count > 0)) {
printf(
"LINEAR read-ahead space %lu offset %lu pages %lu\n",
- space, offset, count);
+ (ulong) space, (ulong) offset, (ulong) count);
}
return(count);
@@ -509,7 +577,7 @@ buf_read_ahead_linear(
/************************************************************************
Issues read requests for pages which the ibuf module wants to read in, in
-order to contract insert buffer trees. Technically, this function is like
+order to contract the insert buffer tree. Technically, this function is like
a read-ahead function. */
void
@@ -518,11 +586,17 @@ buf_read_ibuf_merge_pages(
ibool sync, /* in: TRUE if the caller wants this function
to wait for the highest address page to get
read in, before this function returns */
- ulint space, /* in: space id */
+ ulint* space_ids, /* in: array of space ids */
+ ib_longlong* space_versions,/* in: the spaces must have this version
+ number (timestamp), otherwise we discard the
+ read; we use this to cancel reads if
+ DISCARD + IMPORT may have changed the
+ tablespace size */
ulint* page_nos, /* in: array of page numbers to read, with the
highest page number the last in the array */
ulint n_stored) /* in: number of page numbers in the array */
{
+ ulint err;
ulint i;
ut_ad(!ibuf_inside());
@@ -536,11 +610,19 @@ buf_read_ibuf_merge_pages(
for (i = 0; i < n_stored; i++) {
if ((i + 1 == n_stored) && sync) {
- buf_read_page_low(TRUE, BUF_READ_ANY_PAGE, space,
- page_nos[i]);
+ buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE,
+ space_ids[i], space_versions[i], page_nos[i]);
} else {
- buf_read_page_low(FALSE, BUF_READ_ANY_PAGE, space,
- page_nos[i]);
+ buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE,
+ space_ids[i], space_versions[i], page_nos[i]);
+ }
+
+ if (err == DB_TABLESPACE_DELETED) {
+ /* We have deleted or are deleting the single-table
+ tablespace: remove the entries for that page */
+
+ ibuf_merge_or_delete_for_page(NULL, space_ids[i],
+ page_nos[i], FALSE);
}
}
@@ -548,8 +630,7 @@ buf_read_ibuf_merge_pages(
buf_flush_free_margin();
if (buf_debug_prints) {
- printf("Ibuf merge read-ahead space %lu pages %lu\n",
- space, n_stored);
+ printf("Ibuf merge read-ahead pages %lu\n", (ulong) n_stored);
}
}
@@ -567,8 +648,12 @@ buf_read_recv_pages(
highest page number the last in the array */
ulint n_stored) /* in: number of page numbers in the array */
{
- ulint count;
- ulint i;
+ ib_longlong tablespace_version;
+ ulint count;
+ ulint err;
+ ulint i;
+
+ tablespace_version = fil_space_get_version(space);
for (i = 0; i < n_stored; i++) {
@@ -576,7 +661,7 @@ buf_read_recv_pages(
os_aio_print_debug = FALSE;
- while (buf_pool->n_pend_reads >= RECV_POOL_N_FREE_BLOCKS / 2) {
+ while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
os_aio_simulated_wake_handler_threads();
os_thread_sleep(500000);
@@ -587,7 +672,7 @@ buf_read_recv_pages(
fprintf(stderr,
"InnoDB: Error: InnoDB has waited for 50 seconds for pending\n"
"InnoDB: reads to the buffer pool to be finished.\n"
-"InnoDB: Number of pending reads %lu\n", buf_pool->n_pend_reads);
+"InnoDB: Number of pending reads %lu\n", (ulong) buf_pool->n_pend_reads);
os_aio_print_debug = TRUE;
}
@@ -596,12 +681,12 @@ buf_read_recv_pages(
os_aio_print_debug = FALSE;
if ((i + 1 == n_stored) && sync) {
- buf_read_page_low(TRUE, BUF_READ_ANY_PAGE, space,
- page_nos[i]);
+ buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
+ tablespace_version, page_nos[i]);
} else {
- buf_read_page_low(FALSE, BUF_READ_ANY_PAGE
+ buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
| OS_AIO_SIMULATED_WAKE_LATER,
- space, page_nos[i]);
+ space, tablespace_version, page_nos[i]);
}
}
@@ -611,6 +696,7 @@ buf_read_recv_pages(
buf_flush_free_margin();
if (buf_debug_prints) {
- printf("Recovery applies read-ahead pages %lu\n", n_stored);
+ printf("Recovery applies read-ahead pages %lu\n",
+ (ulong) n_stored);
}
}
diff --git a/innobase/configure.in b/innobase/configure.in
index a94ade6dc8e..652291f1f38 100644
--- a/innobase/configure.in
+++ b/innobase/configure.in
@@ -34,9 +34,11 @@ CXXFLAGS="$CXXFLAGS "
AC_PROG_CC
AC_PROG_RANLIB
AC_PROG_INSTALL
+AC_PROG_LIBTOOL
AC_CHECK_HEADERS(aio.h sched.h)
AC_CHECK_SIZEOF(int, 4)
AC_CHECK_SIZEOF(long, 4)
+AC_CHECK_SIZEOF(void*, 4)
AC_CHECK_FUNCS(sched_yield)
AC_CHECK_FUNCS(fdatasync)
#AC_CHECK_FUNCS(localtime_r) # Already checked by MySQL
diff --git a/innobase/data/data0data.c b/innobase/data/data0data.c
index c3c2b135717..0ed0efeb160 100644
--- a/innobase/data/data0data.c
+++ b/innobase/data/data0data.c
@@ -196,7 +196,8 @@ dfield_check_typed_no_assert(
fprintf(stderr,
"InnoDB: Error: data field type %lu, len %lu\n",
- dfield_get_type(field)->mtype, dfield_get_len(field));
+ (ulong) dfield_get_type(field)->mtype,
+ (ulong) dfield_get_len(field));
return(FALSE);
}
@@ -219,7 +220,7 @@ dtuple_check_typed_no_assert(
if (dtuple_get_n_fields(tuple) > REC_MAX_N_FIELDS) {
fprintf(stderr,
"InnoDB: Error: index entry has %lu fields\n",
- dtuple_get_n_fields(tuple));
+ (ulong) dtuple_get_n_fields(tuple));
dtuple_sprintf(err_buf, 900, tuple);
fprintf(stderr,
@@ -259,7 +260,8 @@ dfield_check_typed(
fprintf(stderr,
"InnoDB: Error: data field type %lu, len %lu\n",
- dfield_get_type(field)->mtype, dfield_get_len(field));
+ (ulong) dfield_get_type(field)->mtype,
+ (ulong) dfield_get_len(field));
ut_error;
}
@@ -433,7 +435,7 @@ dfield_print_also_hex(
data = dfield_get_data(dfield);
for (i = 0; i < len; i++) {
- printf("%02lx", (ulint)*data);
+ printf("%02lx", (ulong)*data);
data++;
}
@@ -459,10 +461,10 @@ dtuple_print(
n_fields = dtuple_get_n_fields(tuple);
- printf("DATA TUPLE: %lu fields;\n", n_fields);
+ printf("DATA TUPLE: %lu fields;\n", (ulong) n_fields);
for (i = 0; i < n_fields; i++) {
- printf(" %lu:", i);
+ printf(" %lu:", (ulong) i);
field = dtuple_get_nth_field(tuple, i);
@@ -506,7 +508,7 @@ dtuple_sprintf(
return(len);
}
- len += sprintf(buf + len, " %lu:", i);
+ len += sprintf(buf + len, " %lu:", (ulong) i);
field = dtuple_get_nth_field(tuple, i);
@@ -567,7 +569,7 @@ dtuple_convert_big_rec(
if (size > 1000000000) {
fprintf(stderr,
-"InnoDB: Warning: tuple size very big: %lu\n", size);
+"InnoDB: Warning: tuple size very big: %lu\n", (ulong) size);
dtuple_sprintf(err_buf, 900, entry);
fprintf(stderr,
diff --git a/innobase/data/data0type.c b/innobase/data/data0type.c
index df430f06bcb..71ce5ff3d58 100644
--- a/innobase/data/data0type.c
+++ b/innobase/data/data0type.c
@@ -12,10 +12,99 @@ Created 1/16/1996 Heikki Tuuri
#include "data0type.ic"
#endif
+/* At the database startup we store the default-charset collation number of
+this MySQL installation to this global variable. If we have < 4.1.2 format
+column definitions, or records in the insert buffer, we use this
+charset-collation code for them. */
+
+ulint data_mysql_default_charset_coll = 99999999;
+ulint data_mysql_latin1_swedish_charset_coll = 99999999;
+
dtype_t dtype_binary_val = {DATA_BINARY, 0, 0, 0};
dtype_t* dtype_binary = &dtype_binary_val;
/*************************************************************************
+Tells whether a main data type code denotes a string type. A BLOB counts as
+a string type here as well. */
+
+ibool
+dtype_is_string_type(
+/*=================*/
+			/* out: TRUE if mtype is a string type */
+	ulint	mtype)	/* in: InnoDB main data type code: DATA_CHAR, ... */
+{
+	ibool	is_string = FALSE;
+
+	if (mtype == DATA_VARMYSQL || mtype == DATA_MYSQL
+	    || mtype <= DATA_BLOB) {
+		is_string = TRUE;
+	}
+
+	return(is_string);
+}
+
+/*************************************************************************
+Tells whether a type is a binary string type. For tables created with
+< 4.0.14 it is not known whether a DATA_BLOB column is a BLOB or a TEXT
+column; for such DATA_BLOB columns this function currently returns FALSE. */
+
+ibool
+dtype_is_binary_string_type(
+/*========================*/
+			/* out: TRUE if the type is a binary string type */
+	ulint	mtype,	/* in: main data type */
+	ulint	prtype)	/* in: precise type */
+{
+	if (mtype == DATA_BINARY || mtype == DATA_FIXBINARY) {
+
+		return(TRUE);
+	}
+
+	/* A BLOB is binary only when the precise type is flagged so */
+	return((mtype == DATA_BLOB && (prtype & DATA_BINARY_TYPE))
+						? TRUE : FALSE);
+}
+
+/*************************************************************************
+Tells whether a type is a non-binary string type, i.e. dtype_is_string_type
+gives TRUE and dtype_is_binary_string_type gives FALSE. For tables created
+with < 4.0.14 it is not known whether a DATA_BLOB column is a BLOB or a TEXT
+column; for such DATA_BLOB columns this function currently returns TRUE. */
+
+ibool
+dtype_is_non_binary_string_type(
+/*============================*/
+			/* out: TRUE if the type is a non-binary string type */
+	ulint	mtype,	/* in: main data type */
+	ulint	prtype)	/* in: precise type */
+{
+	if (dtype_is_string_type(mtype) != TRUE) {
+
+		return(FALSE);
+	}
+
+	return((dtype_is_binary_string_type(mtype, prtype) == FALSE)
+						? TRUE : FALSE);
+}
+
+/*************************************************************************
+Builds a precise type out of the < 4.1.2 format precise type and the
+charset-collation code, which is placed above the low 16 bits. */
+
+ulint
+dtype_form_prtype(
+/*==============*/
+	ulint	old_prtype,	/* in: the MySQL type code and the flags
+				DATA_BINARY_TYPE etc.; asserted < 256 * 256 */
+	ulint	charset_coll)	/* in: MySQL charset-collation code; < 256 */
+{
+	ut_a(old_prtype < 256 * 256);
+	ut_a(charset_coll < 256);
+
+	return((charset_coll << 16) + old_prtype);
+}
+
+/*************************************************************************
Validates a data type structure. */
ibool
@@ -63,7 +152,7 @@ dtype_print(
} else if (mtype == DATA_SYS) {
printf("DATA_SYS");
} else {
- printf("type %lu", mtype);
+ printf("type %lu", (ulong) mtype);
}
len = type->len;
@@ -86,9 +175,9 @@ dtype_print(
} else if (prtype == DATA_ENGLISH) {
printf("DATA_ENGLISH");
} else {
- printf("prtype %lu", mtype);
+ printf("prtype %lu", (ulong) mtype);
}
}
- printf(" len %lu prec %lu", len, type->prec);
+ printf(" len %lu prec %lu", (ulong) len, (ulong) type->prec);
}
diff --git a/innobase/dict/dict0boot.c b/innobase/dict/dict0boot.c
index 3abb71a842d..46cf6c7788d 100644
--- a/innobase/dict/dict0boot.c
+++ b/innobase/dict/dict0boot.c
@@ -331,8 +331,12 @@ dict_boot(void)
dict_mem_table_add_col(table, (char *) "PAGE_NO", DATA_INT, 0, 4, 0);
/* The '+ 2' below comes from the 2 system fields */
- ut_ad(DICT_SYS_INDEXES_PAGE_NO_FIELD == 6 + 2);
- ut_ad(DICT_SYS_INDEXES_SPACE_NO_FIELD == 5 + 2);
+#if DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2
+#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2"
+#endif
+#if DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2
+#error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2"
+#endif
table->id = DICT_INDEXES_ID;
dict_table_add_to_cache(table);
@@ -418,6 +422,4 @@ dict_create(void)
dict_boot();
dict_insert_initial_data();
-
- sync_order_checks_on = TRUE;
}
diff --git a/innobase/dict/dict0crea.c b/innobase/dict/dict0crea.c
index 48fcb9c1e79..6a951317d47 100644
--- a/innobase/dict/dict0crea.c
+++ b/innobase/dict/dict0crea.c
@@ -269,6 +269,8 @@ dict_build_table_def_step(
dict_table_t* table;
dict_table_t* cluster_table;
dtuple_t* row;
+ ulint error;
+ mtr_t mtr;
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&(dict_sys->mutex)));
@@ -297,6 +299,32 @@ dict_build_table_def_step(
table->mix_id = dict_hdr_get_new_id(DICT_HDR_MIX_ID);
}
+ if (srv_file_per_table) {
+ /* We create a new single-table tablespace for the table.
+ We initially let it be 4 pages:
+ - page 0 is the fsp header and an extent descriptor page,
+ - page 1 is an ibuf bitmap page,
+ - page 2 is the first inode page,
+ - page 3 will contain the root of the clustered index of the
+ table we create here. */
+
+ table->space = 0; /* reset to zero for the call below */
+
+ error = fil_create_new_single_table_tablespace(
+ &(table->space), table->name,
+ FIL_IBD_FILE_INITIAL_SIZE);
+ if (error != DB_SUCCESS) {
+
+ return(error);
+ }
+
+ mtr_start(&mtr);
+
+ fsp_header_init(table->space, FIL_IBD_FILE_INITIAL_SIZE, &mtr);
+
+ mtr_commit(&mtr);
+ }
+
row = dict_create_sys_tables_tuple(table, node->heap);
ins_node_set_new_row(node->tab_def, row);
@@ -488,8 +516,8 @@ dict_create_sys_fields_tuple(
}
/*********************************************************************
-Creates the tuple with which the index entry is searched for
-writing the index tree root page number, if such a tree is created. */
+Creates the tuple with which the index entry is searched for writing the index
+tree root page number, if such a tree is created. */
static
dtuple_t*
dict_create_search_tuple(
@@ -558,10 +586,10 @@ dict_build_index_def_step(
index->id = dict_hdr_get_new_id(DICT_HDR_INDEX_ID);
- if (index->type & DICT_CLUSTERED) {
- /* Inherit the space from the table */
- index->space = table->space;
- }
+ /* Inherit the space id from the table; we store all indexes of a
+ table in the same tablespace */
+
+ index->space = table->space;
index->page_no = FIL_NULL;
@@ -647,6 +675,9 @@ dict_create_index_tree_step(
index->page_no = btr_create(index->type, index->space, index->id,
&mtr);
+ /* printf("Created a new index tree in space %lu root page %lu\n",
+ index->space, index->page_no); */
+
page_rec_write_index_page_no(btr_pcur_get_rec(&pcur),
DICT_SYS_INDEXES_PAGE_NO_FIELD,
index->page_no, &mtr);
@@ -697,7 +728,14 @@ dict_drop_index_tree(
ut_ad(len == 4);
space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);
-
+
+ if (!fil_tablespace_exists_in_mem(space)) {
+ /* It is a single table tablespace and the .ibd file is
+ missing: do nothing */
+
+ return;
+ }
+
/* We free all the pages but the root page first; this operation
may span several mini-transactions */
@@ -707,6 +745,8 @@ dict_drop_index_tree(
we write FIL_NULL to the appropriate field in the SYS_INDEXES
record: this mini-transaction marks the B-tree totally freed */
+ /* printf("Dropping index tree in space %lu root page %lu\n", space,
+ root_page_no); */
btr_free_root(space, root_page_no, mtr);
page_rec_write_index_page_no(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
@@ -1107,7 +1147,8 @@ dict_create_or_check_foreign_constraint_tables(void)
error = trx->error_state;
if (error != DB_SUCCESS) {
- fprintf(stderr, "InnoDB: error %lu in creation\n", error);
+ fprintf(stderr, "InnoDB: error %lu in creation\n",
+ (ulong) error);
ut_a(error == DB_OUT_OF_FILE_SPACE);
@@ -1206,7 +1247,7 @@ loop:
/* Generate a new constraint id */
foreign->id = mem_heap_alloc(foreign->heap, namelen + 20);
/* no overflow if number < 1e13 */
- sprintf(foreign->id, "%s_ibfk_%lu", table->name, number);
+ sprintf(foreign->id, "%s_ibfk_%lu", table->name, (ulong) number);
number++;
}
@@ -1219,8 +1260,8 @@ loop:
foreign->id,
table->name,
foreign->referenced_table_name,
- foreign->n_fields
- + (foreign->type << 24));
+ (ulong) (foreign->n_fields
+ + (foreign->type << 24)));
for (i = 0; i < foreign->n_fields; i++) {
ut_a(len < (sizeof buf)
@@ -1230,7 +1271,7 @@ loop:
len += sprintf(buf + len,
"INSERT INTO SYS_FOREIGN_COLS VALUES('%s', %lu, '%s', '%s');\n",
foreign->id,
- i,
+ (ulong) i,
foreign->foreign_col_names[i],
foreign->referenced_col_names[i]);
}
@@ -1274,7 +1315,7 @@ loop:
if (error != DB_SUCCESS) {
fprintf(stderr,
"InnoDB: Foreign key constraint creation failed:\n"
- "InnoDB: internal error number %lu\n", error);
+ "InnoDB: internal error number %lu\n", (ulong) error);
mutex_enter(&dict_foreign_err_mutex);
ut_sprintf_timestamp(ebuf);
diff --git a/innobase/dict/dict0dict.c b/innobase/dict/dict0dict.c
index dc19997de72..bf60567ceaa 100644
--- a/innobase/dict/dict0dict.c
+++ b/innobase/dict/dict0dict.c
@@ -139,7 +139,8 @@ dict_tree_find_index_low(
/*=====================*/
/* out: index */
dict_tree_t* tree, /* in: index tree */
- rec_t* rec); /* in: record for which to find correct index */
+ rec_t* rec); /* in: record for which to find correct
+ index */
/**************************************************************************
Removes a foreign constraint struct from the dictionet cache. */
static
@@ -742,7 +743,7 @@ dict_table_get_and_increment_handle_count(
mutex_exit(&(dict_sys->mutex));
if (table != NULL) {
- if (!table->stat_initialized) {
+ if (!table->stat_initialized && !table->ibd_file_missing) {
dict_update_statistics(table);
}
}
@@ -897,6 +898,7 @@ dict_table_rename_in_cache(
ulint old_size;
char* name_buf;
char* old_name;
+ ibool success;
ulint i;
ut_ad(table);
@@ -914,6 +916,21 @@ dict_table_rename_in_cache(
HASH_SEARCH(name_hash, dict_sys->table_hash, fold, table2,
(ut_strcmp(table2->name, new_name) == 0));
if (table2) {
+ fprintf(stderr,
+"InnoDB: Error: dictionary cache already contains a table of name %s\n",
+ new_name);
+ return(FALSE);
+ }
+ }
+
+ /* If the table is stored in a single-table tablespace, rename the
+ .ibd file */
+
+ if (table->space != 0) {
+ success = fil_rename_tablespace(table->name, table->space,
+ new_name);
+ if (!success) {
+
return(FALSE);
}
}
@@ -942,7 +959,6 @@ dict_table_rename_in_cache(
/* Add table to hash table of tables */
HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold,
table);
-
dict_sys->size += (mem_heap_get_size(table->heap) - old_size);
/* Update the table_name field in indexes */
@@ -1088,6 +1104,31 @@ dict_table_rename_in_cache(
}
/**************************************************************************
+Change the id of a table object in the dictionary cache. This is used in
+DISCARD TABLESPACE. */
+
+void
+dict_table_change_id_in_cache(
+/*==========================*/
+ dict_table_t* table, /* in: table object already in cache */
+ dulint new_id) /* in: new id to set */
+{
+ ut_ad(table);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ /* Remove the table from the hash table of id's */
+
+ HASH_DELETE(dict_table_t, id_hash, dict_sys->table_id_hash,
+ ut_fold_dulint(table->id), table);
+ table->id = new_id;
+
+ /* Add the table back to the hash table */
+ HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash,
+ ut_fold_dulint(table->id), table);
+}
+
+/**************************************************************************
Removes a table object from the dictionary cache. */
void
@@ -2735,7 +2776,7 @@ dict_create_foreign_constraints_low(
sprintf(buf + strlen(buf),
" Error in foreign key constraint of table %.500s.\n"
"Cannot find the table from the internal data dictionary of InnoDB.\n"
-"Create table statement:\n%.2000\n", name, sql_string);
+"Create table statement:\n%.2000s\n", name, sql_string);
ut_a(strlen(buf) < DICT_FOREIGN_ERR_BUF_LEN);
mutex_exit(&dict_foreign_err_mutex);
@@ -3594,8 +3635,8 @@ dict_tree_free(
/*===========*/
dict_tree_t* tree) /* in, own: index tree */
{
- ut_ad(tree);
- ut_ad(tree->magic_n == DICT_TREE_MAGIC_N);
+ ut_a(tree);
+ ut_a(tree->magic_n == DICT_TREE_MAGIC_N);
rw_lock_free(&(tree->lock));
mem_free(tree);
@@ -3609,7 +3650,8 @@ dict_tree_find_index_low(
/*=====================*/
/* out: index */
dict_tree_t* tree, /* in: index tree */
- rec_t* rec) /* in: record for which to find correct index */
+ rec_t* rec) /* in: record for which to find correct
+ index */
{
dict_index_t* index;
dict_table_t* table;
@@ -3647,7 +3689,8 @@ dict_tree_find_index(
/*=================*/
/* out: index */
dict_tree_t* tree, /* in: index tree */
- rec_t* rec) /* in: record for which to find correct index */
+ rec_t* rec) /* in: record for which to find correct
+ index */
{
dict_index_t* index;
@@ -3737,7 +3780,8 @@ dict_tree_build_node_ptr(
/*=====================*/
/* out, own: node pointer */
dict_tree_t* tree, /* in: index tree */
- rec_t* rec, /* in: record for which to build node pointer */
+ rec_t* rec, /* in: record for which to build node
+ pointer */
ulint page_no,/* in: page number to put in node pointer */
mem_heap_t* heap, /* in: memory heap where pointer created */
ulint level) /* in: level of rec in tree: 0 means leaf
@@ -3899,6 +3943,16 @@ dict_update_statistics_low(
ulint size;
ulint sum_of_index_sizes = 0;
+ if (table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: cannot calculate statistics for table %s\n"
+"InnoDB: because the .ibd file is missing. See section 15.1 of\n"
+"InnoDB: http:/www.innodb.com/ibman.html for help\n", table->name);
+
+ return;
+ }
+
/* If we have set a high innodb_force_recovery level, do not calculate
statistics, as a badly corrupted index can cause a crash in it. */
@@ -4053,10 +4107,11 @@ dict_table_print_low(
printf(
"TABLE: name %s, id %lu %lu, columns %lu, indexes %lu, appr.rows %lu\n",
table->name,
- ut_dulint_get_high(table->id),
- ut_dulint_get_low(table->id),
- table->n_cols, UT_LIST_GET_LEN(table->indexes),
- (ulint)table->stat_n_rows);
+ (ulong) ut_dulint_get_high(table->id),
+ (ulong) ut_dulint_get_low(table->id),
+ (ulong) table->n_cols,
+ (ulong) UT_LIST_GET_LEN(table->indexes),
+ (ulong) table->stat_n_rows);
printf(" COLUMNS: ");
for (i = 0; i < table->n_cols - 1; i++) {
@@ -4136,16 +4191,16 @@ dict_index_print_low(
printf(
" INDEX: name %s, table name %s, id %lu %lu, fields %lu/%lu, type %lu\n",
index->name, index->table_name,
- ut_dulint_get_high(tree->id),
- ut_dulint_get_low(tree->id),
- index->n_user_defined_cols,
- index->n_fields, index->type);
+ (ulong) ut_dulint_get_high(tree->id),
+ (ulong) ut_dulint_get_low(tree->id),
+ (ulong) index->n_user_defined_cols,
+ (ulong) index->n_fields, (ulong) index->type);
printf(
" root page %lu, appr.key vals %lu, leaf pages %lu, size pages %lu\n",
- tree->page,
- (ulint)n_vals,
- index->stat_n_leaf_pages,
- index->stat_index_size);
+ (ulong) tree->page,
+ (ulong) n_vals,
+ (ulong) index->stat_n_leaf_pages,
+ (ulong) index->stat_index_size);
printf(" FIELDS: ");
@@ -4175,7 +4230,7 @@ dict_field_print_low(
printf(" %s", field->name);
if (field->prefix_len != 0) {
- printf("(%lu)", field->prefix_len);
+ printf("(%lu)", (ulong) field->prefix_len);
}
}
diff --git a/innobase/dict/dict0load.c b/innobase/dict/dict0load.c
index 5a5830a2517..c6a8ebc4b55 100644
--- a/innobase/dict/dict0load.c
+++ b/innobase/dict/dict0load.c
@@ -19,6 +19,7 @@ Created 4/24/1996 Heikki Tuuri
#include "mach0data.h"
#include "dict0dict.h"
#include "dict0boot.h"
+#include "srv0start.h"
/************************************************************************
Finds the first table name in the given database. */
@@ -122,8 +123,8 @@ dict_print(void)
rec_t* rec;
byte* field;
ulint len;
- char table_name[10000];
mtr_t mtr;
+ char table_name[10000];
mutex_enter(&(dict_sys->mutex));
@@ -188,6 +189,100 @@ loop:
}
/************************************************************************
+In a crash recovery we already have all the tablespace objects created.
+This function compares the space id information in the InnoDB data dictionary
+to what we already read with fil_load_single_table_tablespaces().
+In a normal startup we just scan the biggest space id, and store it to
+fil_system. */
+
+void
+dict_check_tablespaces_or_store_max_id(
+/*===================================*/
+ ibool in_crash_recovery) /* in: are we doing a crash recovery */
+{
+ dict_table_t* sys_tables;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ rec_t* rec;
+ byte* field;
+ ulint len;
+ ulint space_id;
+ ulint max_space_id = 0;
+ mtr_t mtr;
+ char name[OS_FILE_MAX_PATH];
+
+ mutex_enter(&(dict_sys->mutex));
+
+ mtr_start(&mtr);
+
+ sys_tables = dict_table_get_low((char *) "SYS_TABLES");
+ sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
+
+ btr_pcur_open_at_index_side(TRUE, sys_index, BTR_SEARCH_LEAF, &pcur,
+ TRUE, &mtr);
+loop:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur, &mtr)) {
+ /* end of index */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ /* We must make the tablespace cache aware of the biggest
+ known space id */
+
+ /* printf("Biggest space id in data dictionary %lu\n",
+ max_space_id); */
+ fil_set_max_space_id_if_bigger(max_space_id);
+
+ mutex_exit(&(dict_sys->mutex));
+
+ return;
+ }
+
+ field = rec_get_nth_field(rec, 0, &len);
+
+ if (!rec_get_deleted_flag(rec)) {
+
+ /* We found one */
+
+ ut_a(len < OS_FILE_MAX_PATH - 10);
+ ut_memcpy(name, field, len);
+ name[len] = '\0';
+
+ field = rec_get_nth_field(rec, 9, &len);
+ ut_a(len == 4);
+
+ space_id = mach_read_from_4(field);
+
+ btr_pcur_store_position(&pcur, &mtr);
+
+ mtr_commit(&mtr);
+
+ if (space_id != 0 && in_crash_recovery) {
+ /* Check that the tablespace (the .ibd file) really
+ exists; print a warning to the .err log if not */
+
+ fil_space_for_table_exists_in_mem(space_id, name,
+ TRUE, TRUE);
+ }
+
+ if (space_id > max_space_id) {
+ max_space_id = space_id;
+ }
+
+ mtr_start(&mtr);
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
+ }
+
+ goto loop;
+}
+
+/************************************************************************
Loads definitions for table columns. */
static
void
@@ -269,6 +364,15 @@ dict_load_columns(
field = rec_get_nth_field(rec, 6, &len);
prtype = mach_read_from_4(field);
+ if (dtype_is_non_binary_string_type(mtype, prtype)
+ && dtype_get_charset_coll(prtype) == 0) {
+ /* This is a non-binary string type, and the table
+ was created with < 4.1.2. Use the default charset. */
+
+ prtype = dtype_form_prtype(prtype,
+ data_mysql_default_charset_coll);
+ }
+
field = rec_get_nth_field(rec, 7, &len);
col_len = mach_read_from_4(field);
@@ -365,13 +469,13 @@ dict_load_fields(
pos_and_prefix_len = mach_read_from_4(field);
- ut_a((pos_and_prefix_len & 0xFFFF) == i
- || (pos_and_prefix_len & 0xFFFF0000) == (i << 16));
+ ut_a((pos_and_prefix_len & 0xFFFFUL) == i
+ || (pos_and_prefix_len & 0xFFFF0000UL) == (i << 16));
if ((i == 0 && pos_and_prefix_len > 0)
- || (pos_and_prefix_len & 0xFFFF0000) > 0) {
+ || (pos_and_prefix_len & 0xFFFF0000UL) > 0) {
- prefix_len = pos_and_prefix_len & 0xFFFF;
+ prefix_len = pos_and_prefix_len & 0xFFFFUL;
} else {
prefix_len = 0;
}
@@ -486,7 +590,7 @@ dict_load_indexes(
ut_ad(len == 8);
id = mach_read_from_8(field);
- ut_a(0 == ut_strcmp((char*)"NAME",
+ ut_a(0 == ut_strcmp((char*) "NAME",
dict_field_get_col(
dict_index_get_nth_field(
dict_table_get_first_index(sys_indexes), 4))->name));
@@ -545,11 +649,11 @@ dict_load_indexes(
&& ((type & DICT_CLUSTERED)
|| ((table == dict_sys->sys_tables)
&& (name_len == ut_strlen("ID_IND"))
- && (0 == ut_memcmp(name_buf, (char*)"ID_IND",
+ && (0 == ut_memcmp(name_buf, (char*) "ID_IND",
name_len))))) {
- /* The index was created in memory already in
- booting */
+ /* The index was created in memory already at booting
+ of the database server */
} else {
index = dict_mem_index_create(table->name, name_buf,
space, type, n_fields);
@@ -580,9 +684,14 @@ dictionary cache. */
dict_table_t*
dict_load_table(
/*============*/
- /* out: table, NULL if does not exist */
- char* name) /* in: table name */
+ /* out: table, NULL if does not exist; if the table is
+ stored in an .ibd file, but the file does not exist,
+ then we set the ibd_file_missing flag TRUE in the table
+ object we return */
+ char* name) /* in: table name in the databasename/tablename
+ format */
{
+ ibool ibd_file_missing = FALSE;
dict_table_t* table;
dict_table_t* sys_tables;
btr_pcur_t pcur;
@@ -651,6 +760,23 @@ dict_load_table(
field = rec_get_nth_field(rec, 9, &len);
space = mach_read_from_4(field);
+ /* Check if the tablespace exists and has the right name */
+ if (space != 0) {
+ if (fil_space_for_table_exists_in_mem(space, name, FALSE,
+ FALSE)) {
+ /* Ok; (if we did a crash recovery then the tablespace
+ can already be in the memory cache) */
+ } else {
+ /* Try to open the tablespace */
+ if (!fil_open_single_table_tablespace(space, name)) {
+ /* We failed to find a sensible tablespace
+ file */
+
+ ibd_file_missing = TRUE;
+ }
+ }
+ }
+
ut_a(0 == ut_strcmp((char *) "N_COLS",
dict_field_get_col(
dict_index_get_nth_field(
@@ -661,6 +787,8 @@ dict_load_table(
table = dict_mem_table_create(name, space, n_cols);
+ table->ibd_file_missing = ibd_file_missing;
+
ut_a(0 == ut_strcmp((char *) "ID",
dict_field_get_col(
dict_index_get_nth_field(
@@ -1021,7 +1149,7 @@ dict_load_foreign(
/* We store the type to the bits 24-31 of n_fields */
foreign->type = foreign->n_fields >> 24;
- foreign->n_fields = foreign->n_fields & 0xFFFFFF;
+ foreign->n_fields = foreign->n_fields & 0xFFFFFFUL;
foreign->id = mem_heap_alloc(foreign->heap, ut_strlen(id) + 1);
diff --git a/innobase/dict/dict0mem.c b/innobase/dict/dict0mem.c
index c9eb7a9d8bd..c49738a0960 100644
--- a/innobase/dict/dict0mem.c
+++ b/innobase/dict/dict0mem.c
@@ -56,6 +56,8 @@ dict_mem_table_create(
table->type = DICT_TABLE_ORDINARY;
table->name = str;
table->space = space;
+ table->ibd_file_missing = FALSE;
+ table->tablespace_discarded = FALSE;
table->n_def = 0;
table->n_cols = n_cols + DATA_N_SYS_COLS;
table->mem_fix = 0;
diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c
index 9f33013d2f9..768dda4eedc 100644
--- a/innobase/fil/fil0fil.c
+++ b/innobase/fil/fil0fil.c
@@ -1,5 +1,5 @@
/******************************************************
-The low-level file system
+The tablespace memory cache
(c) 1995 Innobase Oy
@@ -16,16 +16,22 @@ Created 10/25/1995 Heikki Tuuri
#include "mach0data.h"
#include "ibuf0ibuf.h"
#include "buf0buf.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
#include "log0log.h"
#include "log0recv.h"
#include "fsp0fsp.h"
#include "srv0srv.h"
+#include "srv0start.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+
/*
- IMPLEMENTATION OF THE LOW-LEVEL FILE SYSTEM
- ===========================================
+ IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE
+ =============================================
-The file system is responsible for providing fast read/write access to
+The tablespace cache is responsible for providing fast read/write access to
tablespaces and logs of the database. File creation and deletion is done
in other modules which know more of the logic of the operation, however.
@@ -77,26 +83,42 @@ out of the LRU-list and keep a count of pending operations. When an operation
completes, we decrement the count and return the file node to the LRU-list if
the count drops to zero. */
+/* When mysqld is run, the default directory "." is the mysqld datadir,
+but in the MySQL Embedded Server Library and ibbackup it is not the default
+directory, and we must set the base file path explicitly */
+char* fil_path_to_mysql_datadir = (char*)".";
+
ulint fil_n_pending_log_flushes = 0;
ulint fil_n_pending_tablespace_flushes = 0;
/* Null file address */
fil_addr_t fil_addr_null = {FIL_NULL, 0};
-/* File system file node data structure */
+/* File node of a tablespace or the log data space */
typedef struct fil_node_struct fil_node_t;
struct fil_node_struct {
- char* name; /* the file name or path */
+ fil_space_t* space; /* backpointer to the space where this node
+ belongs */
+ char* name; /* path to the file */
ibool open; /* TRUE if file open */
os_file_t handle; /* OS handle to the file, if file open */
- ulint size; /* size of the file in database pages
- (where the possible last incomplete megabyte
- is ignored) */
+ ibool is_raw_disk;/* TRUE if the 'file' is actually a raw
+ device or a raw disk partition */
+ ulint size; /* size of the file in database pages, 0 if
+ not known yet; the possible last incomplete
+ megabyte is ignored if space == 0 */
ulint n_pending;
- /* count of pending i/o-ops on this file */
- ibool is_modified; /* this is set to TRUE when we write
- to the file and FALSE when we call fil_flush
- for this file space */
+ /* count of pending i/o's on this file;
+ closing of the file is not allowed if
+ this is > 0 */
+ ulint n_pending_flushes;
+ /* count of pending flushes on this file;
+ closing of the file is not allowed if
+ this is > 0 */
+ ib_longlong modification_counter;/* when we write to the file we
+ increment this by one */
+ ib_longlong flush_counter;/* up to what modification_counter value
+ we have flushed the modifications to disk */
UT_LIST_NODE_T(fil_node_t) chain;
/* link field for the file chain */
UT_LIST_NODE_T(fil_node_t) LRU;
@@ -106,19 +128,52 @@ struct fil_node_struct {
#define FIL_NODE_MAGIC_N 89389
-/* File system tablespace or log data structure: let us call them by a common
-name space */
+/* Tablespace or log data space: let us call them by a common name space */
struct fil_space_struct {
- char* name; /* space name */
+ char* name; /* space name = the path to the first file in
+ it */
ulint id; /* space id */
+ ib_longlong tablespace_version;
+ /* in DISCARD/IMPORT this timestamp is used to
+ check if we should ignore an insert buffer
+ merge request for a page because it actually
+ was for the previous incarnation of the
+ space */
+ ibool mark; /* this is set to TRUE at database startup if
+ the space corresponds to a table in the InnoDB
+ data dictionary; so we can print a warning of
+ orphaned tablespaces */
+ ibool stop_ios;/* TRUE if we want to rename the .ibd file of
+ tablespace and want to stop temporarily
+ posting of new i/o requests on the file */
+ ibool stop_ibuf_merges;
+ /* we set this TRUE when we start deleting a
+ single-table tablespace */
+ ibool is_being_deleted;
+ /* this is set to TRUE when we start
+ deleting a single-table tablespace and its
+ file; when this flag is set no further i/o
+ or flush requests can be placed on this space,
+ though there may be such requests still being
+ processed on this space */
ulint purpose;/* FIL_TABLESPACE, FIL_LOG, or FIL_ARCH_LOG */
UT_LIST_BASE_NODE_T(fil_node_t) chain;
/* base node for the file chain */
- ulint size; /* space size in pages */
+ ulint size; /* space size in pages; 0 if a single-table
+ tablespace whose size we do not know yet */
ulint n_reserved_extents;
/* number of reserved free extents for
ongoing operations like B-tree page split */
+ ulint n_pending_flushes; /* this is > 0 when flushing
+ the tablespace to disk; dropping of the
+ tablespace is forbidden if this is > 0 */
+ ulint n_pending_ibuf_merges;/* this is > 0 when merging
+ insert buffer entries to a page so that we
+ may need to access the ibuf bitmap page in the
+ tablespace: dropping of the tablespace is
+ forbidden if this is > 0 */
hash_node_t hash; /* hash chain node */
+ hash_node_t name_hash;/* hash chain node in the name_hash table */
rw_lock_t latch; /* latch protecting the file space storage
allocation */
UT_LIST_NODE_T(fil_space_t) space_list;
@@ -130,80 +185,126 @@ struct fil_space_struct {
#define FIL_SPACE_MAGIC_N 89472
-/* The file system data structure */
+/* The tablespace memory cache; also the totality of logs = the log data space,
+is stored here; below we talk about tablespaces, but also the ib_logfiles
+form a 'space' and it is handled here */
typedef struct fil_system_struct fil_system_t;
struct fil_system_struct {
- mutex_t mutex; /* The mutex protecting the system */
+ mutex_t mutex; /* The mutex protecting the cache */
hash_table_t* spaces; /* The hash table of spaces in the
- system */
+ system; they are hashed on the space
+ id */
+ hash_table_t* name_hash; /* hash table based on the space
+ name */
UT_LIST_BASE_NODE_T(fil_node_t) LRU;
/* base node for the LRU list of the
- most recently used open files */
- ulint n_open_pending; /* current number of open files with
- pending i/o-ops on them */
- ulint max_n_open; /* maximum allowed open files */
- os_event_t can_open; /* this event is set to the signaled
- state when the system is capable of
- opening a new file, i.e.,
- n_open_pending < max_n_open */
+ most recently used open files with no
+ pending i/o's; if we start an i/o on
+ the file, we first remove it from this
+ list, and return it to the start of
+ the list when the i/o ends;
+ log files and the system tablespace are
+ not put to this list: they are opened
+ after the startup, and kept open until
+ shutdown */
+ ulint n_open; /* number of files currently open */
+ ulint max_n_open; /* n_open is not allowed to exceed
+ this */
+ ib_longlong modification_counter;/* when we write to a file we
+ increment this by one */
+ ulint max_assigned_id;/* maximum space id in the existing
+ tables, or assigned during the time
+ mysqld has been up; at an InnoDB
+ startup we scan the data dictionary
+ and set here the maximum of the
+ space id's of the tables there */
+ ib_longlong tablespace_version;
+ /* a counter which is incremented for
+ every space object memory creation;
+ every space mem object gets a
+ 'timestamp' from this; in DISCARD/
+ IMPORT this is used to check if we
+ should ignore an insert buffer merge
+ request */
UT_LIST_BASE_NODE_T(fil_space_t) space_list;
/* list of all file spaces */
};
-/* The file system. This variable is NULL before the module is initialized. */
+/* The tablespace memory cache. This variable is NULL before the module is
+initialized. */
fil_system_t* fil_system = NULL;
-/* The file system hash table size */
-#define FIL_SYSTEM_HASH_SIZE 500
+/* The tablespace memory cache hash table size */
+#define FIL_SYSTEM_HASH_SIZE 50 /* TODO: make bigger! */
-/***********************************************************************
-Reserves a right to open a single file. The right must be released with
-fil_release_right_to_open. */
+/************************************************************************
+NOTE: you must call fil_mutex_enter_and_prepare_for_io() first!
+Prepares a file node for i/o. Opens the file if it is closed. Updates the
+pending i/o's field in the node and the system appropriately. Takes the node
+off the LRU list if it is in the LRU list. The caller must hold the fil_sys
+mutex. */
+static
void
-fil_reserve_right_to_open(void)
-/*===========================*/
-{
-loop:
- mutex_enter(&(fil_system->mutex));
-
- if (fil_system->n_open_pending == fil_system->max_n_open) {
-
- /* It is not sure we can open the file if it is closed: wait */
-
- os_event_reset(fil_system->can_open);
-
- mutex_exit(&(fil_system->mutex));
+fil_node_prepare_for_io(
+/*====================*/
+ fil_node_t* node, /* in: file node */
+ fil_system_t* system, /* in: tablespace memory cache */
+ fil_space_t* space); /* in: space */
+/************************************************************************
+Updates the data structures when an i/o operation finishes. Updates the
+pending i/o's field in the node appropriately. */
+static
+void
+fil_node_complete_io(
+/*=================*/
+ fil_node_t* node, /* in: file node */
+ fil_system_t* system, /* in: tablespace memory cache */
+ ulint type); /* in: OS_FILE_WRITE or OS_FILE_READ; marks
+ the node as modified if
+ type == OS_FILE_WRITE */
+/***********************************************************************
+Checks if a single-table tablespace for a given table name exists in the
+tablespace memory cache. */
+static
+ulint
+fil_get_space_id_for_table(
+/*=======================*/
+ /* out: space id, ULINT_UNDEFINED if not
+ found */
+ char* name); /* in: table name in the standard
+ 'databasename/tablename' format */
- os_event_wait(fil_system->can_open);
- goto loop;
- }
+/***********************************************************************
+Returns the version number of a tablespace, -1 if not found. */
- fil_system->max_n_open--;
+ib_longlong
+fil_space_get_version(
+/*==================*/
+ /* out: version number, -1 if the tablespace does not
+ exist in the memory cache */
+ ulint id) /* in: space id */
+{
+ fil_system_t* system = fil_system;
+ fil_space_t* space;
+ ib_longlong version = -1;
- mutex_exit(&(fil_system->mutex));
-}
+ ut_ad(system);
-/***********************************************************************
-Releases a right to open a single file. */
+ mutex_enter(&(system->mutex));
-void
-fil_release_right_to_open(void)
-/*===========================*/
-{
- mutex_enter(&(fil_system->mutex));
-
- if (fil_system->n_open_pending == fil_system->max_n_open) {
+ HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
- os_event_set(fil_system->can_open);
+ if (space) {
+ version = space->tablespace_version;
}
- fil_system->max_n_open++;
+ mutex_exit(&(system->mutex));
- mutex_exit(&(fil_system->mutex));
+ return(version);
}
/***********************************************************************
@@ -215,8 +316,8 @@ fil_space_get_latch(
/* out: latch protecting storage allocation */
ulint id) /* in: space id */
{
- fil_space_t* space;
fil_system_t* system = fil_system;
+ fil_space_t* space;
ut_ad(system);
@@ -224,6 +325,8 @@ fil_space_get_latch(
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+ ut_a(space);
+
mutex_exit(&(system->mutex));
return(&(space->latch));
@@ -238,8 +341,8 @@ fil_space_get_type(
/* out: FIL_TABLESPACE or FIL_LOG */
ulint id) /* in: space id */
{
- fil_space_t* space;
fil_system_t* system = fil_system;
+ fil_space_t* space;
ut_ad(system);
@@ -247,6 +350,8 @@ fil_space_get_type(
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+ ut_a(space);
+
mutex_exit(&(system->mutex));
return(space->purpose);
@@ -261,17 +366,21 @@ fil_space_get_ibuf_data(
/* out: ibuf data for this space */
ulint id) /* in: space id */
{
+ fil_system_t* system = fil_system;
fil_space_t* space;
- fil_system_t* system = fil_system;
ut_ad(system);
+ ut_a(id == 0);
+
mutex_enter(&(system->mutex));
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
mutex_exit(&(system->mutex));
+ ut_a(space);
+
return(space->ibuf_data);
}
@@ -284,16 +393,16 @@ fil_node_create(
char* name, /* in: file name (file must be closed) */
ulint size, /* in: file size in database blocks, rounded downwards
to an integer */
- ulint id) /* in: space id where to append */
+ ulint id, /* in: space id where to append */
+ ibool is_raw) /* in: TRUE if a raw device or a raw disk partition */
{
+ fil_system_t* system = fil_system;
fil_node_t* node;
fil_space_t* space;
char* name2;
- fil_system_t* system = fil_system;
ut_a(system);
ut_a(name);
- ut_a(size > 0);
mutex_enter(&(system->mutex));
@@ -305,29 +414,122 @@ fil_node_create(
node->name = name2;
node->open = FALSE;
+
+ ut_a(!is_raw || srv_start_raw_disk_in_use);
+
+ node->is_raw_disk = is_raw;
node->size = size;
node->magic_n = FIL_NODE_MAGIC_N;
node->n_pending = 0;
+ node->n_pending_flushes = 0;
- node->is_modified = FALSE;
+ node->modification_counter = 0;
+ node->flush_counter = 0;
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+ if (!space) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Error: Could not find tablespace %lu for\n"
+"InnoDB: file %s from the tablespace memory cache.\n", (ulong) id, name);
+ mem_free(name2);
+
+ mem_free(node);
+
+ mutex_exit(&(system->mutex));
+
+ return;
+ }
+
space->size += size;
+ node->space = space;
+
UT_LIST_ADD_LAST(chain, space->chain, node);
mutex_exit(&(system->mutex));
}
+/************************************************************************
+Opens the file of a node of a tablespace. The caller must own the fil_system
+mutex. */
+static
+void
+fil_node_open_file(
+/*===============*/
+ fil_node_t* node, /* in: file node */
+ fil_system_t* system, /* in: tablespace memory cache */
+ fil_space_t* space) /* in: space */
+{
+ ib_longlong size_bytes;
+ ulint size_low;
+ ulint size_high;
+ ibool ret;
+
+ ut_ad(mutex_own(&(system->mutex)));
+
+ ut_a(node->n_pending == 0);
+ ut_a(node->open == FALSE);
+
+ /* printf("Opening file %s\n", node->name); */
+
+ if (space->purpose == FIL_LOG) {
+ node->handle = os_file_create(node->name, OS_FILE_OPEN,
+ OS_FILE_AIO, OS_LOG_FILE, &ret);
+ } else if (node->is_raw_disk) {
+ node->handle = os_file_create(node->name,
+ OS_FILE_OPEN_RAW,
+ OS_FILE_AIO, OS_DATA_FILE, &ret);
+ } else {
+ node->handle = os_file_create(node->name, OS_FILE_OPEN,
+ OS_FILE_AIO, OS_DATA_FILE, &ret);
+ }
+
+ ut_a(ret);
+
+ node->open = TRUE;
+
+ system->n_open++;
+
+ if (node->size == 0) {
+ os_file_get_size(node->handle, &size_low, &size_high);
+
+ size_bytes = (((ib_longlong)size_high) << 32)
+ + (ib_longlong)size_low;
+#ifdef UNIV_HOTBACKUP
+ node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
+
+#else
+ /* It must be a single-table tablespace and we do not know the
+ size of the file yet */
+
+ ut_a(space->id != 0);
+
+ if (size_bytes >= FSP_EXTENT_SIZE * UNIV_PAGE_SIZE) {
+ node->size = (ulint) ((size_bytes / (1024 * 1024))
+ * ((1024 * 1024) / UNIV_PAGE_SIZE));
+ } else {
+ node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
+ }
+#endif
+ space->size += node->size;
+ }
+
+ if (space->purpose == FIL_TABLESPACE && space->id != 0) {
+ /* Put the node to the LRU list */
+ UT_LIST_ADD_FIRST(LRU, system->LRU, node);
+ }
+}
+
/**************************************************************************
Closes a file. */
static
void
-fil_node_close(
-/*===========*/
+fil_node_close_file(
+/*================*/
fil_node_t* node, /* in: file node */
- fil_system_t* system) /* in: file system */
+ fil_system_t* system) /* in: tablespace memory cache */
{
ibool ret;
@@ -337,24 +539,208 @@ fil_node_close(
#endif /* UNIV_SYNC_DEBUG */
ut_a(node->open);
ut_a(node->n_pending == 0);
+ ut_a(node->n_pending_flushes == 0);
+ ut_a(node->modification_counter == node->flush_counter);
ret = os_file_close(node->handle);
ut_a(ret);
+ /* printf("Closing file %s\n", node->name); */
+
node->open = FALSE;
+ ut_a(system->n_open > 0);
+ system->n_open--;
+
+ if (node->space->purpose == FIL_TABLESPACE && node->space->id != 0) {
+ ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
+
+ /* The node is in the LRU list, remove it */
+ UT_LIST_REMOVE(LRU, system->LRU, node);
+ }
+}
+
+/************************************************************************
+Tries to close one file in the LRU list of open files. The list is scanned
+starting from its last node, and the first node which has no unflushed
+modifications and no pending flushes is closed. The caller must hold the
+fil_sys mutex. */
+static
+ibool
+fil_try_to_close_file_in_LRU(
+/*=========================*/
+ /* out: TRUE if a file was closed, FALSE if the
+ caller should retry later; since i/o's
+ generally complete in < 100 ms, and as InnoDB
+ writes at most 128 pages from the buffer pool
+ in a batch, and then immediately flushes the
+ files, there is a good chance that a suitable
+ node is found in the LRU list on the next
+ attempt */
+ ibool print_info) /* in: if TRUE, prints to stderr the reason
+ why each examined file cannot be closed */
+{
+ fil_system_t* system = fil_system;
+ fil_node_t* node;
+
+ ut_ad(mutex_own(&(system->mutex)));
+
+ /* Start from the last node of the list; files are added to the front
+ of the list when they are opened */
+ node = UT_LIST_GET_LAST(system->LRU);
+
+ if (print_info) {
+ fprintf(stderr,
+"InnoDB: fil_sys open file LRU len %lu\n", (ulong) UT_LIST_GET_LEN(system->LRU));
+ }
+
+ while (node != NULL) {
+ /* A file can be closed only if everything written to it has
+ been flushed and no flush is currently in progress; these are
+ the same conditions that fil_node_close_file() asserts */
+ if (node->modification_counter == node->flush_counter
+ && node->n_pending_flushes == 0) {
+
+ fil_node_close_file(node, system);
+
+ return(TRUE);
+ }
+
+ if (print_info && node->n_pending_flushes > 0) {
+ fprintf(stderr,
+"InnoDB: cannot close file %s, because n_pending_flushes %lu\n", node->name,
+ (ulong) node->n_pending_flushes);
+ }
+
+ if (print_info
+ && node->modification_counter != node->flush_counter) {
+ fprintf(stderr,
+"InnoDB: cannot close file %s, because mod_count %lld != fl_count %lld\n",
+ node->name, node->modification_counter,
+ node->flush_counter);
+ }
+
+ /* Not closable: move on to the previous node in the list */
+ node = UT_LIST_GET_PREV(LRU, node);
+ }
- /* The node is in the LRU list, remove it */
- UT_LIST_REMOVE(LRU, system->LRU, node);
+ /* No node in the list could be closed */
+ return(FALSE);
}
/***********************************************************************
-Frees a file node object from a file system. */
+Reserves the fil_system mutex and tries to make sure we can open at least one
+file while holding it. This should be called before calling
+fil_node_prepare_for_io(), because that function may need to open a file.
+On every return path the calling thread owns the fil_system mutex; the mutex
+is released only temporarily while this function sleeps or flushes. Note that
+the function may return although the number of open files is still at the
+maximum: it gives up after printing a warning. */
+static
+void
+fil_mutex_enter_and_prepare_for_io(
+/*===============================*/
+ ulint space_id) /* in: space id */
+{
+ fil_system_t* system = fil_system;
+ fil_space_t* space;
+ ibool success;
+ ibool print_info = FALSE;
+ ulint count = 0; /* number of flush-and-retry rounds done */
+ ulint count2 = 0; /* number of sleeps done while i/o's to
+ the space were stopped */
+
+ ut_ad(!mutex_own(&(system->mutex)));
+retry:
+ mutex_enter(&(system->mutex));
+
+ if (space_id == 0 || space_id >= SRV_LOG_SPACE_FIRST_ID) {
+ /* We keep log files and system tablespace files always open;
+ this is important in preventing deadlocks in this module, as
+ a page read completion often performs another read from the
+ insert buffer. The insert buffer is in tablespace 0, and we
+ cannot end up waiting in this function. */
+
+ return;
+ }
+
+ if (system->n_open < system->max_n_open) {
+ /* There is room to open one more file */
+
+ return;
+ }
+
+ HASH_SEARCH(hash, system->spaces, space_id, space,
+ space->id == space_id);
+ if (space != NULL && space->stop_ios) {
+ /* We are going to do a rename file and want to stop new i/o's
+ for a while */
+
+ /* NOTE(review): once count2 has passed the limit, this
+ warning is printed again on every retry round */
+ if (count2 > 20000) {
+ fprintf(stderr,
+"InnoDB: Warning: tablespace %s has i/o ops stopped for a long time %lu\n",
+ space->name,
+ (ulong) count2);
+ }
+
+ mutex_exit(&(system->mutex));
+
+ /* presumably the argument is in microseconds, i.e., a 20 ms
+ sleep; confirm from os_thread_sleep() */
+ os_thread_sleep(20000);
+
+ count2++;
+
+ goto retry;
+ }
+
+ /* If the file is already open, no need to do anything; if the space
+ does not exist, we handle the situation in the function which called
+ this function */
+
+ /* NOTE(review): the first node of the chain is dereferenced without a
+ NULL check; this assumes the space has at least one file -- confirm */
+ if (!space || UT_LIST_GET_FIRST(space->chain)->open) {
+
+ return;
+ }
+
+ /* From the third round on, print why files cannot be closed */
+ if (count > 1) {
+ print_info = TRUE;
+ }
+
+ /* Too many files are open, try to close some */
+close_more:
+ success = fil_try_to_close_file_in_LRU(print_info);
+
+ /* Keep closing files as long as a close succeeds and we are still at
+ or above the limit */
+ if (success && system->n_open >= system->max_n_open) {
+
+ goto close_more;
+ }
+
+ if (system->n_open < system->max_n_open) {
+ /* Ok */
+
+ return;
+ }
+
+ /* We have already retried twice: print a warning and return with the
+ mutex held, even though the number of open files is at the limit */
+ /* NOTE(review): this message names the option innodb_max_files_open,
+ while the warning in fil_open_log_and_system_tablespace_files() names
+ innodb_max_open_files; confirm the real option name */
+ if (count >= 2) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Warning: too many (%lu) files stay open while the maximum\n"
+"InnoDB: allowed value would be %lu.\n"
+"InnoDB: You may need to raise the value of innodb_max_files_open in\n"
+"InnoDB: my.cnf.\n", (ulong) system->n_open, (ulong) system->max_n_open);
+
+ return;
+ }
+
+ mutex_exit(&(system->mutex));
+
+#ifndef UNIV_HOTBACKUP
+ /* Wake the i/o-handler threads to make sure pending i/o's are
+ performed */
+ os_aio_simulated_wake_handler_threads();
+
+ os_thread_sleep(20000);
+#endif
+ /* Flush tablespaces so that we can close modified files in the LRU
+ list */
+
+ fil_flush_file_spaces(FIL_TABLESPACE);
+
+ count++;
+
+ goto retry;
+}
+
+/***********************************************************************
+Frees a file node object from a tablespace memory cache. */
static
void
fil_node_free(
/*==========*/
fil_node_t* node, /* in, own: file node */
- fil_system_t* system, /* in: file system */
+ fil_system_t* system, /* in: tablespace memory cache */
fil_space_t* space) /* in: space where the file node is chained */
{
ut_ad(node && system && space);
@@ -362,9 +748,15 @@ fil_node_free(
ut_ad(mutex_own(&(system->mutex)));
#endif /* UNIV_SYNC_DEBUG */
ut_a(node->magic_n == FIL_NODE_MAGIC_N);
+ ut_a(node->n_pending == 0);
if (node->open) {
- fil_node_close(node, system);
+ /* We fool the assertion in fil_node_close_file() to think
+ there are no unflushed modifications in the file */
+
+ node->modification_counter = node->flush_counter;
+
+ fil_node_close_file(node, system);
}
space->size -= node->size;
@@ -387,9 +779,9 @@ fil_space_truncate_start(
if this does not equal to the combined size of
some initial files in the space */
{
+ fil_system_t* system = fil_system;
fil_node_t* node;
fil_space_t* space;
- fil_system_t* system = fil_system;
mutex_enter(&(system->mutex));
@@ -398,7 +790,6 @@ fil_space_truncate_start(
ut_a(space);
while (trunc_len > 0) {
-
node = UT_LIST_GET_FIRST(space->chain);
ut_a(node->size * UNIV_PAGE_SIZE >= trunc_len);
@@ -409,17 +800,346 @@ fil_space_truncate_start(
}
mutex_exit(&(system->mutex));
-}
+}
+
+/***********************************************************************
+Creates a space memory object and puts it to the tablespace memory cache. If
+there is an error, prints an error message to the .err log.
+
+If a space with the same name is already cached, the call fails for space
+id 0 and for spaces whose purpose is not FIL_TABLESPACE; otherwise the
+namesake is freed from the cache and the creation is retried. If a space with
+the same id is already cached, the call fails. */
+
+ibool
+fil_space_create(
+/*=============*/
+ /* out: TRUE if success */
+ char* name, /* in: space name; a copy of the string is
+ stored in the space object */
+ ulint id, /* in: space id */
+ ulint purpose)/* in: FIL_TABLESPACE, or FIL_LOG if log */
+{
+ fil_system_t* system = fil_system;
+ fil_space_t* space;
+ char* name2;
+ ulint namesake_id;
+try_again:
+ /*printf(
+ "InnoDB: Adding tablespace %lu of name %s, purpose %lu\n", id, name,
+ purpose);*/
+
+ ut_a(system);
+ ut_a(name);
+
+ mutex_enter(&(system->mutex));
+
+ /* First check that the name is not in use yet */
+ HASH_SEARCH(name_hash, system->name_hash, ut_fold_string(name), space,
+ 0 == strcmp(name, space->name));
+ if (space != NULL) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Warning: trying to init to the tablespace memory cache\n"
+"InnoDB: a tablespace %lu of name %s,\n"
+"InnoDB: but a tablespace %lu of the same name %s\n"
+"InnoDB: already exists in the tablespace memory cache!\n",
+ (ulong) id, name,
+ (ulong) space->id, space->name);
+
+ if (id == 0 || purpose != FIL_TABLESPACE) {
+
+ mutex_exit(&(system->mutex));
+
+ return(FALSE);
+ }
+
+ fprintf(stderr,
+"InnoDB: We assume that InnoDB did a crash recovery, and you had\n"
+"InnoDB: an .ibd file for which the table did not exist in the\n"
+"InnoDB: InnoDB internal data dictionary in the ibdata files.\n"
+"InnoDB: We assume that you later removed the .ibd and .frm files,\n"
+"InnoDB: and are now trying to recreate the table. We now remove the\n"
+"InnoDB: conflicting tablespace object from the memory cache and try\n"
+"InnoDB: the init again.\n");
+
+ /* Remember the id before releasing the mutex: fil_space_free()
+ reserves the mutex itself and looks the space up by its id */
+ namesake_id = space->id;
+
+ mutex_exit(&(system->mutex));
+
+ fil_space_free(namesake_id);
+
+ goto try_again;
+ }
+
+ /* Then check that the id is not in use yet */
+ HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+ if (space != NULL) {
+ fprintf(stderr,
+"InnoDB: Error: trying to add tablespace %lu of name %s\n"
+"InnoDB: to the tablespace memory cache, but tablespace\n"
+"InnoDB: %lu of name %s already exists in the tablespace\n"
+"InnoDB: memory cache!\n", (ulong) id, name, (ulong) space->id, space->name);
+
+ mutex_exit(&(system->mutex));
+
+ return(FALSE);
+ }
+
+ space = mem_alloc(sizeof(fil_space_t));
+
+ /* The space object gets its own copy of the name */
+ name2 = mem_alloc(ut_strlen(name) + 1);
+
+ ut_strcpy(name2, name);
+
+ space->name = name2;
+ space->id = id;
+
+ /* Each created space object gets the next value of the version
+ counter of the tablespace memory cache */
+ system->tablespace_version++;
+ space->tablespace_version = system->tablespace_version;
+ space->mark = FALSE;
+
+ /* Keep track of the biggest tablespace id seen so far */
+ if (purpose == FIL_TABLESPACE && id > system->max_assigned_id) {
+ system->max_assigned_id = id;
+ }
+
+ space->stop_ios = FALSE;
+ space->stop_ibuf_merges = FALSE;
+ space->is_being_deleted = FALSE;
+ space->purpose = purpose;
+ space->size = 0;
+
+ space->n_reserved_extents = 0;
+
+ space->n_pending_flushes = 0;
+ space->n_pending_ibuf_merges = 0;
+
+ UT_LIST_INIT(space->chain);
+ space->magic_n = FIL_SPACE_MAGIC_N;
+
+ space->ibuf_data = NULL;
+
+ rw_lock_create(&(space->latch));
+ rw_lock_set_level(&(space->latch), SYNC_FSP);
+
+ /* Make the space findable both by its id and by its name, and add it
+ to the list of all spaces */
+ HASH_INSERT(fil_space_t, hash, system->spaces, id, space);
+
+ HASH_INSERT(fil_space_t, name_hash, system->name_hash,
+ ut_fold_string(name), space);
+ UT_LIST_ADD_LAST(space_list, system->space_list, space);
+
+ mutex_exit(&(system->mutex));
+
+ return(TRUE);
+}
+
+/***********************************************************************
+Hands out the id for a new single-table tablespace by incrementing the global
+id counter of the tablespace memory cache. Ids are not recycled: once the
+counter would reach SRV_LOG_SPACE_FIRST_ID, no id can be assigned any more. */
+static
+ulint
+fil_assign_new_space_id(void)
+/*=========================*/
+			/* out: new tablespace id; ULINT_UNDEFINED if could
+			not assign an id */
+{
+	fil_system_t*	system	= fil_system;
+	ulint		new_id;
+
+	mutex_enter(&(system->mutex));
+
+	new_id = ++system->max_assigned_id;
+
+	/* In the upper half of the id range, warn at every millionth id */
+	if ((new_id % 1000000UL == 0)
+	    && new_id > (SRV_LOG_SPACE_FIRST_ID / 2)) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+"InnoDB: Warning: you are running out of new single-table tablespace id's.\n"
+"InnoDB: Current counter is %lu and it must not exceed %lu!\n"
+"InnoDB: To reset the counter to zero you have to dump all your tables and\n"
+"InnoDB: recreate the whole InnoDB installation.\n", (ulong) new_id,
+			(ulong) SRV_LOG_SPACE_FIRST_ID);
+	}
+
+	/* The id range is exhausted: undo the increment and report failure */
+	if (new_id >= SRV_LOG_SPACE_FIRST_ID) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+"InnoDB: You have run out of single-table tablespace id's!\n"
+"InnoDB: Current counter is %lu.\n"
+"InnoDB: To reset the counter to zero you have to dump all your tables and\n"
+"InnoDB: recreate the whole InnoDB installation.\n", (ulong) new_id);
+		system->max_assigned_id--;
+
+		new_id = ULINT_UNDEFINED;
+	}
+
+	mutex_exit(&(system->mutex));
+
+	return(new_id);
+}
+
+/***********************************************************************
+Removes a space object from the tablespace memory cache and frees its memory.
+The files in the chain are closed but they are not deleted. There must not be
+any pending i/o's or flushes on the files. */
+
+ibool
+fil_space_free(
+/*===========*/
+			/* out: TRUE if success */
+	ulint	id)	/* in: space id */
+{
+	fil_system_t*	system	= fil_system;
+	fil_space_t*	space;
+	fil_space_t*	same_name_space;
+	fil_node_t*	node;
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	if (space == NULL) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+" InnoDB: Error: trying to remove tablespace %lu from the cache but\n"
+"InnoDB: it is not there.\n", (ulong) id);
+
+		mutex_exit(&(system->mutex));
+
+		return(FALSE);
+	}
+
+	/* Unlink the space from the id hash */
+	HASH_DELETE(fil_space_t, hash, system->spaces, id, space);
+
+	/* The name hash must contain this very same object under its name */
+	HASH_SEARCH(name_hash, system->name_hash, ut_fold_string(space->name),
+		same_name_space,
+		0 == strcmp(space->name, same_name_space->name));
+	ut_a(same_name_space);
+	ut_a(space == same_name_space);
+
+	HASH_DELETE(fil_space_t, name_hash, system->name_hash,
+					ut_fold_string(space->name), space);
+
+	UT_LIST_REMOVE(space_list, system->space_list, space);
+
+	ut_a(space->magic_n == FIL_SPACE_MAGIC_N);
+	ut_a(0 == space->n_pending_flushes);
+
+	/* Free the file nodes one by one, always taking the current first
+	node of the chain, until the chain is empty */
+	while ((node = UT_LIST_GET_FIRST(space->chain)) != NULL) {
+		fil_node_free(node, system, space);
+	}
+
+	ut_a(0 == UT_LIST_GET_LEN(space->chain));
+
+	mutex_exit(&(system->mutex));
+
+	/* The object is no longer reachable through the cache: release its
+	latch and memory outside the mutex */
+	rw_lock_free(&(space->latch));
+
+	mem_free(space->name);
+	mem_free(space);
+
+	return(TRUE);
+}
+
+#ifdef UNIV_HOTBACKUP
+/***********************************************************************
+Looks up a tablespace object by its id in the tablespace memory cache. */
+static
+fil_space_t*
+fil_get_space_for_id_low(
+/*=====================*/
+			/* out: tablespace object, or NULL if the id is not
+			found in the cache; NOTE that you must own
+			&(fil_system->mutex) to call this function! */
+	ulint	id)	/* in: space id */
+{
+	fil_system_t*	system	= fil_system;
+	fil_space_t*	found;
+
+	ut_ad(system);
+
+	HASH_SEARCH(hash, system->spaces, id, found, found->id == id);
+
+	return(found);
+}
+#endif
+
+/***********************************************************************
+Returns the size of the space in pages. The tablespace must be cached in the
+memory cache. If the size of a tablespace is not known yet, the file is
+opened first so that the size fields get updated. */
+
+ulint
+fil_space_get_size(
+/*===============*/
+			/* out: space size, 0 if space not found */
+	ulint	id)	/* in: space id */
+{
+	fil_system_t*	system		= fil_system;
+	fil_space_t*	space;
+	fil_node_t*	first_node;
+	ulint		n_pages		= 0;
+
+	ut_ad(system);
+
+	/* This reserves the fil_system mutex */
+	fil_mutex_enter_and_prepare_for_io(id);
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	if (space != NULL) {
+		if (space->purpose == FIL_TABLESPACE && space->size == 0) {
+			/* It must be a single-table tablespace and we have
+			not opened the file yet; the following calls will open
+			it and update the size fields */
+
+			ut_a(id != 0);
+
+			ut_a(1 == UT_LIST_GET_LEN(space->chain));
+
+			first_node = UT_LIST_GET_FIRST(space->chain);
+
+			fil_node_prepare_for_io(first_node, system, space);
+			fil_node_complete_io(first_node, system, OS_FILE_READ);
+		}
+
+		n_pages = space->size;
+	}
+
+	mutex_exit(&(system->mutex));
+
+	return(n_pages);
+}
+
+/***********************************************************************
+Checks if the pair space, page_no refers to an existing page in a tablespace
+file space, that is, if the page number is smaller than the size of the space
+in pages. The tablespace must be cached in the memory cache. */
+
+ibool
+fil_check_adress_in_tablespace(
+/*===========================*/
+			/* out: TRUE if the address is meaningful */
+	ulint	id,	/* in: space id */
+	ulint	page_no)/* in: page number */
+{
+	/* A space which is not found has size 0, so no page number is valid
+	in it */
+	ulint	n_pages = fil_space_get_size(id);
+
+	return(page_no < n_pages ? TRUE : FALSE);
+}
/********************************************************************
-Creates a file system object. */
+Creates the tablespace memory cache. */
static
fil_system_t*
fil_system_create(
/*==============*/
- /* out, own: file system object */
+ /* out, own: tablespace memory cache */
ulint hash_size, /* in: hash table size */
- ulint max_n_open) /* in: maximum number of open files */
+ ulint max_n_open) /* in: maximum number of open files; must be
+ > 10 */
{
fil_system_t* system;
@@ -433,12 +1153,17 @@ fil_system_create(
mutex_set_level(&(system->mutex), SYNC_ANY_LATCH);
system->spaces = hash_create(hash_size);
+ system->name_hash = hash_create(hash_size);
UT_LIST_INIT(system->LRU);
- system->n_open_pending = 0;
+ system->n_open = 0;
system->max_n_open = max_n_open;
- system->can_open = os_event_create(NULL);
+
+ system->modification_counter = 0;
+ system->max_assigned_id = 0;
+
+ system->tablespace_version = 0;
UT_LIST_INIT(system->space_list);
@@ -446,7 +1171,7 @@ fil_system_create(
}
/********************************************************************
-Initializes the file system of this module. */
+Initializes the tablespace memory cache. */
void
fil_init(
@@ -455,11 +1180,120 @@ fil_init(
{
ut_a(fil_system == NULL);
+ /*printf("Initializing the tablespace cache with max %lu open files\n",
+ max_n_open); */
fil_system = fil_system_create(FIL_SYSTEM_HASH_SIZE, max_n_open);
}
+/***********************************************************************
+Opens all log files and system tablespace data files. They stay open until the
+database server shutdown. This should be called at a server startup after the
+space objects for the log and the system tablespace have been created. The
+purpose of this operation is to make sure we never run out of file descriptors
+if we need to read from the insert buffer or to write to the log. */
+
+void
+fil_open_log_and_system_tablespace_files(void)
+/*==========================================*/
+{
+	fil_system_t*	system	= fil_system;
+	fil_space_t*	space;
+	fil_node_t*	node;
+
+	mutex_enter(&(system->mutex));
+
+	for (space = UT_LIST_GET_FIRST(system->space_list);
+	     space != NULL;
+	     space = UT_LIST_GET_NEXT(space_list, space)) {
+
+		if (space->purpose == FIL_TABLESPACE && space->id != 0) {
+			/* Not a log and not the system tablespace: the files
+			of such a space are not opened here */
+
+			continue;
+		}
+
+		for (node = UT_LIST_GET_FIRST(space->chain);
+		     node != NULL;
+		     node = UT_LIST_GET_NEXT(chain, node)) {
+
+			if (!node->open) {
+				fil_node_open_file(node, system, space);
+			}
+
+			/* Warn if fewer than 10 further files could be
+			opened on top of the ones which are open now */
+			if (system->max_n_open < 10 + system->n_open) {
+				fprintf(stderr,
+"InnoDB: Warning: you must raise the value of innodb_max_open_files in\n"
+"InnoDB: my.cnf! Remember that InnoDB keeps all log files and all system\n"
+"InnoDB: tablespace files open for the whole time mysqld is running, and\n"
+"InnoDB: needs to open also some .ibd files if the file-per-table storage\n"
+"InnoDB: model is used. Current open files %lu, max allowed open files %lu.\n",
+					(ulong) system->n_open,
+					(ulong) system->max_n_open);
+			}
+		}
+	}
+
+	mutex_exit(&(system->mutex));
+}
+
+/***********************************************************************
+Closes every open file of every space in the tablespace memory cache. There
+must not be any pending i/o's or not flushed modifications in the files. */
+
+void
+fil_close_all_files(void)
+/*=====================*/
+{
+	fil_system_t*	system	= fil_system;
+	fil_space_t*	space;
+	fil_node_t*	node;
+
+	mutex_enter(&(system->mutex));
+
+	for (space = UT_LIST_GET_FIRST(system->space_list);
+	     space != NULL;
+	     space = UT_LIST_GET_NEXT(space_list, space)) {
+
+		for (node = UT_LIST_GET_FIRST(space->chain);
+		     node != NULL;
+		     node = UT_LIST_GET_NEXT(chain, node)) {
+
+			if (node->open) {
+				fil_node_close_file(node, system);
+			}
+		}
+	}
+
+	mutex_exit(&(system->mutex));
+}
+
+/***********************************************************************
+Raises the max tablespace id counter to the given number, if the number is
+bigger than the current value of the counter. An id which is not below
+SRV_LOG_SPACE_FIRST_ID is a fatal error. */
+
+void
+fil_set_max_space_id_if_bigger(
+/*===========================*/
+	ulint	max_id)	/* in: maximum known id */
+{
+	fil_system_t*	system	= fil_system;
+
+	if (max_id >= SRV_LOG_SPACE_FIRST_ID) {
+		fprintf(stderr,
+"InnoDB: Fatal error: max tablespace id is too high, %lu\n", (ulong) max_id);
+		ut_a(0);
+	}
+
+	mutex_enter(&(system->mutex));
+
+	if (max_id > system->max_assigned_id) {
+		system->max_assigned_id = max_id;
+	}
+
+	mutex_exit(&(system->mutex));
+}
+
/********************************************************************
-Writes the flushed lsn to the header of each file space. */
+Initializes the ibuf data structure for space 0 == the system tablespace.
+This can be called after the file space headers have been created and the
+dictionary system has been initialized. */
void
fil_ibuf_init_at_db_start(void)
@@ -468,39 +1302,37 @@ fil_ibuf_init_at_db_start(void)
fil_space_t* space;
space = UT_LIST_GET_FIRST(fil_system->space_list);
-
- while (space) {
- if (space->purpose == FIL_TABLESPACE) {
- space->ibuf_data = ibuf_data_init_for_space(space->id);
- }
- space = UT_LIST_GET_NEXT(space_list, space);
- }
+ ut_a(space);
+ ut_a(space->purpose == FIL_TABLESPACE);
+
+ space->ibuf_data = ibuf_data_init_for_space(space->id);
}
/********************************************************************
-Writes the flushed lsn and the latest archived log number to the page
-header of the first page of a data file. */
+Writes the flushed lsn and the latest archived log number to the page header
+of the first page of a data file. */
static
ulint
fil_write_lsn_and_arch_no_to_file(
/*==============================*/
ulint space_id, /* in: space number */
- ulint sum_of_sizes, /* in: combined size of previous files in space,
- in database pages */
+ ulint sum_of_sizes, /* in: combined size of previous files in
+ space, in database pages */
dulint lsn, /* in: lsn to write */
ulint arch_log_no) /* in: archived log number to write */
{
byte* buf1;
byte* buf;
+ UT_NOT_USED(arch_log_no);
+
buf1 = mem_alloc(2 * UNIV_PAGE_SIZE);
buf = ut_align(buf1, UNIV_PAGE_SIZE);
fil_read(TRUE, space_id, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn);
- mach_write_to_4(buf + FIL_PAGE_ARCH_LOG_NO, arch_log_no);
fil_write(TRUE, space_id, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
@@ -509,7 +1341,7 @@ fil_write_lsn_and_arch_no_to_file(
/********************************************************************
Writes the flushed lsn and the latest archived log number to the page
-header of the first page of each data file. */
+header of the first page of each data file in the system tablespace. */
ulint
fil_write_flushed_lsn_to_data_files(
@@ -528,18 +1360,22 @@ fil_write_flushed_lsn_to_data_files(
space = UT_LIST_GET_FIRST(fil_system->space_list);
while (space) {
+ /* We only write the lsn to all existing data files which have
+ been open during the lifetime of the mysqld process; they are
+ represented by the space objects in the tablespace memory
+ cache. Note that all data files in the system tablespace 0 are
+ always open. */
+
if (space->purpose == FIL_TABLESPACE) {
sum_of_sizes = 0;
node = UT_LIST_GET_FIRST(space->chain);
-
while (node) {
mutex_exit(&(fil_system->mutex));
err = fil_write_lsn_and_arch_no_to_file(
- space->id,
- sum_of_sizes,
- lsn, arch_log_no);
+ space->id, sum_of_sizes,
+ lsn, arch_log_no);
if (err != DB_SUCCESS) {
return(err);
@@ -548,11 +1384,9 @@ fil_write_flushed_lsn_to_data_files(
mutex_enter(&(fil_system->mutex));
sum_of_sizes += node->size;
-
node = UT_LIST_GET_NEXT(chain, node);
}
}
-
space = UT_LIST_GET_NEXT(space_list, space);
}
@@ -579,8 +1413,9 @@ fil_read_flushed_lsn_and_arch_log_no(
byte* buf;
byte* buf2;
dulint flushed_lsn;
- ulint arch_log_no;
-
+ ulint arch_log_no = 0; /* since InnoDB does not archive
+ its own logs under MySQL, this
+ parameter is not relevant */
buf2 = ut_malloc(2 * UNIV_PAGE_SIZE);
/* Align the memory for a possible read from a raw device */
buf = ut_align(buf2, UNIV_PAGE_SIZE);
@@ -588,7 +1423,6 @@ fil_read_flushed_lsn_and_arch_log_no(
os_file_read(data_file, buf, 0, 0, UNIV_PAGE_SIZE);
flushed_lsn = mach_read_from_8(buf + FIL_PAGE_FILE_FLUSH_LSN);
- arch_log_no = mach_read_from_4(buf + FIL_PAGE_ARCH_LOG_NO);
ut_free(buf2);
@@ -615,113 +1449,1442 @@ fil_read_flushed_lsn_and_arch_log_no(
}
}
+/*================ SINGLE-TABLE TABLESPACES ==========================*/
+
/***********************************************************************
-Creates a space object and puts it to the file system. */
+Increments the count of pending insert buffer page merges, if space is not
+being deleted. */
+
+ibool
+fil_inc_pending_ibuf_merges(
+/*========================*/
+			/* out: TRUE if being deleted, and ibuf merges should
+			be skipped */
+	ulint	id)	/* in: space id */
+{
+	fil_system_t*	system	= fil_system;
+	fil_space_t*	space;
+	ibool		skip_merge;
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	if (space == NULL) {
+		/* The space is not in the cache: report it and tell the
+		caller to skip the merge */
+		fprintf(stderr,
+"InnoDB: Error: trying to do ibuf merge to a dropped tablespace %lu\n",
+			(ulong) id);
+
+		skip_merge = TRUE;
+	} else if (space->stop_ibuf_merges) {
+		/* New merges to this space have been stopped */
+		skip_merge = TRUE;
+	} else {
+		/* Count the merge as pending; the count is decremented by
+		fil_decr_pending_ibuf_merges() */
+		space->n_pending_ibuf_merges++;
+
+		skip_merge = FALSE;
+	}
+
+	mutex_exit(&(system->mutex));
+
+	return(skip_merge);
+}
+
+/***********************************************************************
+Decrements the count of pending insert buffer page merges. */
void
-fil_space_create(
+fil_decr_pending_ibuf_merges(
+/*========================*/
+	ulint	id)	/* in: space id */
+{
+	fil_system_t*	system	= fil_system;
+	fil_space_t*	space;
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	if (space != NULL) {
+		space->n_pending_ibuf_merges--;
+	} else {
+		/* The space is not in the cache any more: there is no
+		counter to decrement, just report the situation */
+		fprintf(stderr,
+"InnoDB: Error: decrementing ibuf merge of a dropped tablespace %lu\n",
+			(ulong) id);
+	}
+
+	mutex_exit(&(system->mutex));
+}
+
+/************************************************************
+Creates the database directory for a table if it does not exist yet. The
+directory path is fil_path_to_mysql_datadir + '/' + the database name part
+of the table name. The combined path must fit in OS_FILE_MAX_PATH bytes;
+this is asserted. */
+static
+void
+fil_create_directory_for_tablename(
+/*===============================*/
+	char*	name)	/* in: name in the standard 'databasename/tablename'
+			format */
+{
+	char*	ptr;
+	char	path[OS_FILE_MAX_PATH];
+
+	/* The path is built in a fixed-size buffer in the stack: make sure
+	that datadir + '/' + name + the terminating NUL fits there before we
+	format it, so that an overlong name cannot overrun the buffer */
+
+	ut_a(ut_strlen(fil_path_to_mysql_datadir) + ut_strlen(name) + 2
+							<= sizeof(path));
+
+	sprintf(path, "%s/%s", fil_path_to_mysql_datadir, name);
+
+	/* Cut the path at its last '/' to strip the table name. The format
+	string above guarantees that there is at least one '/' in the path;
+	the bound is checked before stepping backwards so that we never form
+	a pointer before the start of the buffer. */
+
+	ptr = path + ut_strlen(path);
+
+	while (*ptr != '/') {
+		ut_a(ptr > path);
+
+		ptr--;
+	}
+
+	*ptr = '\0';
+
+	srv_normalize_path_for_win(path);
+
+	/* A failure to create the directory is treated as fatal */
+	ut_a(os_file_create_directory(path, FALSE));
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************
+Writes a log record about an .ibd file create/rename/delete. The record
+consists of the initial log record for a file operation, followed by a 2-byte
+length and the null-terminated name; for MLOG_FILE_RENAME, a 2-byte length and
+the null-terminated new name follow. Each length includes the terminating
+null character. */
+static
+void
+fil_op_write_log(
/*=============*/
- char* name, /* in: space name */
- ulint id, /* in: space id */
- ulint purpose)/* in: FIL_TABLESPACE, or FIL_LOG if log */
+ ulint type, /* in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or
+ MLOG_FILE_RENAME */
+ ulint space_id, /* in: space id */
+ char* name, /* in: table name in the familiar
+ 'databasename/tablename' format, or the file
+ path in the case of MLOG_FILE_DELETE */
+ char* new_name, /* in: if type is MLOG_FILE_RENAME, the new
+ table name in the 'databasename/tablename'
+ format; not read for other types */
+ mtr_t* mtr) /* in: mini-transaction handle */
{
- fil_space_t* space;
- char* name2;
- fil_system_t* system = fil_system;
+ byte* log_ptr;
+
+ /* NOTE(review): the pointer returned by mlog_open() is used without a
+ NULL check; confirm that mlog_open() cannot return NULL here (e.g., if
+ logging is switched off for the mini-transaction) */
+ log_ptr = mlog_open(mtr, 30);
- ut_a(system);
- ut_a(name);
+ log_ptr = mlog_write_initial_log_record_for_file_op(type, space_id, 0,
+ log_ptr, mtr);
+ /* Let us store the strings as null-terminated for easier readability
+ and handling */
+
+ /* NOTE(review): the length is stored in 2 bytes; presumably names are
+ always much shorter than 64 kB -- confirm against the callers */
+ mach_write_to_2(log_ptr, ut_strlen(name) + 1);
+ log_ptr += 2;
+
+ mlog_close(mtr, log_ptr);
+
+ /* The name itself is appended after the opened log buffer has been
+ closed */
+ mlog_catenate_string(mtr, (byte*) name, ut_strlen(name) + 1);
-#ifndef UNIV_BASIC_LOG_DEBUG
- /* Spaces with an odd id number are reserved to replicate spaces
- used in log debugging */
+ if (type == MLOG_FILE_RENAME) {
+ /* A rename record carries also the new name, stored in the
+ same way as the old name above */
+ log_ptr = mlog_open(mtr, 30);
+ mach_write_to_2(log_ptr, ut_strlen(new_name) + 1);
+ log_ptr += 2;
- ut_a((purpose == FIL_LOG) || (id % 2 == 0));
+ mlog_close(mtr, log_ptr);
+
+ mlog_catenate_string(mtr, (byte*) new_name,
+ ut_strlen(new_name) + 1);
+ }
+}
#endif
- mutex_enter(&(system->mutex));
- space = mem_alloc(sizeof(fil_space_t));
+/***********************************************************************
+Parses the body of a log record written about an .ibd file operation. That is,
+the log record part after the standard (type, space id, page no) header of the
+log record.
+
+If desired, also replays the delete or rename operation if the .ibd file
+exists and the space id in it matches. Replays the create operation if a file
+at that path does not exist yet. If the database directory for the file to be
+created does not exist, then we create the directory, too.
+
+Note that ibbackup --apply-log sets fil_path_to_mysql_datadir to point to the
+datadir that we should use in replaying the file operations. */
+
+byte*
+fil_op_log_parse_or_replay(
+/*=======================*/
+ /* out: end of log record, or NULL if the
+ record was not completely contained between
+ ptr and end_ptr */
+ byte* ptr, /* in: buffer containing the log record body,
+ or an initial segment of it, if the record does
+ not fit completely between ptr and end_ptr */
+ byte* end_ptr, /* in: buffer end */
+ ulint type, /* in: the type of this log record */
+ ibool do_replay, /* in: TRUE if we want to replay the
+ operation, and not just parse the log record */
+ ulint space_id) /* in: if do_replay is TRUE, the space id of
+ the tablespace in question; otherwise
+ ignored */
+{
+ ulint name_len;
+ ulint new_name_len;
+ char* name;
+ char* new_name = NULL;
- name2 = mem_alloc(ut_strlen(name) + 1);
+ /* The body starts with the 2-byte length of name */
+ if (end_ptr < ptr + 2) {
- ut_strcpy(name2, name);
+ return(NULL);
+ }
- space->name = name2;
- space->id = id;
- space->purpose = purpose;
- space->size = 0;
+ name_len = mach_read_from_2(ptr);
- space->n_reserved_extents = 0;
+ ptr += 2;
+
+ if (end_ptr < ptr + name_len) {
+
+ return(NULL);
+ }
+
+ /* name points straight into the log buffer, no copy is made.
+ NOTE(review): the terminating null byte is the one written by
+ fil_op_write_log(); it is not verified here */
+ name = (char*) ptr;
+
+ ptr += name_len;
+
+ if (type == MLOG_FILE_RENAME) {
+ /* A rename record carries a second (length, string) pair */
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ new_name_len = mach_read_from_2(ptr);
+
+ ptr += 2;
+
+ if (end_ptr < ptr + new_name_len) {
+
+ return(NULL);
+ }
+
+ new_name = (char*) ptr;
+
+ ptr += new_name_len;
+ }
+
+ /* We managed to parse a full log record body */
+/*
+ printf("Parsed log rec of type %lu space %lu\n"
+ "name %s\n", type, space_id, name);
+
+ if (type == MLOG_FILE_RENAME) {
+ printf("new name %s\n", new_name);
+ }
+*/
+ if (do_replay == FALSE) {
+
+ return(ptr);
+ }
+
+ /* Let us try to perform the file operation, if sensible. Note that
+ ibbackup has at this stage already read in all space id info to the
+ fil0fil.c data structures.
- UT_LIST_INIT(space->chain);
- space->magic_n = FIL_SPACE_MAGIC_N;
+ NOTE that our algorithm is not guaranteed to work correctly if there
+ were renames of tables during the backup. See ibbackup code for more
+ on the problem. */
- space->ibuf_data = NULL;
+ if (type == MLOG_FILE_DELETE) {
+ /* A failed delete of a cached tablespace is a fatal assertion */
+ if (fil_tablespace_exists_in_mem(space_id)) {
+ ut_a(fil_delete_tablespace(space_id));
+ }
+ } else if (type == MLOG_FILE_RENAME) {
+ /* We do the rename based on space id, not old file name;
+ this should guarantee that after the log replay each .ibd file
+ has the correct name for the latest log sequence number; the
+ proof is left as an exercise :) */
+
+ if (fil_tablespace_exists_in_mem(space_id)) {
+ /* Create the database directory for the new name, if
+ it does not exist yet */
+ fil_create_directory_for_tablename(new_name);
- rw_lock_create(&(space->latch));
- rw_lock_set_level(&(space->latch), SYNC_FSP);
+ /* Rename the table if there is not yet a tablespace
+ with the same name */
+
+ if (fil_get_space_id_for_table(new_name)
+ == ULINT_UNDEFINED) {
+ /* We do not care of the old name, that is
+ why we pass NULL as the first argument */
+ ut_a(fil_rename_tablespace(NULL, space_id,
+ new_name));
+ }
+ }
+ } else {
+ ut_a(type == MLOG_FILE_CREATE);
+
+ /* Create only if neither the space id nor the table name is
+ already known to the tablespace memory cache */
+ if (fil_tablespace_exists_in_mem(space_id)) {
+ /* Do nothing */
+ } else if (fil_get_space_id_for_table(name) !=
+ ULINT_UNDEFINED) {
+ /* Do nothing */
+ } else {
+ /* Create the database directory for name, if it does
+ not exist yet */
+ fil_create_directory_for_tablename(name);
+
+ ut_a(space_id != 0);
+
+ /* space_id is != 0, so it is used as an input parameter
+ and the created file gets exactly this id */
+ ut_a(DB_SUCCESS ==
+ fil_create_new_single_table_tablespace(
+ &space_id, name,
+ FIL_IBD_FILE_INITIAL_SIZE));
+ }
+ }
+
+ return(ptr);
+}
+
+/***********************************************************************
+Deletes a single-table tablespace. The tablespace must be cached in the
+memory cache. The function first sets stop_ibuf_merges and waits until no
+ibuf merges are pending, then sets is_being_deleted and waits until no flushes
+or i/o's are pending. After that it frees the space object, deletes the file
+and, unless UNIV_HOTBACKUP is defined, writes an MLOG_FILE_DELETE log
+record. */
+
+ibool
+fil_delete_tablespace(
+/*==================*/
+ /* out: TRUE if success */
+ ulint id) /* in: space id */
+{
+ fil_system_t* system = fil_system;
+ ibool success;
+ fil_space_t* space;
+ fil_node_t* node;
+ ulint count = 0;
+ char path[OS_FILE_MAX_PATH];
+
+ ut_a(id != 0);
+stop_ibuf_merges:
+ /* Phase 1: forbid new ibuf merges and poll, sleeping between rounds,
+ until the pending ones have finished */
+ mutex_enter(&(system->mutex));
+
+ HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+ if (space != NULL) {
+ space->stop_ibuf_merges = TRUE;
+
+ if (space->n_pending_ibuf_merges == 0) {
+ mutex_exit(&(system->mutex));
+
+ count = 0;
+
+ goto try_again;
+ } else {
+ /* Start warning only after many polling rounds */
+ if (count > 5000) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Warning: trying to delete tablespace %s,\n"
+"InnoDB: but there are %lu pending ibuf merges on it.\n"
+"InnoDB: Loop %lu.\n", space->name, (ulong) space->n_pending_ibuf_merges,
+ (ulong) count);
+ }
+
+ mutex_exit(&(system->mutex));
+
+ os_thread_sleep(20000);
+ count++;
+
+ goto stop_ibuf_merges;
+ }
+ }
+
+ /* The space was not found; the lookup below reports the error */
+ mutex_exit(&(system->mutex));
+ count = 0;
+
+try_again:
+ /* Phase 2: mark the space as being deleted and poll until there are
+ no pending flushes or i/o's on its single file node */
+ mutex_enter(&(system->mutex));
+
+ HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+ if (space == NULL) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Error: cannot delete tablespace %lu\n"
+"InnoDB: because it is not found in the tablespace memory cache.\n",
+ (ulong) id);
+
+ mutex_exit(&(system->mutex));
- HASH_INSERT(fil_space_t, hash, system->spaces, id, space);
+ return(FALSE);
+ }
+
+ ut_a(space);
+ ut_a(strlen(space->name) < OS_FILE_MAX_PATH);
+ ut_a(space->n_pending_ibuf_merges == 0);
+
+ /* Save the file path: it is needed after the space object has been
+ freed below */
+ strcpy(path, space->name);
+
+ space->is_being_deleted = TRUE;
+
+ ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ if (space->n_pending_flushes > 0 || node->n_pending > 0) {
+ if (count > 1000) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Warning: trying to delete tablespace %s,\n"
+"InnoDB: but there are %lu flushes and %lu pending i/o's on it\n"
+"InnoDB: Loop %lu.\n", space->name, (ulong) space->n_pending_flushes,
+ (ulong) node->n_pending,
+ (ulong) count);
+ }
+ mutex_exit(&(system->mutex));
+ os_thread_sleep(20000);
+
+ count++;
+
+ goto try_again;
+ }
- UT_LIST_ADD_LAST(space_list, system->space_list, space);
-
mutex_exit(&(system->mutex));
+#ifndef UNIV_HOTBACKUP
+ /* Invalidate in the buffer pool all pages belonging to the
+ tablespace. Since we have set space->is_being_deleted = TRUE, readahead
+ or ibuf merge can no longer read more pages of this tablespace to the
+ buffer pool. Thus we can clean the tablespace out of the buffer pool
+ completely and permanently. The flag is_being_deleted also prevents
+ fil_flush() from being applied to this tablespace. */
+
+ buf_LRU_invalidate_tablespace(id);
+#endif
+ /* printf("Deleting tablespace %s id %lu\n", space->name, id); */
+
+ success = fil_space_free(id);
+
+ if (success) {
+ /* The file itself is removed only after the memory object is
+ gone; path is the copy taken above */
+ success = os_file_delete(path);
+
+ if (success) {
+ /* Write a log record about the deletion of the .ibd
+ file, so that ibbackup can replay it in the
+ --apply-log phase. We use a dummy mtr and the familiar
+ log write mechanism. */
+#ifndef UNIV_HOTBACKUP
+ {
+ mtr_t mtr;
+
+ /* When replaying the operation in ibbackup, do not try
+ to write any log record */
+ mtr_start(&mtr);
+
+ fil_op_write_log(MLOG_FILE_DELETE, id, path,
+ NULL, &mtr);
+ mtr_commit(&mtr);
+ }
+#endif
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
}
/***********************************************************************
-Frees a space object from a file system. Closes the files in the chain
-but does not delete them. */
+Discards a single-table tablespace. The tablespace must be cached in the
+memory cache. Discarding is like deleting a tablespace, but
+1) we do not drop the table from the data dictionary;
+2) we remove all insert buffer entries for the tablespace immediately; in DROP
+TABLE they are only removed gradually in the background;
+3) when the user does IMPORT TABLESPACE, the tablespace will have the same id
+as it originally had. */
-void
-fil_space_free(
-/*===========*/
+ibool
+fil_discard_tablespace(
+/*===================*/
+			/* out: always TRUE; a failed delete is only
+			reported with a warning */
ulint id) /* in: space id */
{
+	/* Drop the .ibd file and its entry in the tablespace memory cache;
+	a failure does not stop us from cleaning the insert buffer */
+
+	if (!fil_delete_tablespace(id)) {
+		fprintf(stderr,
+"InnoDB: Warning: cannot delete tablespace %lu in DISCARD TABLESPACE.\n"
+"InnoDB: But let us remove the insert buffer entries for this tablespace.\n",
+			(ulong) id);
+	}
+
+	/* The insert buffer entries of the space are removed in every case */
+
+	ibuf_delete_for_discarded_space(id);
+
+	return(TRUE);
+}
+
+/***********************************************************************
+Renames the memory cache structures of a single-table tablespace: the space
+is moved from its old name to the new one in the name hash, and both the
+space and its file node get a fresh copy of the new name. Nothing is changed
+if the old name does not lead to this space or the new name is taken. */
+static
+ibool
+fil_rename_tablespace_in_mem(
+/*=========================*/
+				/* out: TRUE if success */
+	fil_space_t*	space,	/* in: tablespace memory object */
+	fil_node_t*	node,	/* in: file node of that tablespace */
+	char*		path)	/* in: new name */
+{
+	fil_system_t*	system		= fil_system;
+	char*		old_name	= space->name;
+	fil_space_t*	by_old_name;
+	fil_space_t*	by_new_name;
+	ulint		path_size;
+
+	/* The current name must resolve to exactly this space object */
+
+	HASH_SEARCH(name_hash, system->name_hash, ut_fold_string(old_name),
+			by_old_name, 0 == strcmp(old_name, by_old_name->name));
+	if (by_old_name != space) {
+		fprintf(stderr,
+"InnoDB: Error: cannot find %s in tablespace memory cache\n", old_name);
+
+		return(FALSE);
+	}
+
+	/* The new name must not be in use yet */
+
+	HASH_SEARCH(name_hash, system->name_hash, ut_fold_string(path),
+			by_new_name, 0 == strcmp(path, by_new_name->name));
+	if (by_new_name != NULL) {
+		fprintf(stderr,
+"InnoDB: Error: %s is already in tablespace memory cache\n", path);
+
+		return(FALSE);
+	}
+
+	/* Unhash under the old name while that string is still alive */
+
+	HASH_DELETE(fil_space_t, name_hash, system->name_hash,
+					ut_fold_string(old_name), space);
+	mem_free(space->name);
+	mem_free(node->name);
+
+	/* The space and the node each own a private copy of the name */
+
+	path_size = strlen(path) + 1;
+
+	space->name = mem_alloc(path_size);
+	node->name = mem_alloc(path_size);
+
+	strcpy(space->name, path);
+	strcpy(node->name, path);
+
+	HASH_INSERT(fil_space_t, name_hash, system->name_hash,
+					ut_fold_string(path), space);
+	return(TRUE);
+}
+
+/***********************************************************************
+Renames a single-table tablespace. The tablespace must be cached in the
+tablespace memory cache. The function sets stop_ios on the space, waits until
+the single file node has no pending i/o's or flushes and is flushed, closes
+the file, renames the memory cache structures and then the file itself. If
+the file rename fails, the memory cache rename is reverted. Unless
+UNIV_HOTBACKUP is defined, a successful rename writes an MLOG_FILE_RENAME log
+record. */
+
+ibool
+fil_rename_tablespace(
+/*==================*/
+ /* out: TRUE if success */
+ char* old_name, /* in: old table name in the standard
+ databasename/tablename format of InnoDB, or
+ NULL if we do the rename based on the space
+ id only */
+ ulint id, /* in: space id */
+ char* new_name) /* in: new table name in the standard
+ databasename/tablename format of InnoDB */
+{
+ fil_system_t* system = fil_system;
+ ibool success;
fil_space_t* space;
- fil_node_t* fil_node;
- fil_system_t* system = fil_system;
+ fil_node_t* node;
+ ulint count = 0;
+ char* path = NULL;
+ ibool old_name_was_specified = TRUE;
+ char old_path[OS_FILE_MAX_PATH];
+
+ ut_a(id != 0);
+ if (old_name == NULL) {
+ /* Placeholder used in the messages below. NOTE(review): on
+ success this same placeholder is also what gets passed as the
+ old name to fil_op_write_log() at the end of the function */
+ old_name = (char*)"(name not specified)";
+ old_name_was_specified = FALSE;
+ }
+retry:
+ /* Every pass through the wait loop increments count; a warning is
+ printed on each pass after 1000 and we give up after 25000 */
+ count++;
+
+ if (count > 1000) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Warning: problems renaming %s to %s, %lu iterations\n",
+ old_name, new_name,
+ (ulong) count);
+ }
+
mutex_enter(&(system->mutex));
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
- HASH_DELETE(fil_space_t, hash, system->spaces, id, space);
+ if (space == NULL) {
+ fprintf(stderr,
+"InnoDB: Error: cannot find space id %lu from the tablespace memory cache\n"
+"InnoDB: though the table %s in a rename operation should have that id\n",
+ (ulong) id, old_name);
+ mutex_exit(&(system->mutex));
- UT_LIST_REMOVE(space_list, system->space_list, space);
+ return(FALSE);
+ }
- ut_a(space->magic_n == FIL_SPACE_MAGIC_N);
+ if (count > 25000) {
+ /* Give up: clear the flag we set on an earlier pass */
+ space->stop_ios = FALSE;
+ mutex_exit(&(system->mutex));
- fil_node = UT_LIST_GET_FIRST(space->chain);
+ return(FALSE);
+ }
- ut_d(UT_LIST_VALIDATE(chain, fil_node_t, space->chain));
+ /* We temporarily close the .ibd file because we do not trust that
+ operating systems can rename an open file. For the closing we have to
+ wait until there are no pending i/o's or flushes on the file. */
- while (fil_node != NULL) {
- fil_node_free(fil_node, system, space);
+ space->stop_ios = TRUE;
- fil_node = UT_LIST_GET_FIRST(space->chain);
- }
+ ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ if (node->n_pending > 0 || node->n_pending_flushes > 0) {
+ /* There are pending i/o's or flushes, sleep for a while and
+ retry */
+
+ mutex_exit(&(system->mutex));
+
+ os_thread_sleep(20000);
+
+ goto retry;
+
+ } else if (node->modification_counter > node->flush_counter) {
+ /* Flush the space */
+
+ mutex_exit(&(system->mutex));
+
+ os_thread_sleep(20000);
+
+ fil_flush(id);
+
+ goto retry;
+
+ } else if (node->open) {
+ /* Close the file */
+
+ fil_node_close_file(node, system);
+ }
+
+ /* From here to func_exit the mutex is held all the time */
+
+ /* Check that the old name in the space is right */
+
+ if (old_name_was_specified) {
+ ut_a(strlen(old_name) + strlen(fil_path_to_mysql_datadir)
+ < OS_FILE_MAX_PATH - 10);
+ sprintf(old_path, "%s/%s.ibd", fil_path_to_mysql_datadir,
+ old_name);
+ srv_normalize_path_for_win(old_path);
+
+ ut_a(strcmp(space->name, old_path) == 0);
+ ut_a(strcmp(node->name, old_path) == 0);
+ } else {
+ /* NOTE(review): the length of space->name is not checked
+ against OS_FILE_MAX_PATH here, unlike in
+ fil_delete_tablespace() */
+ sprintf(old_path, "%s", space->name);
+ }
+
+ /* Rename the tablespace and the node in the memory cache */
- ut_d(UT_LIST_VALIDATE(chain, fil_node_t, space->chain));
- ut_ad(0 == UT_LIST_GET_LEN(space->chain));
+ ut_a(strlen(new_name) + strlen(fil_path_to_mysql_datadir)
+ < OS_FILE_MAX_PATH - 10);
+ path = mem_alloc(OS_FILE_MAX_PATH);
+
+ sprintf(path, "%s/%s.ibd", fil_path_to_mysql_datadir, new_name);
+
+ srv_normalize_path_for_win(path);
+
+ success = fil_rename_tablespace_in_mem(space, node, path);
+
+ if (!success) {
+
+ goto func_exit;
+ }
+
+ success = os_file_rename(old_path, path);
+
+ if (!success) {
+ /* We have to revert the changes we made to the tablespace
+ memory cache */
+
+ ut_a(fil_rename_tablespace_in_mem(space, node, old_path));
+ }
+
+func_exit:
+ if (path) {
+ mem_free(path);
+ }
+ space->stop_ios = FALSE;
mutex_exit(&(system->mutex));
- mem_free(space->name);
- mem_free(space);
+#ifndef UNIV_HOTBACKUP
+ if (success) {
+ /* Log the rename with a dummy mtr, like the delete and create
+ operations do */
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ fil_op_write_log(MLOG_FILE_RENAME, id, old_name, new_name,
+ &mtr);
+ mtr_commit(&mtr);
+ }
+#endif
+ return(success);
}
/***********************************************************************
-Returns the size of the space in pages. */
+Creates a new single-table tablespace to a database directory of MySQL.
+Database directories are under the 'datadir' of MySQL. The datadir is the
+directory of a running mysqld program. We can refer to it by simply the
+path '.'. The file is created, extended to the requested size, stamped with
+the space id on its first page and flushed, and only then added to the
+tablespace memory cache. On every error path after the file was created the
+file is deleted again. */
ulint
-fil_space_get_size(
-/*===============*/
- /* out: space size */
- ulint id) /* in: space id */
+fil_create_new_single_table_tablespace(
+/*===================================*/
+ /* out: DB_SUCCESS or error code */
+ ulint* space_id, /* in/out: space id; if this is != 0, then
+ this is an input parameter, otherwise
+ output */
+ char* tablename, /* in: the table name in the usual
+ databasename/tablename format of InnoDB */
+ ulint size) /* in: the initial size of the tablespace file
+ in pages, must be >= FIL_IBD_FILE_INITIAL_SIZE
+ */
+{
+ os_file_t file;
+ ibool ret;
+ ulint err;
+ byte* page;
+ ibool success;
+ char path[OS_FILE_MAX_PATH];
+
+ ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE);
+
+ /* Make sure the composed path fits in the stack buffer */
+ ut_a(strlen(tablename) + strlen(fil_path_to_mysql_datadir)
+ < OS_FILE_MAX_PATH - 10);
+ sprintf(path, "%s/%s.ibd", fil_path_to_mysql_datadir, tablename);
+
+ srv_normalize_path_for_win(path);
+
+ file = os_file_create(path, OS_FILE_CREATE, OS_FILE_NORMAL,
+ OS_DATA_FILE, &ret);
+ if (ret == FALSE) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Error creating file %s.\n", path);
+
+ /* The following call will print an error message */
+
+ err = os_file_get_last_error(TRUE);
+
+ if (err == OS_FILE_ALREADY_EXISTS) {
+ fprintf(stderr,
+"InnoDB: The file already exists though the corresponding table did not\n"
+"InnoDB: exist in the InnoDB data dictionary. Have you moved InnoDB\n"
+"InnoDB: .ibd files around without using the SQL commands\n"
+"InnoDB: DISCARD TABLESPACE and IMPORT TABLESPACE, or did\n"
+"InnoDB: mysqld crash in the middle of CREATE TABLE? You can\n"
+"InnoDB: resolve the problem by removing the file %s\n"
+"InnoDB: under the 'datadir' of MySQL.\n", path);
+
+ return(DB_TABLESPACE_ALREADY_EXISTS);
+ }
+
+ if (err == OS_FILE_DISK_FULL) {
+
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+
+ return(DB_ERROR);
+ }
+
+ /* Buffer for the first page of the file; freed on every path below */
+ page = ut_malloc(UNIV_PAGE_SIZE);
+
+ ret = os_file_set_size(path, file, size * UNIV_PAGE_SIZE, 0);
+
+ if (!ret) {
+ ut_free(page);
+ os_file_close(file);
+ os_file_delete(path);
+
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+
+ /* A zero id means that the caller wants us to assign a new one */
+ if (*space_id == 0) {
+ *space_id = fil_assign_new_space_id();
+ }
+
+ /* printf("Creating tablespace %s id %lu\n", path, *space_id); */
+
+ if (*space_id == ULINT_UNDEFINED) {
+ ut_free(page);
+ os_file_close(file);
+ os_file_delete(path);
+
+ return(DB_ERROR);
+ }
+
+ /* We have to write the space id to the file immediately and flush the
+ file to disk. This is because in crash recovery we must be aware what
+ tablespaces exist and what are their space id's, so that we can apply
+ the log records to the right file. It may take quite a while until
+ buffer pool flush algorithms write anything to the file and flush it to
+ disk. If we would not write here anything, the file would be filled
+ with zeros from the call of os_file_set_size(), until a buffer pool
+ flush would write to it. */
+
+ memset(page, '\0', UNIV_PAGE_SIZE);
+
+ fsp_header_write_space_id(page, *space_id);
+
+ buf_flush_init_for_writing(page, ut_dulint_zero, *space_id, 0);
+
+ ret = os_file_write(path, file, page, 0, 0, UNIV_PAGE_SIZE);
+
+ ut_free(page);
+
+ if (!ret) {
+ fprintf(stderr,
+"InnoDB: Error: could not write the first page to tablespace %s\n", path);
+
+ os_file_close(file);
+ os_file_delete(path);
+
+ return(DB_ERROR);
+ }
+
+ ret = os_file_flush(file);
+
+ if (!ret) {
+ fprintf(stderr,
+"InnoDB: Error: file flush of tablespace %s failed\n", path);
+
+ os_file_close(file);
+ os_file_delete(path);
+
+ return(DB_ERROR);
+ }
+
+ os_file_close(file);
+
+ /* NOTE(review): this condition was already tested above, before the
+ page was written, and *space_id is not assigned in between */
+ if (*space_id == ULINT_UNDEFINED) {
+ os_file_delete(path);
+
+ return(DB_ERROR);
+ }
+
+ /* Register the space under the file path as its name */
+ success = fil_space_create(path, *space_id, FIL_TABLESPACE);
+
+ if (!success) {
+ os_file_delete(path);
+
+ return(DB_ERROR);
+ }
+
+ fil_node_create(path, size, *space_id, FALSE);
+
+#ifndef UNIV_HOTBACKUP
+ {
+ /* Log the creation with a dummy mtr; the record carries the
+ table name, not the file path */
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ fil_op_write_log(MLOG_FILE_CREATE, *space_id, tablename, NULL, &mtr);
+
+ mtr_commit(&mtr);
+ }
+#endif
+ return(DB_SUCCESS);
+}
+
+/************************************************************************
+It is possible, though very improbable, that the lsn's in the tablespace to be
+imported have risen above the current system lsn, if a lengthy purge, ibuf
+merge, or rollback was performed on a backup taken with ibbackup. If that is
+the case, reset page lsn's in the file. We assume that mysqld was shut down
+after it performed these cleanup operations on the .ibd file, so that it at
+the shutdown stamped the latest lsn to the FIL_PAGE_FILE_FLUSH_LSN in the
+first page of the .ibd file, and we can determine whether we need to reset the
+lsn's just by looking at that flush lsn. */
+
+ibool
+fil_reset_too_high_lsns(
+/*====================*/
+				/* out: TRUE if success; FALSE if the file
+				could not be opened, read, written or
+				flushed */
+	char*	name,		/* in: table name in the databasename/tablename
+				format */
+	dulint	current_lsn)	/* in: reset lsn's if the lsn stamped to
+				FIL_PAGE_FILE_FLUSH_LSN in the first page is
+				too high */
+{
+	os_file_t	file;
+	char*		filepath;
+	byte*		page;
+	dulint		flush_lsn;
+	ulint		space_id;
+	ib_longlong	file_size;
+	ib_longlong	offset;
+	ulint		page_no;
+	ibool		success;
+
+	/* The path is composed of the datadir, the table name and the .ibd
+	suffix: both variable parts have to be counted so that the sprintf
+	below cannot overrun the buffer */
+
+	ut_a(strlen(name) + strlen(fil_path_to_mysql_datadir)
+						< OS_FILE_MAX_PATH - 10);
+
+	filepath = ut_malloc(OS_FILE_MAX_PATH);
+
+	sprintf(filepath, "%s/%s.ibd", fil_path_to_mysql_datadir, name);
+
+	srv_normalize_path_for_win(filepath);
+
+	file = os_file_create_simple_no_error_handling(filepath, OS_FILE_OPEN,
+						OS_FILE_READ_WRITE, &success);
+	if (!success) {
+		ut_free(filepath);
+
+		return(FALSE);
+	}
+
+	/* Read the first page of the tablespace */
+
+	page = ut_malloc(UNIV_PAGE_SIZE);
+
+	success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE);
+	if (!success) {
+
+		goto func_exit;
+	}
+
+	/* We have to read the file flush lsn from the header of the file */
+
+	flush_lsn = mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN);
+
+	if (ut_dulint_cmp(current_lsn, flush_lsn) >= 0) {
+		/* Ok */
+		success = TRUE;
+
+		goto func_exit;
+	}
+
+	space_id = fsp_header_get_space_id(page);
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+" InnoDB: Flush lsn in the tablespace file %lu to be imported\n"
+"InnoDB: is %lu %lu, which exceeds current system lsn %lu %lu.\n"
+"InnoDB: We reset the lsn's in the file %s.\n",
+			(ulong) space_id,
+			(ulong) ut_dulint_get_high(flush_lsn),
+			(ulong) ut_dulint_get_low(flush_lsn),
+			(ulong) ut_dulint_get_high(current_lsn),
+			(ulong) ut_dulint_get_low(current_lsn), filepath);
+
+	/* Loop through all the pages in the tablespace and reset the lsn and
+	the page checksum if necessary */
+
+	file_size = os_file_get_size_as_iblonglong(file);
+
+	for (offset = 0; offset < file_size; offset += UNIV_PAGE_SIZE) {
+		/* The 64-bit offset is passed as a low and a high 32-bit
+		half */
+		success = os_file_read(file, page,
+					(ulint)(offset & 0xFFFFFFFFUL),
+					(ulint)(offset >> 32), UNIV_PAGE_SIZE);
+		if (!success) {
+
+			goto func_exit;
+		}
+		if (ut_dulint_cmp(mach_read_from_8(page + FIL_PAGE_LSN),
+						current_lsn) > 0) {
+			/* We have to reset the lsn */
+			space_id = mach_read_from_4(page
+					+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+			page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
+
+			buf_flush_init_for_writing(page, current_lsn, space_id,
+								page_no);
+			success = os_file_write(filepath, file, page,
+					(ulint)(offset & 0xFFFFFFFFUL),
+					(ulint)(offset >> 32), UNIV_PAGE_SIZE);
+			if (!success) {
+
+				goto func_exit;
+			}
+		}
+	}
+
+	success = os_file_flush(file);
+	if (!success) {
+
+		goto func_exit;
+	}
+
+	/* We now update the flush_lsn stamp at the start of the file */
+	success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE);
+	if (!success) {
+
+		goto func_exit;
+	}
+
+	mach_write_to_8(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn);
+
+	success = os_file_write(filepath, file, page, 0, 0, UNIV_PAGE_SIZE);
+	if (!success) {
+
+		goto func_exit;
+	}
+	success = os_file_flush(file);
+func_exit:
+	/* Common exit: the file is open and both buffers are allocated on
+	every path that gets here */
+	os_file_close(file);
+	ut_free(page);
+	ut_free(filepath);
+
+	return(success);
+}
+
+/************************************************************************
+Tries to open a single-table tablespace and checks the space id is right in
+it. If does not succeed, prints an error message to the .err log. This
+function is used to open the tablespace when we load a table definition
+to the dictionary cache. NOTE that we assume this operation is used under the
+protection of the dictionary mutex, so that two users cannot race here. This
+operation does not leave the file associated with the tablespace open, but
+closes it after we have looked at the space id in it. */
+
+ibool
+fil_open_single_table_tablespace(
+/*=============================*/
+			/* out: TRUE if success; FALSE if the file could not
+			be opened or read, if the space id in it differs
+			from id, or if the space could not be added to the
+			tablespace memory cache */
+	ulint	id,	/* in: space id */
+	char*	name)	/* in: table name in the databasename/tablename
+			format */
+{
+	os_file_t	file;
+	char*		filepath;
+	ibool		success;
+	byte*		page;
+	ulint		space_id;
+	ibool		ret		= TRUE;
+
+	/* Both variable parts of the path have to be counted so that the
+	sprintf below cannot overrun the buffer */
+
+	ut_a(strlen(name) + strlen(fil_path_to_mysql_datadir)
+						< OS_FILE_MAX_PATH - 10);
+
+	filepath = ut_malloc(OS_FILE_MAX_PATH);
+
+	sprintf(filepath, "%s/%s.ibd", fil_path_to_mysql_datadir, name);
+
+	srv_normalize_path_for_win(filepath);
+
+	file = os_file_create_simple_no_error_handling(filepath, OS_FILE_OPEN,
+						OS_FILE_READ_ONLY, &success);
+	if (!success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+" InnoDB: Error: trying to open a table, but could not\n"
+"InnoDB: open the tablespace file %s!\n", filepath);
+		fprintf(stderr,
+"InnoDB: have you moved InnoDB .ibd files around without using the\n"
+"InnoDB: commands DISCARD TABLESPACE and IMPORT TABLESPACE?\n"
+"InnoDB: You can look from section 15.1 of http://www.innodb.com/ibman.html\n"
+"InnoDB: how to resolve the issue.\n");
+
+		ut_free(filepath);
+
+		return(FALSE);
+	}
+
+	/* Read the first page of the tablespace */
+
+	page = ut_malloc(UNIV_PAGE_SIZE);
+
+	success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE);
+
+	if (!success) {
+		/* Without the page we cannot know the space id; do not
+		compare id against the contents of an unread buffer */
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+" InnoDB: Error: could not read the first page of the tablespace file %s!\n",
+			filepath);
+
+		ret = FALSE;
+
+		goto func_exit;
+	}
+
+	/* We have to read the tablespace id from the file */
+
+	space_id = fsp_header_get_space_id(page);
+
+	if (space_id != id) {
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+" InnoDB: Error: tablespace id in file %s is %lu, but in the InnoDB\n"
+"InnoDB: data dictionary it is %lu.\n", filepath, (ulong) space_id, (ulong) id);
+		fprintf(stderr,
+"InnoDB: Have you moved InnoDB .ibd files around without using the\n"
+"InnoDB: commands DISCARD TABLESPACE and IMPORT TABLESPACE?\n"
+"InnoDB: You can look from section 15.1 of http://www.innodb.com/ibman.html\n"
+"InnoDB: how to resolve the issue.\n");
+
+		ret = FALSE;
+
+		goto func_exit;
+	}
+
+	success = fil_space_create(filepath, space_id, FIL_TABLESPACE);
+
+	if (!success) {
+		/* The space did not get into the memory cache: the caller
+		must not be told that the tablespace is usable */
+
+		ret = FALSE;
+
+		goto func_exit;
+	}
+
+	/* We do not measure the size of the file, that is why we pass the 0
+	below */
+
+	fil_node_create(filepath, 0, space_id, FALSE);
+func_exit:
+	os_file_close(file);
+	ut_free(page);
+	ut_free(filepath);
+
+	return(ret);
+}
+
+/************************************************************************
+Opens an .ibd file and adds the associated single-table tablespace to the
+InnoDB fil0fil.c data structures. Errors are reported to stderr and the file
+is then left out of the memory cache; the function has no return value. If
+UNIV_HOTBACKUP is defined, a file that is too small, has a non-sensible space
+id, or has a space id which is already in the memory cache, is renamed with
+an _ibbackup_old_vers_<timestamp> suffix instead. */
+static
+void
+fil_load_single_table_tablespace(
+/*=============================*/
+	char*	dbname,		/* in: database name */
+	char*	filename)	/* in: file name (not a path), including the
+				.ibd extension */
{
+	os_file_t	file;
+	char*		filepath;
+	ibool		success;
+	byte*		page;
+	ulint		space_id;
+	ulint		size_low;
+	ulint		size_high;
+	ib_longlong	size;
+#ifdef UNIV_HOTBACKUP
+	fil_space_t*	space;
+#endif
+	filepath = ut_malloc(OS_FILE_MAX_PATH);
+
+	ut_a(strlen(dbname) + strlen(filename)
+	     + strlen(fil_path_to_mysql_datadir) < OS_FILE_MAX_PATH - 100);
+
+	sprintf(filepath, "%s/%s/%s", fil_path_to_mysql_datadir, dbname,
+								filename);
+	srv_normalize_path_for_win(filepath);
+
+	file = os_file_create_simple_no_error_handling(filepath, OS_FILE_OPEN,
+						OS_FILE_READ_ONLY, &success);
+	if (!success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+
+		fprintf(stderr,
+"InnoDB: Error: could not open single-table tablespace file\n"
+"InnoDB: %s!\n", filepath);
+
+		ut_free(filepath);
+
+		return;
+	}
+
+	success = os_file_get_size(file, &size_low, &size_high);
+
+	if (!success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+
+		fprintf(stderr,
+"InnoDB: Error: could not measure the size of single-table tablespace file\n"
+"InnoDB: %s!\n", filepath);
+
+		os_file_close(file);
+		ut_free(filepath);
+
+		return;
+	}
+
+	/* Every .ibd file is created >= 4 pages in size. Smaller files
+	cannot be ok. */
+
+	size = (((ib_longlong)size_high) << 32) + (ib_longlong)size_low;
+#ifndef UNIV_HOTBACKUP
+	if (size < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) {
+		fprintf(stderr,
+"InnoDB: Error: the size of single-table tablespace file %s\n"
+"InnoDB: is only %lu %lu, should be at least %lu!\n", filepath,
+			(ulong) size_high,
+			(ulong) size_low,
+			(ulong) (FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE));
+		os_file_close(file);
+		ut_free(filepath);
+
+		return;
+	}
+#endif
+	/* Read the first page of the tablespace if the size big enough */
+
+	page = ut_malloc(UNIV_PAGE_SIZE);
+
+	if (size >= FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) {
+		success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE);
+
+		if (!success) {
+			/* Do not take a space id from a page which we
+			could not read; leave the file alone */
+
+			fprintf(stderr,
+"InnoDB: Error: could not read the first page of single-table tablespace file\n"
+"InnoDB: %s!\n", filepath);
+
+			goto func_exit;
+		}
+
+		/* We have to read the tablespace id from the file */
+
+		space_id = fsp_header_get_space_id(page);
+	} else {
+		/* Only possible in a UNIV_HOTBACKUP build: otherwise we
+		already returned above for a too small file */
+		space_id = ULINT_UNDEFINED;
+	}
+
+#ifndef UNIV_HOTBACKUP
+	if (space_id == ULINT_UNDEFINED || space_id == 0) {
+		fprintf(stderr,
+"InnoDB: Error: tablespace id %lu in file %s is not sensible\n",
+			(ulong) space_id,
+			filepath);
+		goto func_exit;
+	}
+#else
+	if (space_id == ULINT_UNDEFINED || space_id == 0) {
+		char*	new_path;
+
+		fprintf(stderr,
+"InnoDB: Renaming tablespace %s of id %lu,\n"
+"InnoDB: to %s_ibbackup_old_vers_<timestamp>\n"
+"InnoDB: because its size %lld is too small (< 4 pages 16 kB each),\n"
+"InnoDB: or the space id in the file header is not sensible.\n"
+"InnoDB: This can happen in an ibbackup run, and is not dangerous.\n",
+			filepath, (ulong) space_id, filepath, size);
+		os_file_close(file);
+
+		new_path = ut_malloc(OS_FILE_MAX_PATH);
+
+		sprintf(new_path, "%s_ibbackup_old_vers_", filepath);
+		ut_sprintf_timestamp_without_extra_chars(
+					new_path + ut_strlen(new_path));
+		ut_a(os_file_rename(filepath, new_path));
+
+		ut_free(page);
+		ut_free(filepath);
+		ut_free(new_path);
+
+		return;
+	}
+
+	/* A backup may contain the same space several times, if the space got
+	renamed at a sensitive time. Since it is enough to have one version of
+	the space, we rename the file if a space with the same space id
+	already exists in the tablespace memory cache. We rather rename the
+	file than delete it, because if there is a bug, we do not want to
+	destroy valuable data. */
+
+	mutex_enter(&(fil_system->mutex));
+
+	space = fil_get_space_for_id_low(space_id);
+
+	if (space) {
+		char*	new_path;
+
+		fprintf(stderr,
+"InnoDB: Renaming tablespace %s of id %lu,\n"
+"InnoDB: to %s_ibbackup_old_vers_<timestamp>\n"
+"InnoDB: because space %s with the same id\n"
+"InnoDB: was scanned earlier. This can happen if you have renamed tables\n"
+"InnoDB: during an ibbackup run.\n", filepath, (ulong) space_id, filepath,
+			space->name);
+		os_file_close(file);
+
+		new_path = ut_malloc(OS_FILE_MAX_PATH);
+
+		sprintf(new_path, "%s_ibbackup_old_vers_", filepath);
+		ut_sprintf_timestamp_without_extra_chars(
+					new_path + ut_strlen(new_path));
+		mutex_exit(&(fil_system->mutex));
+
+		ut_a(os_file_rename(filepath, new_path));
+
+		ut_free(page);
+		ut_free(filepath);
+		ut_free(new_path);
+
+		return;
+	}
+	mutex_exit(&(fil_system->mutex));
+#endif
+	success = fil_space_create(filepath, space_id, FIL_TABLESPACE);
+
+	if (!success) {
+
+		goto func_exit;
+	}
+
+	/* We do not measure the size of the file, that is why we pass the 0
+	below */
+
+	fil_node_create(filepath, 0, space_id, FALSE);
+func_exit:
+	os_file_close(file);
+	ut_free(page);
+	ut_free(filepath);
+}
+
+/************************************************************************
+At the server startup, if we need crash recovery, scans the database
+directories under the MySQL datadir, looking for .ibd files. Those files are
+single-table tablespaces. We need to know the space id in each of them so that
+we know into which file we should look to check the contents of a page stored
+in the doublewrite buffer, also to know where to apply log records where the
+space id is != 0. */
+
+ulint
+fil_load_single_table_tablespaces(void)
+/*===================================*/
+			/* out: DB_SUCCESS or error number */
+{
+	int		ret;
+	char*		dbpath;
+	os_file_dir_t	dir;
+	os_file_dir_t	dbdir;
+	os_file_stat_t	dbinfo;
+	os_file_stat_t	fileinfo;
+
+	/* The datadir of MySQL is always the default directory of mysqld */
+
+	dir = os_file_opendir(fil_path_to_mysql_datadir, TRUE);
+
+	if (dir == NULL) {
+
+		return(DB_ERROR);
+	}
+
+	dbpath = ut_malloc(OS_FILE_MAX_PATH);
+
+	/* Scan all directories under the datadir. They are the database
+	directories of MySQL. */
+
+	ret = os_file_readdir_next_file(fil_path_to_mysql_datadir, dir,
+								&dbinfo);
+	while (ret == 0) {
+		/* printf("Looking at %s in datadir\n", dbinfo.name); */
+
+		if (dbinfo.type == OS_FILE_TYPE_FILE
+		    || dbinfo.type == OS_FILE_TYPE_UNKNOWN) {
+
+			goto next_datadir_item;
+		}
+
+		/* We found a symlink or a directory; try opening it to see
+		if a symlink is a directory */
+
+		/* Both variable parts of the path have to be counted so that
+		the sprintf below cannot overrun dbpath */
+
+		ut_a(strlen(dbinfo.name) + strlen(fil_path_to_mysql_datadir)
+						< OS_FILE_MAX_PATH - 10);
+
+		sprintf(dbpath, "%s/%s", fil_path_to_mysql_datadir,
+								dbinfo.name);
+		srv_normalize_path_for_win(dbpath);
+
+		dbdir = os_file_opendir(dbpath, FALSE);
+
+		if (dbdir != NULL) {
+			/* printf("Opened dir %s\n", dbinfo.name); */
+
+			/* We found a database directory; loop through it,
+			looking for possible .ibd files in it */
+
+			ret = os_file_readdir_next_file(dbpath, dbdir,
+								&fileinfo);
+			while (ret == 0) {
+				/* printf(
+" Looking at file %s\n", fileinfo.name); */
+
+				/* Look at the type of the directory entry
+				itself: the type of the enclosing database
+				directory cannot be unknown at this point */
+
+				if (fileinfo.type == OS_FILE_TYPE_DIR
+				    || fileinfo.type == OS_FILE_TYPE_UNKNOWN) {
+					goto next_file_item;
+				}
+
+				/* We found a symlink or a file */
+				if (strlen(fileinfo.name) > 4
+				    && 0 == strcmp(fileinfo.name +
+						strlen(fileinfo.name) - 4,
+						".ibd")) {
+					/* The name ends in .ibd; try opening
+					the file */
+					fil_load_single_table_tablespace(
+						dbinfo.name, fileinfo.name);
+				}
+next_file_item:
+				ret = os_file_readdir_next_file(dbpath, dbdir,
+								&fileinfo);
+			}
+
+			if (0 != os_file_closedir(dbdir)) {
+				fprintf(stderr,
+"InnoDB: Warning: could not close database directory %s\n", dbpath);
+			}
+		}
+
+next_datadir_item:
+		/* This overwrites whatever value the scan of the database
+		directory left in ret */
+		ret = os_file_readdir_next_file(fil_path_to_mysql_datadir,
+								dir, &dbinfo);
+	}
+
+	ut_free(dbpath);
+
+	/* At the end of directory we should get 1 as the return value, -1
+	if there was an error */
+	if (ret != 1) {
+		fprintf(stderr,
+"InnoDB: Error: os_file_readdir_next_file returned %d in MySQL datadir\n",
+			ret);
+		os_file_closedir(dir);
+
+		return(DB_ERROR);
+	}
+
+	if (0 != os_file_closedir(dir)) {
+		fprintf(stderr,
+"InnoDB: Error: could not close MySQL datadir\n");
+
+		return(DB_ERROR);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/************************************************************************
+Prints a warning to stderr of every single-table tablespace in the memory
+cache whose mark flag is not set. Intended to be called in crash recovery
+after fil_load_single_table_tablespaces() and
+dict_load_single_table_tablespaces(), so that the .ibd files for which there
+is not a data dictionary entry with a matching table name and space id get
+reported as orphaned. */
+
+void
+fil_print_orphaned_tablespaces(void)
+/*================================*/
+{
+	fil_system_t*	system	= fil_system;
+	fil_space_t*	space;
+
+	mutex_enter(&(system->mutex));
+
+	for (space = UT_LIST_GET_FIRST(system->space_list);
+	     space != NULL;
+	     space = UT_LIST_GET_NEXT(space_list, space)) {
+
+		/* Skip logs, the space with id 0, and the spaces which
+		have been marked */
+
+		if (space->purpose != FIL_TABLESPACE || space->id == 0
+		    || space->mark) {
+
+			continue;
+		}
+
+		fprintf(stderr,
+"InnoDB: Warning: tablespace %s of id %lu has no matching table in\n"
+"InnoDB: the InnoDB data dictionary.\n", space->name, (ulong) space->id);
+	}
+
+	mutex_exit(&(system->mutex));
+}
+
+/***********************************************************************
+Returns TRUE if a single-table tablespace does not exist in the memory cache,
+or is being deleted there. */
+
+ibool
+fil_tablespace_deleted_or_being_deleted_in_mem(
+/*===========================================*/
+ /* out: TRUE if does not exist or is being
+ deleted */
+ ulint id, /* in: space id */
+ ib_longlong version)/* in: tablespace_version should be this; if
+ you pass -1 as the value of this, then this
+ parameter is ignored */
+{
+ fil_system_t* system = fil_system;
fil_space_t* space;
- fil_system_t* system = fil_system;
- ulint size;
ut_ad(system);
@@ -729,29 +2892,36 @@ fil_space_get_size(
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
- size = space->size;
-
+ if (space == NULL || space->is_being_deleted) {
+ mutex_exit(&(system->mutex));
+
+ return(TRUE);
+ }
+
+ if (version != ((ib_longlong)-1)
+ && space->tablespace_version != version) {
+ mutex_exit(&(system->mutex));
+
+ return(TRUE);
+ }
+
mutex_exit(&(system->mutex));
- return(size);
+ return(FALSE);
}
/***********************************************************************
-Checks if the pair space, page_no refers to an existing page in a
-tablespace file space. */
+Returns TRUE if a single-table tablespace exists in the memory cache. */
ibool
-fil_check_adress_in_tablespace(
-/*===========================*/
- /* out: TRUE if the address is meaningful */
- ulint id, /* in: space id */
- ulint page_no)/* in: page number */
+fil_tablespace_exists_in_mem(
+/*=========================*/
+ /* out: TRUE if exists */
+ ulint id) /* in: space id */
{
- fil_space_t* space;
fil_system_t* system = fil_system;
- ulint size;
- ibool ret;
-
+ fil_space_t* space;
+
ut_ad(system);
mutex_enter(&(system->mutex));
@@ -759,23 +2929,356 @@ fil_check_adress_in_tablespace(
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
if (space == NULL) {
- ret = FALSE;
- } else {
- size = space->size;
+ mutex_exit(&(system->mutex));
+
+ return(FALSE);
+ }
+
+ mutex_exit(&(system->mutex));
+
+ return(TRUE);
+}
+
+/***********************************************************************
+Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory
+cache. Note that if we have not done a crash recovery at the database startup,
+there may be many tablespaces which are not yet in the memory cache. A space
+matches only if the lookup by id and the lookup by the path
+<fil_path_to_mysql_datadir>/<name>.ibd return the same cached space. */
+
+ibool
+fil_space_for_table_exists_in_mem(
+/*==============================*/
+ /* out: TRUE if a matching tablespace exists
+ in the memory cache */
+ ulint id, /* in: space id */
+ char* name, /* in: table name in the standard
+ 'databasename/tablename' format */
+ ibool mark_space, /* in: in crash recovery, at database startup
+ we mark all spaces which have an associated
+ table in the InnoDB data dictionary, so that
+ we can print a warning about orphaned
+ tablespaces */
+ ibool print_error_if_does_not_exist)
+ /* in: print detailed error information to
+ the .err log if a matching tablespace is
+ not found from memory */
+{
+ fil_system_t* system = fil_system;
+ fil_space_t* namespace;
+ fil_space_t* space;
+ char path[OS_FILE_MAX_PATH];
+
+ ut_ad(system);
+
+ mutex_enter(&(system->mutex));
+
+ sprintf(path, "%s/%s.ibd", fil_path_to_mysql_datadir, name); /* NOTE(review):
+ unbounded write into path[OS_FILE_MAX_PATH]; presumably the datadir
+ path and the table name always fit - confirm */
+ srv_normalize_path_for_win(path);
+
+ /* Look if there is a space with the same id */
+
+ HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+ /* Look if there is a space with the same name; the name is the
+ directory path from the datadir to the file */
+
+ HASH_SEARCH(name_hash, system->name_hash,
+ ut_fold_string(path), namespace,
+ 0 == strcmp(namespace->name, path));
+ if (space && space == namespace) {
+ /* Found */
+
+ if (mark_space) {
+ space->mark = TRUE;
+ }
+
+ mutex_exit(&(system->mutex));
+
+ return(TRUE);
+ }
+
+ if (!print_error_if_does_not_exist) {
+
+ mutex_exit(&(system->mutex));
+
+ return(FALSE);
+ }
- if (page_no > size) {
- ret = FALSE;
- } else if (space->purpose != FIL_TABLESPACE) {
- ret = FALSE;
+ if (space == NULL) {
+ if (namespace == NULL) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Error: table %s\n"
+"InnoDB: in InnoDB data dictionary has tablespace id %lu,\n"
+"InnoDB: but tablespace with that id or name does not exist. Have\n"
+"InnoDB: you deleted or moved .ibd files?\n",
+ name, (ulong) id);
} else {
- ret = TRUE;
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Error: table %s\n"
+"InnoDB: in InnoDB data dictionary has tablespace id %lu,\n"
+"InnoDB: but tablespace with that id does not exist. There is\n"
+"InnoDB: a tablespace of name %s and id %lu, though. Have\n"
+"InnoDB: you deleted or moved .ibd files?\n",
+ name, (ulong) id, namespace->name,
+ (ulong) namespace->id);
+ }
+ fprintf(stderr,
+"InnoDB: You can look from section 15.1 of http://www.innodb.com/ibman.html\n"
+"InnoDB: how to resolve the issue.\n");
+
+ mutex_exit(&(system->mutex));
+
+ return(FALSE);
+ }
+
+ if (0 != strcmp(space->name, path)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Error: table %s\n"
+"InnoDB: in InnoDB data dictionary has tablespace id %lu,\n"
+"InnoDB: but tablespace with that id has name %s.\n"
+"InnoDB: Have you deleted or moved .ibd files?\n", name, (ulong) id, space->name);
+
+ if (namespace != NULL) {
+ fprintf(stderr,
+"InnoDB: There is a tablespace with the right name\n"
+"InnoDB: %s, but its id is %lu.\n", namespace->name, (ulong) namespace->id);
}
+
+ fprintf(stderr,
+"InnoDB: You can look from section 15.1 of http://www.innodb.com/ibman.html\n"
+"InnoDB: how to resolve the issue.\n");
+
+ mutex_exit(&(system->mutex));
+
+ return(FALSE);
+ }
+
+ mutex_exit(&(system->mutex));
+
+ return(FALSE); /* reached only if the space found by id has the right
+ name but the name hash lookup did not return that same
+ space; no diagnostic is printed for this case */
+}
+
+/***********************************************************************
+Looks up the space id of the single-table tablespace of a table in the
+tablespace memory cache. The lookup key is the file path
+<fil_path_to_mysql_datadir>/<name>.ibd, after it has been passed through
+srv_normalize_path_for_win(). */
+static
+ulint
+fil_get_space_id_for_table(
+/*=======================*/
+				/* out: space id, ULINT_UNDEFINED if not
+				found */
+	char*	name)		/* in: table name in the standard
+				'databasename/tablename' format */
+{
+	fil_system_t*	system		= fil_system;
+	fil_space_t*	namespace;
+	ulint		id		= ULINT_UNDEFINED;
+	char*		path;
+
+	ut_ad(system);
+
+	mutex_enter(&(system->mutex));
+
+	/* Size the path buffer from the actual string lengths: a fixed
+	char[OS_FILE_MAX_PATH] filled with sprintf() could be overrun by a
+	long datadir path or table name. sizeof "/.ibd" accounts for the
+	separator, the suffix and the terminating NUL.
+	NOTE(review): assumes srv_normalize_path_for_win() rewrites the
+	string in place without lengthening it - confirm. */
+
+	path = mem_alloc(strlen(fil_path_to_mysql_datadir) + strlen(name)
+							+ sizeof "/.ibd");
+
+	sprintf(path, "%s/%s.ibd", fil_path_to_mysql_datadir, name);
+	srv_normalize_path_for_win(path);
+
+	/* Look if there is a space with the same name; the name is the
+	directory path to the file */
+
+	HASH_SEARCH(name_hash, system->name_hash,
+			ut_fold_string(path), namespace,
+			0 == strcmp(namespace->name, path));
+	if (namespace) {
+		id = namespace->id;
+	}
+
+	/* The path was needed only as the lookup key */
+	mem_free(path);
+
+	mutex_exit(&(system->mutex));
+
+	return(id);
+}
+
+/**************************************************************************
+Tries to extend a data file so that it would accommodate the number of pages
+given. The tablespace must be cached in the memory cache. If the space is big
+enough already, does nothing. Otherwise zero-filled pages are written to the
+last file of the space in chunks of at most 1 MB, and the space is flushed
+with fil_flush() before returning. */
+
+ibool
+fil_extend_space_to_desired_size(
+/*=============================*/
+ /* out: TRUE if success */
+ ulint* actual_size, /* out: size of the space after extension;
+ if we ran out of disk space this may be lower
+ than the desired size */
+ ulint space_id, /* in: space id, must be != 0 */
+ ulint size_after_extend)/* in: desired size in pages after the
+ extension; if the current space size is bigger
+ than this already, the function does nothing */
+{
+ fil_system_t* system = fil_system;
+ fil_node_t* node;
+ fil_space_t* space;
+ byte* buf2;
+ byte* buf;
+ ulint start_page_no;
+ ulint file_start_page_no;
+ ulint n_pages;
+ ulint offset_high;
+ ulint offset_low;
+ ibool success = TRUE;
+
+ fil_mutex_enter_and_prepare_for_io(space_id);
+
+ HASH_SEARCH(hash, system->spaces, space_id, space,
+ space->id == space_id);
+ ut_a(space);
+
+ if (space->size >= size_after_extend) {
+ /* Space already big enough */
+
+ *actual_size = space->size;
+
+ mutex_exit(&(system->mutex));
+
+ return(TRUE);
}
+ node = UT_LIST_GET_LAST(space->chain);
+
+ fil_node_prepare_for_io(node, system, space);
+
+ /* Extend 1 MB at a time */
+
+ buf2 = mem_alloc(1024 * 1024 + UNIV_PAGE_SIZE); /* one page of slack for
+ the alignment below */
+ buf = ut_align(buf2, UNIV_PAGE_SIZE);
+
+ memset(buf, '\0', 1024 * 1024);
+
+ start_page_no = space->size;
+ file_start_page_no = space->size - node->size; /* presumably the page
+ number in the space where the last file starts -
+ confirm that space->size is the sum of the node
+ sizes */
+
+ while (start_page_no < size_after_extend) {
+ n_pages = size_after_extend - start_page_no;
+
+ if (n_pages > (1024 * 1024) / UNIV_PAGE_SIZE) {
+ n_pages = (1024 * 1024) / UNIV_PAGE_SIZE;
+ }
+
+ offset_high = (start_page_no - file_start_page_no)
+ / (4096 * ((1024 * 1024) / UNIV_PAGE_SIZE));
+ offset_low = ((start_page_no - file_start_page_no)
+ % (4096 * ((1024 * 1024) / UNIV_PAGE_SIZE)))
+ * UNIV_PAGE_SIZE; /* 4096 * (1 MB / page size) pages
+ make up 4 GB (2^32 bytes): offset_high
+ counts whole 4 GB units, offset_low is
+ the byte offset within the unit */
+#ifdef UNIV_HOTBACKUP
+ success = os_file_write(node->name, node->handle, buf,
+ offset_low, offset_high,
+ UNIV_PAGE_SIZE * n_pages);
+#else
+ success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC,
+ node->name, node->handle, buf,
+ offset_low, offset_high,
+ UNIV_PAGE_SIZE * n_pages,
+ NULL, NULL);
+#endif
+ if (success) {
+ node->size += n_pages;
+ space->size += n_pages;
+
+ os_has_said_disk_full = FALSE;
+ } else {
+ /* Let us measure the size of the file to determine
+ how much we were able to extend it */
+
+ n_pages = ((ulint)
+ (os_file_get_size_as_iblonglong(node->handle)
+ / UNIV_PAGE_SIZE)) - node->size;
+
+ node->size += n_pages;
+ space->size += n_pages;
+
+ break;
+ }
+
+ start_page_no += n_pages;
+ }
+
+ mem_free(buf2);
+
+ fil_node_complete_io(node, system, OS_FILE_WRITE);
+
+ *actual_size = space->size;
+ /*
+ printf("Extended %s to %lu, actual size %lu pages\n", space->name,
+ size_after_extend, *actual_size); */
+ mutex_exit(&(system->mutex));
+
+ fil_flush(space_id);
+
+ return(success);
+}
+
+#ifdef UNIV_HOTBACKUP
+/************************************************************************
+Extends all tablespaces to the size stored in the space header. During the
+ibbackup --apply-log phase we extended the spaces on-demand so that log records
+could be applied, but that may have left spaces still too small compared to
+the size stored in the space header. If a space cannot be extended to the
+stored size, prints an error and terminates the process with exit(1). */
+
+void
+fil_extend_tablespaces_to_stored_len(void)
+/*======================================*/
+{
+ fil_system_t* system = fil_system;
+ fil_space_t* space;
+ byte* buf;
+ ulint actual_size;
+ ulint size_in_header;
+ ulint error;
+ ibool success;
+
+ buf = mem_alloc(UNIV_PAGE_SIZE);
+
+ mutex_enter(&(system->mutex));
+
+ space = UT_LIST_GET_FIRST(system->space_list);
+
+ while (space) {
+ ut_a(space->purpose == FIL_TABLESPACE);
+
+ mutex_exit(&(system->mutex)); /* no need to protect with a
+ mutex, because this is a single-
+ threaded operation */
+ /* Read the first page of the space: it holds the space
+ header with the stored size */
+ error = fil_read(TRUE, space->id, 0, 0, UNIV_PAGE_SIZE, buf,
+ NULL);
+ ut_a(error == DB_SUCCESS);
+
+ size_in_header = fsp_get_size_low(buf);
+
+ success = fil_extend_space_to_desired_size(&actual_size,
+ space->id, size_in_header);
+ if (!success) {
+ /* The ulint values are cast to ulong to match the
+ %lu conversions, as is done elsewhere in this file */
+ fprintf(stderr,
+"InnoDB: Error: could not extend the tablespace of %s\n"
+"InnoDB: to the size stored in header, %lu pages;\n"
+"InnoDB: size after extension %lu pages\n"
+"InnoDB: Check that you have free disk space and retry!\n", space->name,
+ (ulong) size_in_header, (ulong) actual_size);
+ exit(1);
+ }
+
+ mutex_enter(&(system->mutex));
+
+ space = UT_LIST_GET_NEXT(space_list, space);
+ }
+
mutex_exit(&(system->mutex));
- return(ret);
+ mem_free(buf);
}
+#endif
+
+/*========== RESERVE FREE EXTENTS (for a B-tree split, for example) ===*/
/***********************************************************************
Tries to reserve free extents in a file space. */
@@ -788,8 +3291,8 @@ fil_space_reserve_free_extents(
ulint n_free_now, /* in: number of free extents now */
ulint n_to_reserve) /* in: how many one wants to reserve */
{
- fil_space_t* space;
fil_system_t* system = fil_system;
+ fil_space_t* space;
ibool success;
ut_ad(system);
@@ -798,6 +3301,8 @@ fil_space_reserve_free_extents(
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+ ut_a(space);
+
if (space->n_reserved_extents + n_to_reserve > n_free_now) {
success = FALSE;
} else {
@@ -819,8 +3324,8 @@ fil_space_release_free_extents(
ulint id, /* in: space id */
ulint n_reserved) /* in: how many one reserved */
{
- fil_space_t* space;
fil_system_t* system = fil_system;
+ fil_space_t* space;
ut_ad(system);
@@ -828,6 +3333,7 @@ fil_space_release_free_extents(
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+ ut_a(space);
ut_a(space->n_reserved_extents >= n_reserved);
space->n_reserved_extents -= n_reserved;
@@ -844,8 +3350,8 @@ fil_space_get_n_reserved_extents(
/*=============================*/
ulint id) /* in: space id */
{
- fil_space_t* space;
fil_system_t* system = fil_system;
+ fil_space_t* space;
ulint n;
ut_ad(system);
@@ -863,208 +3369,99 @@ fil_space_get_n_reserved_extents(
return(n);
}
+/*============================ FILE I/O ================================*/
+
/************************************************************************
+NOTE: you must call fil_mutex_enter_and_prepare_for_io() first!
+
Prepares a file node for i/o. Opens the file if it is closed. Updates the
pending i/o's field in the node and the system appropriately. Takes the node
-off the LRU list if it is in the LRU list. */
+off the LRU list if it is in the LRU list. The caller must hold the fil_sys
+mutex. */
static
void
fil_node_prepare_for_io(
/*====================*/
fil_node_t* node, /* in: file node */
- fil_system_t* system, /* in: file system */
+ fil_system_t* system, /* in: tablespace memory cache */
fil_space_t* space) /* in: space */
{
- ibool ret;
- fil_node_t* last_node;
-
ut_ad(node && system && space);
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&(system->mutex)));
#endif /* UNIV_SYNC_DEBUG */
+ if (system->n_open > system->max_n_open + 5) { /* only a warning is
+ printed; the i/o goes ahead */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Warning: open files %lu exceeds the limit %lu\n",
+ (ulong) system->n_open,
+ (ulong) system->max_n_open);
+ }
+
if (node->open == FALSE) {
- /* File is closed */
+ /* File is closed: open it */
ut_a(node->n_pending == 0);
- /* If too many files are open, close one */
-
- if (system->n_open_pending + UT_LIST_GET_LEN(system->LRU)
- == system->max_n_open) {
-
- ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
-
- last_node = UT_LIST_GET_LAST(system->LRU);
-
- if (last_node == NULL) {
- fprintf(stderr,
- "InnoDB: Error: cannot close any file to open another for i/o\n"
- "InnoDB: Pending i/o's on %lu files exist\n",
- system->n_open_pending);
-
- ut_error;
- }
-
- fil_node_close(last_node, system);
- }
-
- if (space->purpose == FIL_LOG) {
- node->handle = os_file_create(node->name, OS_FILE_OPEN,
- OS_FILE_AIO, OS_LOG_FILE, &ret);
- } else {
- node->handle = os_file_create(node->name, OS_FILE_OPEN,
- OS_FILE_AIO, OS_DATA_FILE, &ret);
- }
-
- ut_a(ret);
-
- node->open = TRUE;
-
- system->n_open_pending++;
- node->n_pending = 1;
-
- /* File was closed: the node was not in the LRU list */
-
- return;
+ fil_node_open_file(node, system, space);
}
- /* File is open */
- if (node->n_pending == 0) {
+ if (node->n_pending == 0 && space->purpose == FIL_TABLESPACE
+ && space->id != 0) { /* fil_validate() asserts that only nodes
+ of such spaces are in the LRU list */
/* The node is in the LRU list, remove it */
- UT_LIST_REMOVE(LRU, system->LRU, node);
-
- system->n_open_pending++;
- node->n_pending = 1;
- } else {
- /* There is already a pending i/o-op on the file: the node is
- not in the LRU list */
+ ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
- node->n_pending++;
+ UT_LIST_REMOVE(LRU, system->LRU, node);
}
+
+ node->n_pending++; /* fil_node_complete_io() decrements this */
}
/************************************************************************
Updates the data structures when an i/o operation finishes. Updates the
-pending i/os field in the node and the system appropriately. Puts the node
-in the LRU list if there are no other pending i/os. */
+pending i/o's field in the node appropriately. */
static
void
fil_node_complete_io(
/*=================*/
fil_node_t* node, /* in: file node */
- fil_system_t* system, /* in: file system */
- ulint type) /* in: OS_FILE_WRITE or ..._READ */
+ fil_system_t* system, /* in: tablespace memory cache */
+ ulint type) /* in: OS_FILE_WRITE or OS_FILE_READ; marks
+ the node as modified if
+ type == OS_FILE_WRITE */
{
ut_ad(node);
ut_ad(system);
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&(system->mutex)));
#endif /* UNIV_SYNC_DEBUG */
+
ut_a(node->n_pending > 0);
node->n_pending--;
- if (type != OS_FILE_READ) {
- node->is_modified = TRUE;
+ if (type == OS_FILE_WRITE) {
+ system->modification_counter++;
+ node->modification_counter = system->modification_counter; /* fil_flush()
+ flushes the file when this exceeds
+ node->flush_counter */
}
- if (node->n_pending == 0) {
+ if (node->n_pending == 0 && node->space->purpose == FIL_TABLESPACE
+ && node->space->id != 0) { /* mirrors the LRU removal condition
+ in fil_node_prepare_for_io() */
/* The node must be put back to the LRU list */
UT_LIST_ADD_FIRST(LRU, system->LRU, node);
-
- ut_a(system->n_open_pending > 0);
-
- system->n_open_pending--;
-
- if (system->n_open_pending == system->max_n_open - 1) {
-
- os_event_set(system->can_open);
- }
- }
-}
-
-/**************************************************************************
-Tries to extend a data file by the number of pages given. Any fractions of a
-megabyte are ignored. */
-
-ibool
-fil_extend_last_data_file(
-/*======================*/
- /* out: TRUE if success, also if we run
- out of disk space we may return TRUE */
- ulint* actual_increase,/* out: number of pages we were able to
- extend, here the orginal size of the file and
- the resulting size of the file are rounded
- downwards to a full megabyte, and the
- difference expressed in pages is returned */
- ulint size_increase) /* in: try to extend this many pages */
-{
- fil_node_t* node;
- fil_space_t* space;
- fil_system_t* system = fil_system;
- byte* buf2;
- byte* buf;
- ibool success;
- ulint i;
-
- mutex_enter(&(system->mutex));
-
- HASH_SEARCH(hash, system->spaces, 0, space, space->id == 0);
-
- ut_a(space);
-
- node = UT_LIST_GET_LAST(space->chain);
-
- fil_node_prepare_for_io(node, system, space);
-
- buf2 = mem_alloc(1024 * 1024 + UNIV_PAGE_SIZE);
- buf = ut_align(buf2, UNIV_PAGE_SIZE);
-
- memset(buf, '\0', 1024 * 1024);
-
- for (i = 0; i < size_increase / ((1024 * 1024) / UNIV_PAGE_SIZE); i++) {
-
- /* If we use native Windows aio, then also this write is
- done using it */
-
- success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC,
- node->name, node->handle, buf,
- (node->size << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFF,
- node->size >> (32 - UNIV_PAGE_SIZE_SHIFT),
- 1024 * 1024, NULL, NULL);
-
- if (!success) {
- break;
- }
-
- node->size += ((1024 * 1024) / UNIV_PAGE_SIZE);
- space->size += ((1024 * 1024) / UNIV_PAGE_SIZE);
-
- os_has_said_disk_full = FALSE;
}
-
- mem_free(buf2);
-
- fil_node_complete_io(node, system, OS_FILE_WRITE);
-
- mutex_exit(&(system->mutex));
-
- *actual_increase = i * ((1024 * 1024) / UNIV_PAGE_SIZE);
-
- fil_flush(0);
-
- srv_data_file_sizes[srv_n_data_files - 1] += *actual_increase;
-
- return(TRUE);
}
/************************************************************************
Reads or writes data. This operation is asynchronous (aio). */
-void
+ulint
fil_io(
/*===*/
+ /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED
+ if we are trying to do i/o on a tablespace
+ which does not exist */
ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE,
ORed to OS_FILE_LOG, if a log i/o
and ORed to OS_AIO_SIMULATED_WAKE_LATER
@@ -1089,17 +3486,15 @@ fil_io(
void* message) /* in: message for aio handler if non-sync
aio used, else ignored */
{
+ fil_system_t* system = fil_system;
ulint mode;
fil_space_t* space;
fil_node_t* node;
ulint offset_high;
ulint offset_low;
- fil_system_t* system;
- os_event_t event;
ibool ret;
ulint is_log;
ulint wake_later;
- ulint count;
is_log = type & OS_FILE_LOG;
type = type & ~OS_FILE_LOG;
@@ -1110,7 +3505,7 @@ fil_io(
ut_ad(byte_offset < UNIV_PAGE_SIZE);
ut_ad(buf);
ut_ad(len > 0);
- ut_ad((1 << UNIV_PAGE_SIZE_SHIFT) == UNIV_PAGE_SIZE);
+ ut_a((1 << UNIV_PAGE_SIZE_SHIFT) == UNIV_PAGE_SIZE);
ut_ad(fil_validate());
#ifndef UNIV_LOG_DEBUG
/* ibuf bitmap pages must be read in the sync aio mode: */
@@ -1132,82 +3527,47 @@ fil_io(
mode = OS_AIO_NORMAL;
}
- system = fil_system;
+ /* Reserve the fil_system mutex and make sure that we can open at
+ least one file while holding it, if the file is not already open */
- count = 0;
-loop:
- count++;
-
- /* NOTE that there is a possibility of a hang here:
- if the read i/o-handler thread needs to complete
- a read by reading from the insert buffer, it may need to
- post another read. But if the maximum number of files
- are already open, it cannot proceed from here! */
-
- mutex_enter(&(system->mutex));
+ fil_mutex_enter_and_prepare_for_io(space_id);
- if (count < 500 && !is_log && !ibuf_inside()
- && system->n_open_pending >= (3 * system->max_n_open) / 4) {
-
- /* We are not doing an ibuf operation: leave a
- safety margin of openable files for possible ibuf
- merges needed in page read completion */
-
- mutex_exit(&(system->mutex));
-
- /* Wake the i/o-handler threads to make sure pending
- i/o's are handled and eventually we can open the file */
-
- os_aio_simulated_wake_handler_threads();
-
- os_thread_sleep(100000);
-
- if (count > 50) {
- fprintf(stderr,
- "InnoDB: Warning: waiting for file closes to proceed\n"
- "InnoDB: round %lu\n", count);
- }
-
- goto loop;
- }
-
- if (system->n_open_pending == system->max_n_open) {
-
- /* It is not sure we can open the file if it is closed: wait */
-
- event = system->can_open;
- os_event_reset(event);
-
+ HASH_SEARCH(hash, system->spaces, space_id, space,
+ space->id == space_id);
+ if (!space) {
mutex_exit(&(system->mutex));
- /* Wake the i/o-handler threads to make sure pending
- i/o's are handled and eventually we can open the file */
-
- os_aio_simulated_wake_handler_threads();
-
+ ut_print_timestamp(stderr);
fprintf(stderr,
- "InnoDB: Warning: max allowed number of files is open\n");
+" InnoDB: Error: trying to do i/o to a tablespace which does not exist.\n"
+"InnoDB: i/o type %lu, space id %lu, page no. %lu, i/o length %lu bytes\n",
+ (ulong) type, (ulong) space_id, (ulong) block_offset,
+ (ulong) len);
- os_event_wait(event);
-
- goto loop;
- }
-
- HASH_SEARCH(hash, system->spaces, space_id, space,
- space->id == space_id);
- ut_a(space);
+ return(DB_TABLESPACE_DELETED);
+ }
ut_ad((mode != OS_AIO_IBUF) || (space->purpose == FIL_TABLESPACE));
node = UT_LIST_GET_FIRST(space->chain);
for (;;) {
+ if (space->id != 0 && node->size == 0) {
+ /* We do not know the size of a single-table tablespace
+ before we open the file */
+
+ break;
+ }
+
if (node == NULL) {
fprintf(stderr,
- "InnoDB: Error: trying to access page number %lu in space %lu\n"
+ "InnoDB: Error: trying to access page number %lu in space %lu,\n"
+ "InnoDB: space name %s,\n"
"InnoDB: which is outside the tablespace bounds.\n"
"InnoDB: Byte offset %lu, len %lu, i/o type %lu\n",
- block_offset, space_id, byte_offset, len, type);
+ (ulong) block_offset, (ulong) space_id,
+ space->name, (ulong) byte_offset, (ulong) len,
+ (ulong) type);
ut_error;
}
@@ -1224,13 +3584,29 @@ loop:
/* Open file if closed */
fil_node_prepare_for_io(node, system, space);
+ /* Check that at least the start offset is within the bounds of a
+ single-table tablespace */
+ if (space->purpose == FIL_TABLESPACE && space->id != 0
+ && node->size <= block_offset) {
+
+ fprintf(stderr,
+ "InnoDB: Error: trying to access page number %lu in space %lu,\n"
+ "InnoDB: space name %s,\n"
+ "InnoDB: which is outside the tablespace bounds.\n"
+ "InnoDB: Byte offset %lu, len %lu, i/o type %lu\n",
+ (ulong) block_offset, (ulong) space_id,
+ space->name, (ulong) byte_offset, (ulong) len,
+ (ulong) type);
+ ut_a(0);
+ }
+
/* Now we have made the changes in the data structures of system */
mutex_exit(&(system->mutex));
/* Calculate the low 32 bits and the high 32 bits of the file offset */
offset_high = (block_offset >> (32 - UNIV_PAGE_SIZE_SHIFT));
- offset_low = ((block_offset << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFF)
+ offset_low = ((block_offset << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL)
+ byte_offset;
ut_a(node->size - block_offset >=
@@ -1241,9 +3617,20 @@ loop:
ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0);
+#ifdef UNIV_HOTBACKUP
+ /* In ibbackup do normal i/o, not aio */
+ if (type == OS_FILE_READ) {
+ ret = os_file_read(node->handle, buf, offset_low, offset_high,
+ len);
+ } else {
+ ret = os_file_write(node->name, node->handle, buf,
+ offset_low, offset_high, len);
+ }
+#else
/* Queue the aio request */
ret = os_aio(type, mode | wake_later, node->name, node->handle, buf,
offset_low, offset_high, len, node, message);
+#endif
ut_a(ret);
if (mode == OS_AIO_SYNC) {
@@ -1258,6 +3645,8 @@ loop:
ut_ad(fil_validate());
}
+
+ return(DB_SUCCESS);
}
/************************************************************************
@@ -1265,9 +3654,12 @@ Reads data from a space to a buffer. Remember that the possible incomplete
blocks at the end of file are ignored: they are not taken into account when
calculating the byte offset within a space. */
-void
+ulint
fil_read(
/*=====*/
+ /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED
+ if we are trying to do i/o on a tablespace
+ which does not exist */
ibool sync, /* in: TRUE if synchronous aio is desired */
ulint space_id, /* in: space id */
ulint block_offset, /* in: offset in number of blocks */
@@ -1281,8 +3673,8 @@ fil_read(
void* message) /* in: message for aio handler if non-sync
aio used, else ignored */
{
- fil_io(OS_FILE_READ, sync, space_id, block_offset, byte_offset, len,
- buf, message);
+ return(fil_io(OS_FILE_READ, sync, space_id, block_offset,
+ byte_offset, len, buf, message));
}
/************************************************************************
@@ -1290,9 +3682,12 @@ Writes data to a space from a buffer. Remember that the possible incomplete
blocks at the end of file are ignored: they are not taken into account when
calculating the byte offset within a space. */
-void
+ulint
fil_write(
/*======*/
+ /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED
+ if we are trying to do i/o on a tablespace
+ which does not exist */
ibool sync, /* in: TRUE if synchronous aio is desired */
ulint space_id, /* in: space id */
ulint block_offset, /* in: offset in number of blocks */
@@ -1306,8 +3701,8 @@ fil_write(
void* message) /* in: message for aio handler if non-sync
aio used, else ignored */
{
- fil_io(OS_FILE_WRITE, sync, space_id, block_offset, byte_offset, len,
- buf, message);
+ return(fil_io(OS_FILE_WRITE, sync, space_id, block_offset,
+ byte_offset, len, buf, message));
}
/**************************************************************************
@@ -1322,19 +3717,19 @@ fil_aio_wait(
ulint segment) /* in: the number of the segment in the aio
array to wait for */
{
+ fil_system_t* system = fil_system;
ibool ret;
fil_node_t* fil_node;
- fil_system_t* system = fil_system;
void* message;
ulint type;
ut_ad(fil_validate());
if (os_aio_use_native_aio) {
- srv_io_thread_op_info[segment] = (char *) "native aio handle";
+ srv_io_thread_op_info[segment] = (char *) "handle native aio";
#ifdef WIN_ASYNC_IO
- ret = os_aio_windows_handle(segment, 0, &fil_node, &message,
- &type);
+ ret = os_aio_windows_handle(segment, 0, (void**) &fil_node,
+ &message, &type);
#elif defined(POSIX_ASYNC_IO)
ret = os_aio_posix_handle(segment, &fil_node, &message);
#else
@@ -1342,7 +3737,7 @@ fil_aio_wait(
ut_error;
#endif
} else {
- srv_io_thread_op_info[segment] =(char *)"simulated aio handle";
+ srv_io_thread_op_info[segment] =(char *)"handle simulated aio";
ret = os_aio_simulated_handle(segment, (void**) &fil_node,
&message, &type);
@@ -1361,6 +3756,10 @@ fil_aio_wait(
ut_ad(fil_validate());
/* Do the i/o handling */
+ /* IMPORTANT: since i/o handling for reads will read also the insert
+ buffer in tablespace 0, you have to be very careful not to introduce
+ deadlocks in the i/o system. We keep tablespace 0 data files always
+ open, and use a special i/o thread to serve insert buffer requests. */
if (buf_pool_is_block(message)) {
srv_io_thread_op_info[segment] =
@@ -1373,7 +3772,8 @@ fil_aio_wait(
}
/**************************************************************************
-Flushes to disk possible writes cached by the OS. */
+Flushes to disk possible writes cached by the OS. If the space does not exist
+or is being dropped, does not do anything. */
void
fil_flush(
@@ -1385,41 +3785,79 @@ fil_flush(
fil_space_t* space;
fil_node_t* node;
os_file_t file;
+ ib_longlong old_mod_counter;
mutex_enter(&(system->mutex));
HASH_SEARCH(hash, system->spaces, space_id, space,
- space->id == space_id);
- ut_a(space);
+ space->id == space_id);
+ if (!space || space->is_being_deleted) {
+ mutex_exit(&(system->mutex));
+
+ return;
+ }
+ space->n_pending_flushes++; /* prevent dropping of the space while
+ we are flushing */
node = UT_LIST_GET_FIRST(space->chain);
while (node) {
- if (node->open && node->is_modified) {
- file = node->handle;
+ if (node->modification_counter > node->flush_counter) {
+ ut_a(node->open);
+
+ /* We want to flush the changes at least up to
+ old_mod_counter */
+ old_mod_counter = node->modification_counter;
- node->is_modified = FALSE;
-
if (space->purpose == FIL_TABLESPACE) {
fil_n_pending_tablespace_flushes++;
} else {
fil_n_pending_log_flushes++;
}
+#ifdef __WIN__
+ if (node->is_raw_disk) {
- mutex_exit(&(system->mutex));
+ goto skip_flush;
+ }
+#endif
+retry:
+ if (node->n_pending_flushes > 0) {
+ /* We want to avoid calling os_file_flush() on
+ the file twice at the same time, because we do
+ not know what bugs OS's may contain in file
+ i/o; sleep for a while */
+
+ mutex_exit(&(system->mutex));
+
+ os_thread_sleep(20000);
+
+ mutex_enter(&(system->mutex));
+
+ if (node->flush_counter >= old_mod_counter) {
+
+ goto skip_flush;
+ }
+
+ goto retry;
+ }
+
+ ut_a(node->open);
+ file = node->handle;
+ node->n_pending_flushes++;
- /* Note that it is not certain, when we have
- released the mutex above, that the file of the
- handle is still open: we assume that the OS
- will not crash or trap even if we pass a handle
- to a closed file below in os_file_flush! */
+ mutex_exit(&(system->mutex));
/* printf("Flushing to file %s\n", node->name); */
-
- os_file_flush(file);
-
+ os_file_flush(file);
+
mutex_enter(&(system->mutex));
+ node->n_pending_flushes--;
+skip_flush:
+ if (node->flush_counter < old_mod_counter) {
+ node->flush_counter = old_mod_counter;
+ }
+
if (space->purpose == FIL_TABLESPACE) {
fil_n_pending_tablespace_flushes--;
} else {
@@ -1430,11 +3868,13 @@ fil_flush(
node = UT_LIST_GET_NEXT(chain, node);
}
+ space->n_pending_flushes--;
+
mutex_exit(&(system->mutex));
}
/**************************************************************************
-Flushes to disk writes in file spaces of the given type possibly cached by
+Flushes to disk the writes in file spaces of the given type possibly cached by
the OS. */
void
@@ -1451,13 +3891,17 @@ fil_flush_file_spaces(
while (space) {
if (space->purpose == purpose) {
+ space->n_pending_flushes++; /* prevent dropping of the
+ space while we are
+ flushing */
mutex_exit(&(system->mutex));
fil_flush(space->id);
mutex_enter(&(system->mutex));
- }
+ space->n_pending_flushes--;
+ }
space = UT_LIST_GET_NEXT(space_list, space);
}
@@ -1465,20 +3909,18 @@ fil_flush_file_spaces(
}
/**********************************************************************
-Checks the consistency of the file system. */
+Checks the consistency of the tablespace cache. */
ibool
fil_validate(void)
/*==============*/
/* out: TRUE if ok */
{
+ fil_system_t* system = fil_system;
fil_space_t* space;
fil_node_t* fil_node;
- ulint pending_count = 0;
- fil_system_t* system;
+ ulint n_open = 0;
ulint i;
-
- system = fil_system;
mutex_enter(&(system->mutex));
@@ -1489,36 +3931,35 @@ fil_validate(void)
space = HASH_GET_FIRST(system->spaces, i);
while (space != NULL) {
-
UT_LIST_VALIDATE(chain, fil_node_t, space->chain);
fil_node = UT_LIST_GET_FIRST(space->chain);
while (fil_node != NULL) {
-
if (fil_node->n_pending > 0) {
-
- pending_count++;
ut_a(fil_node->open);
}
+ if (fil_node->open) {
+ n_open++;
+ }
fil_node = UT_LIST_GET_NEXT(chain, fil_node);
}
-
space = HASH_GET_NEXT(hash, space);
}
}
- ut_a(pending_count == system->n_open_pending);
+ ut_a(system->n_open == n_open);
UT_LIST_VALIDATE(LRU, fil_node_t, system->LRU);
fil_node = UT_LIST_GET_FIRST(system->LRU);
while (fil_node != NULL) {
-
ut_a(fil_node->n_pending == 0);
ut_a(fil_node->open);
+ ut_a(fil_node->space->purpose == FIL_TABLESPACE);
+ ut_a(fil_node->space->id != 0);
fil_node = UT_LIST_GET_NEXT(LRU, fil_node);
}
@@ -1586,4 +4027,4 @@ fil_page_get_type(
ut_ad(page);
return(mach_read_from_2(page + FIL_PAGE_TYPE));
-}
+}
diff --git a/innobase/fsp/fsp0fsp.c b/innobase/fsp/fsp0fsp.c
index 49885df07d7..9be6e1a6e50 100644
--- a/innobase/fsp/fsp0fsp.c
+++ b/innobase/fsp/fsp0fsp.c
@@ -27,6 +27,10 @@ Created 11/29/1995 Heikki Tuuri
#include "dict0mem.h"
#include "log0log.h"
+
+#define FSP_HEADER_OFFSET FIL_PAGE_DATA /* Offset of the space header
+ within a file page */
+
/* The data structures in files are defined just as byte strings in C */
typedef byte fsp_header_t;
typedef byte xdes_t;
@@ -38,10 +42,9 @@ File space header data structure: this data structure is contained in the
first page of a space. The space for this header is reserved in every extent
descriptor page, but used only in the first. */
-#define FSP_HEADER_OFFSET FIL_PAGE_DATA /* Offset of the space header
- within a file page */
/*-------------------------------------*/
-#define FSP_NOT_USED 0 /* this field contained a value up to
+#define FSP_SPACE_ID 0 /* space id */
+#define FSP_NOT_USED 4 /* this field contained a value up to
which we know that the modifications
in the database have been flushed to
the file space; not used now */
@@ -50,7 +53,13 @@ descriptor page, but used only in the first. */
#define FSP_FREE_LIMIT 12 /* Minimum page number for which the
free list has not been initialized:
the pages >= this limit are, by
- definition, free */
+ definition, free; note that in a
+ single-table tablespace where size
+ < 64 pages, this number is 64, i.e.,
+ we have initialized the space
+ for the first extent, but have not
+ physically allocated those pages to the
+ file */
#define FSP_LOWEST_NO_WRITE 16 /* The lowest page offset for which
the page has not been written to disk
(if it has been written, we know that
@@ -83,7 +92,6 @@ descriptor page, but used only in the first. */
#define FSP_FREE_ADD 4 /* this many free extents are added
to the free list from above
FSP_FREE_LIMIT at a time */
-
/* FILE SEGMENT INODE
==================
@@ -263,9 +271,14 @@ static
void
fsp_fill_free_list(
/*===============*/
- ulint space, /* in: space */
- fsp_header_t* header, /* in: space header */
- mtr_t* mtr); /* in: mtr */
+ ibool init_space, /* in: TRUE if this is a single-table
+ tablespace and we are only initing
+ the tablespace's first extent
+ descriptor page and ibuf bitmap page;
+ then we do not allocate more extents */
+ ulint space, /* in: space */
+ fsp_header_t* header, /* in: space header */
+ mtr_t* mtr); /* in: mtr */
/**************************************************************************
Allocates a single free page from a segment. This function implements
the intelligent allocation strategy which tries to minimize file space
@@ -286,6 +299,19 @@ fseg_alloc_free_page_low(
FSP_UP, FSP_NO_DIR */
mtr_t* mtr); /* in: mtr handle */
+
+/**************************************************************************
+Reads the tablespace size, in pages, from the FSP_SIZE field of the header page. */
+
+ulint
+fsp_get_size_low(
+/*=============*/
+ /* out: value of the FSP_SIZE field, i.e. the size in pages */
+ page_t* page) /* in: header page (page 0 in the tablespace) */
+{
+ return(mach_read_from_4(page + FSP_HEADER_OFFSET + FSP_SIZE));
+}
+
/**************************************************************************
Gets a pointer to the space header and x-locks its page. */
UNIV_INLINE
@@ -569,7 +595,7 @@ xdes_init(
ut_ad((XDES_SIZE - XDES_BITMAP) % 4 == 0);
for (i = XDES_BITMAP; i < XDES_SIZE; i += 4) {
- mlog_write_ulint(descr + i, 0xFFFFFFFF, MLOG_4BYTES, mtr);
+ mlog_write_ulint(descr + i, 0xFFFFFFFFUL, MLOG_4BYTES, mtr);
}
xdes_set_state(descr, XDES_FREE, mtr);
@@ -630,8 +656,8 @@ xdes_get_descriptor_with_space_hdr(
page_t* descr_page;
ut_ad(mtr);
- ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space), MTR_MEMO_X_LOCK));
-
+ ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space),
+ MTR_MEMO_X_LOCK));
/* Read free limit and space size */
limit = mtr_read_ulint(sp_header + FSP_FREE_LIMIT, MLOG_4BYTES, mtr);
size = mtr_read_ulint(sp_header + FSP_SIZE, MLOG_4BYTES, mtr);
@@ -646,7 +672,7 @@ xdes_get_descriptor_with_space_hdr(
/* If offset is == limit, fill free list of the space. */
if (offset == limit) {
- fsp_fill_free_list(space, sp_header, mtr);
+ fsp_fill_free_list(FALSE, space, sp_header, mtr);
}
descr_page_no = xdes_calc_descriptor_page(offset);
@@ -714,8 +740,8 @@ xdes_lst_get_descriptor(
xdes_t* descr;
ut_ad(mtr);
- ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space), MTR_MEMO_X_LOCK));
-
+ ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space),
+ MTR_MEMO_X_LOCK));
descr = fut_get_ptr(space, lst_node, RW_X_LATCH, mtr) - XDES_FLST_NODE;
return(descr);
@@ -825,8 +851,21 @@ fsp_init(void)
}
/**************************************************************************
+Writes the space id to a tablespace header without using an mtr. fil0fil.c uses
+this past the buffer pool when it creates a new single-table tablespace. */
+
+void
+fsp_header_write_space_id(
+/*======================*/
+ page_t* page, /* in: first page in the space */
+ ulint space_id) /* in: space id to store in the FSP_SPACE_ID field */
+{
+ mach_write_to_4(page + FSP_HEADER_OFFSET + FSP_SPACE_ID, space_id);
+}
+
+/**************************************************************************
Initializes the space header of a new created space and creates also the
-insert buffer tree root. */
+insert buffer tree root if space == 0. */
void
fsp_header_init(
@@ -843,9 +882,6 @@ fsp_header_init(
mtr_x_lock(fil_space_get_latch(space), mtr);
page = buf_page_create(space, 0, mtr);
-#ifdef UNIV_SYNC_DEBUG
- buf_page_dbg_add_level(page, SYNC_FSP_PAGE);
-#endif /* UNIV_SYNC_DEBUG */
buf_page_get(space, 0, RW_X_LATCH, mtr);
#ifdef UNIV_SYNC_DEBUG
buf_page_dbg_add_level(page, SYNC_FSP_PAGE);
@@ -857,6 +893,8 @@ fsp_header_init(
header = FSP_HEADER_OFFSET + page;
+ mlog_write_ulint(header + FSP_SPACE_ID, space, MLOG_4BYTES, mtr);
+
mlog_write_ulint(header + FSP_SIZE, size, MLOG_4BYTES, mtr);
mlog_write_ulint(header + FSP_FREE_LIMIT, 0, MLOG_4BYTES, mtr);
mlog_write_ulint(header + FSP_LOWEST_NO_WRITE, 0, MLOG_4BYTES, mtr);
@@ -869,10 +907,40 @@ fsp_header_init(
flst_init(header + FSP_SEG_INODES_FREE, mtr);
mlog_write_dulint(header + FSP_SEG_ID, ut_dulint_create(0, 1), mtr);
- fsp_fill_free_list(space, header, mtr);
-
- btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, space,
+ if (space == 0) {
+ fsp_fill_free_list(FALSE, space, header, mtr);
+ btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, space,
ut_dulint_add(DICT_IBUF_ID_MIN, space), mtr);
+ } else {
+ fsp_fill_free_list(TRUE, space, header, mtr);
+ }
+}
+
+/**************************************************************************
+Reads the space id from the first page of a tablespace and cross-checks it. */
+
+ulint
+fsp_header_get_space_id(
+/*====================*/
+ /* out: space id, ULINT_UNDEFINED if the two stored ids differ */
+ page_t* page) /* in: first page of a tablespace */
+{
+ ulint fsp_id;
+ ulint id;
+
+ fsp_id = mach_read_from_4(FSP_HEADER_OFFSET + page + FSP_SPACE_ID);
+
+ id = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ if (id != fsp_id) {
+ fprintf(stderr,
+"InnoDB: Error: space id in fsp header %lu, but in the page header %lu\n",
+ (ulong) fsp_id,
+ (ulong) id);
+ return(ULINT_UNDEFINED);
+ }
+
+ return(id);
}
/**************************************************************************
@@ -896,7 +964,8 @@ fsp_header_inc_size(
size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
- mlog_write_ulint(header + FSP_SIZE, size + size_inc, MLOG_4BYTES, mtr);
+ mlog_write_ulint(header + FSP_SIZE, size + size_inc, MLOG_4BYTES,
+ mtr);
}
/**************************************************************************
@@ -909,7 +978,7 @@ ulint
fsp_header_get_free_limit(
/*======================*/
/* out: free limit in megabytes */
- ulint space) /* in: space id */
+ ulint space) /* in: space id, must be 0 */
{
fsp_header_t* header;
ulint limit;
@@ -943,7 +1012,7 @@ ulint
fsp_header_get_tablespace_size(
/*===========================*/
/* out: size in pages */
- ulint space) /* in: space id */
+ ulint space) /* in: space id, must be 0 */
{
fsp_header_t* header;
ulint size;
@@ -965,40 +1034,80 @@ fsp_header_get_tablespace_size(
}
/***************************************************************************
-Tries to extend the last data file file if it is defined as auto-extending. */
+Tries to extend a single-table tablespace so that a page would fit in the
+data file. */
+static
+ibool
+fsp_try_extend_data_file_with_pages(
+/*================================*/
+ /* out: TRUE if the file extension succeeded */
+ ulint space, /* in: space id, must be != 0 */
+ ulint page_no, /* in: page number, must be >= the current FSP_SIZE */
+ fsp_header_t* header, /* in: space header */
+ mtr_t* mtr) /* in: mtr */
+{
+ ibool success;
+ ulint actual_size;
+ ulint size;
+
+ ut_a(space != 0);
+
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+ ut_a(page_no >= size);
+
+ success = fil_extend_space_to_desired_size(&actual_size, space,
+ page_no + 1);
+ /* actual_size now has the space size in pages; it may be less than we
+ wanted if we ran out of disk space. It is stored even if !success */
+
+ mlog_write_ulint(header + FSP_SIZE, actual_size, MLOG_4BYTES, mtr);
+
+ return(success);
+}
+
+/***************************************************************************
+Tries to extend the last data file of a tablespace if it is auto-extending. */
static
ibool
-fsp_try_extend_last_file(
+fsp_try_extend_data_file(
/*=====================*/
/* out: FALSE if not auto-extending */
- ulint* actual_increase,/* out: actual increase in pages */
+ ulint* actual_increase,/* out: actual increase in pages, where
+ we measure the tablespace size from
+ what the header field says; it may be
+ the actual file size rounded down to
+ megabyte */
ulint space, /* in: space */
fsp_header_t* header, /* in: space header */
mtr_t* mtr) /* in: mtr */
{
ulint size;
+ ulint new_size;
+ ulint old_size;
ulint size_increase;
+ ulint actual_size;
ibool success;
- ut_a(space == 0);
-
*actual_increase = 0;
- if (!srv_auto_extend_last_data_file) {
+ if (space == 0 && !srv_auto_extend_last_data_file) {
return(FALSE);
}
size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
- if (srv_last_file_size_max != 0) {
+ old_size = size;
+
+ if (space == 0 && srv_last_file_size_max != 0) {
if (srv_last_file_size_max
< srv_data_file_sizes[srv_n_data_files - 1]) {
fprintf(stderr,
"InnoDB: Error: Last data file size is %lu, max size allowed %lu\n",
- srv_data_file_sizes[srv_n_data_files - 1],
- srv_last_file_size_max);
+ (ulong) srv_data_file_sizes[srv_n_data_files - 1],
+ (ulong) srv_last_file_size_max);
}
size_increase = srv_last_file_size_max
@@ -1007,24 +1116,58 @@ fsp_try_extend_last_file(
size_increase = SRV_AUTO_EXTEND_INCREMENT;
}
} else {
- size_increase = SRV_AUTO_EXTEND_INCREMENT;
+ if (space == 0) {
+ size_increase = SRV_AUTO_EXTEND_INCREMENT;
+ } else {
+ /* We extend single-table tablespaces first one extent
+ at a time, but for bigger tablespaces more. It is not
+ enough to extend always by one extent, because some
+ extents are frag page extents. */
+
+ if (size < FSP_EXTENT_SIZE) {
+ /* Let us first extend the file to 64 pages */
+ success = fsp_try_extend_data_file_with_pages(
+ space, FSP_EXTENT_SIZE - 1,
+ header, mtr);
+ if (!success) {
+ new_size = mtr_read_ulint(
+ header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+ *actual_increase = new_size - old_size;
+
+ return(FALSE);
+ }
+
+ size = FSP_EXTENT_SIZE;
+ }
+
+ if (size < 32 * FSP_EXTENT_SIZE) {
+ size_increase = FSP_EXTENT_SIZE;
+ } else {
+ /* Below in fsp_fill_free_list() we assume
+ that we add at most FSP_FREE_ADD extents at
+ a time */
+ size_increase = FSP_FREE_ADD * FSP_EXTENT_SIZE;
+ }
+ }
}
if (size_increase == 0) {
+
return(TRUE);
}
- /* Extend the data file. If we are not able to extend
- the full requested length, the function tells us
- the number of full megabytes (but the unit is pages!)
- we were able to extend. */
-
- success = fil_extend_last_data_file(actual_increase, size_increase);
+ success = fil_extend_space_to_desired_size(&actual_size, space,
+ size + size_increase);
+ /* We ignore any fragments of a full megabyte when storing the size
+ to the space header */
- if (success) {
- mlog_write_ulint(header + FSP_SIZE, size + *actual_increase,
+ mlog_write_ulint(header + FSP_SIZE,
+ ut_calc_align_down(actual_size, (1024 * 1024) / UNIV_PAGE_SIZE),
MLOG_4BYTES, mtr);
- }
+ new_size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+ *actual_increase = new_size - old_size;
return(TRUE);
}
@@ -1037,9 +1180,14 @@ static
void
fsp_fill_free_list(
/*===============*/
- ulint space, /* in: space */
- fsp_header_t* header, /* in: space header */
- mtr_t* mtr) /* in: mtr */
+ ibool init_space, /* in: TRUE if this is a single-table
+ tablespace and we are only initing
+ the tablespace's first extent
+ descriptor page and ibuf bitmap page;
+ then we do not allocate more extents */
+ ulint space, /* in: space */
+ fsp_header_t* header, /* in: space header */
+ mtr_t* mtr) /* in: mtr */
{
ulint limit;
ulint size;
@@ -1058,27 +1206,37 @@ fsp_fill_free_list(
size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, mtr);
- if (srv_auto_extend_last_data_file
+ if (space == 0 && srv_auto_extend_last_data_file
&& size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) {
/* Try to increase the last data file size */
- fsp_try_extend_last_file(&actual_increase, space, header,
- mtr);
+ fsp_try_extend_data_file(&actual_increase, space, header, mtr);
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+ }
+
+ if (space != 0 && !init_space
+ && size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) {
+
+ /* Try to increase the .ibd file size */
+ fsp_try_extend_data_file(&actual_increase, space, header, mtr);
size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
}
i = limit;
- while ((i + FSP_EXTENT_SIZE <= size) && (count < FSP_FREE_ADD)) {
+ while ((init_space && i < 1)
+ || ((i + FSP_EXTENT_SIZE <= size) && (count < FSP_FREE_ADD))) {
mlog_write_ulint(header + FSP_FREE_LIMIT, i + FSP_EXTENT_SIZE,
MLOG_4BYTES, mtr);
/* Update the free limit info in the log system and make
a checkpoint */
- log_fsp_current_free_limit_set_and_checkpoint(
+ if (space == 0) {
+ log_fsp_current_free_limit_set_and_checkpoint(
(i + FSP_EXTENT_SIZE)
/ ((1024 * 1024) / UNIV_PAGE_SIZE));
+ }
if (0 == i % XDES_DESCRIBED_PER_PAGE) {
@@ -1088,10 +1246,6 @@ fsp_fill_free_list(
if (i > 0) {
descr_page = buf_page_create(space, i, mtr);
-#ifdef UNIV_SYNC_DEBUG
- buf_page_dbg_add_level(descr_page,
- SYNC_FSP_PAGE);
-#endif /* UNIV_SYNC_DEBUG */
buf_page_get(space, i, RW_X_LATCH, mtr);
#ifdef UNIV_SYNC_DEBUG
buf_page_dbg_add_level(descr_page,
@@ -1100,7 +1254,7 @@ fsp_fill_free_list(
fsp_init_file_page(descr_page, mtr);
}
- /* Initialize the ibuf page in a separate
+ /* Initialize the ibuf bitmap page in a separate
mini-transaction because it is low in the latching
order, and we must be able to release its latch
before returning from the fsp routine */
@@ -1109,9 +1263,6 @@ fsp_fill_free_list(
ibuf_page = buf_page_create(space,
i + FSP_IBUF_BITMAP_OFFSET, &ibuf_mtr);
-#ifdef UNIV_SYNC_DEBUG
- buf_page_dbg_add_level(ibuf_page, SYNC_IBUF_BITMAP);
-#endif /* UNIV_SYNC_DEBUG */
buf_page_get(space, i + FSP_IBUF_BITMAP_OFFSET,
RW_X_LATCH, &ibuf_mtr);
#ifdef UNIV_SYNC_DEBUG
@@ -1188,7 +1339,7 @@ fsp_alloc_free_extent(
first = flst_get_first(header + FSP_FREE, mtr);
if (fil_addr_is_null(first)) {
- fsp_fill_free_list(space, header, mtr);
+ fsp_fill_free_list(FALSE, space, header, mtr);
first = flst_get_first(header + FSP_FREE, mtr);
}
@@ -1225,6 +1376,8 @@ fsp_alloc_free_page(
ulint free;
ulint frag_n_used;
ulint page_no;
+ ulint space_size;
+ ibool success;
ut_ad(mtr);
@@ -1278,6 +1431,30 @@ fsp_alloc_free_page(
ut_error;
}
+ page_no = xdes_get_offset(descr) + free;
+
+ space_size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+ if (space_size <= page_no) {
+ /* It must be that we are extending a single-table tablespace
+ whose size is still < 64 pages */
+
+ ut_a(space != 0);
+ if (page_no >= FSP_EXTENT_SIZE) {
+ fprintf(stderr,
+"InnoDB: Error: trying to extend a single-table tablespace %lu\n"
+"InnoDB: by single page(s) though the space size %lu. Page no %lu.\n",
+ (ulong) space, (ulong) space_size, (ulong) page_no);
+ return(FIL_NULL);
+ }
+ success = fsp_try_extend_data_file_with_pages(space, page_no,
+ header, mtr);
+ if (!success) {
+ /* No disk space left */
+ return(FIL_NULL);
+ }
+ }
+
xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr);
/* Update the FRAG_N_USED field */
@@ -1299,8 +1476,6 @@ fsp_alloc_free_page(
mtr);
}
- page_no = xdes_get_offset(descr) + free;
-
/* Initialize the allocated page to the buffer pool, so that it can
be obtained immediately with buf_page_get without need for a disk
read. */
@@ -1347,7 +1522,8 @@ fsp_free_page(
if (state != XDES_FREE_FRAG && state != XDES_FULL_FRAG) {
fprintf(stderr,
"InnoDB: Error: File space extent descriptor of page %lu has state %lu\n",
- page, state);
+ (ulong) page,
+ (ulong) state);
ut_sprintf_buf(buf, ((byte*)descr) - 50, 200);
fprintf(stderr, "InnoDB: Dump of descriptor: %s\n", buf);
@@ -1366,7 +1542,7 @@ fsp_free_page(
== TRUE) {
fprintf(stderr,
"InnoDB: Error: File space extent descriptor of page %lu says it is free\n",
- page);
+ (ulong) page);
ut_sprintf_buf(buf, ((byte*)descr) - 50, 200);
fprintf(stderr, "InnoDB: Dump of descriptor: %s\n", buf);
@@ -1602,8 +1778,8 @@ fsp_alloc_seg_inode(
inode = fsp_seg_inode_page_get_nth_inode(page, n, mtr);
- if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(page, n + 1, mtr)) {
-
+ if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(page, n + 1,
+ mtr)) {
/* There are no other unused headers left on the page: move it
to another list */
@@ -1657,7 +1833,7 @@ fsp_free_seg_inode(
flst_remove(space_header + FSP_SEG_INODES_FREE,
page + FSEG_INODE_PAGE_NODE, mtr);
- fsp_free_page(space, buf_frame_get_page_no(page), mtr);
+ fsp_free_page(space, buf_frame_get_page_no(page), mtr);
}
}
@@ -1821,12 +1997,12 @@ fseg_create_general(
will belong to the created segment */
ulint byte_offset, /* in: byte offset of the created segment header
on the page */
- ibool has_done_reservation, /* in: TRUE if the caller has
- already done the reservation for the pages
- with fsp_reserve_free_extents (at least 2 extents:
- one for the inode and, then there other for the
- segment) is no need to do the check for this
- individual operation */
+ ibool has_done_reservation, /* in: TRUE if the caller has already
+ done the reservation for the pages with
+ fsp_reserve_free_extents (at least 2 extents: one for
+ the inode and the other for the segment) then there is
+ no need to do the check for this individual
+ operation */
mtr_t* mtr) /* in: mtr */
{
fsp_header_t* space_header;
@@ -1835,6 +2011,7 @@ fseg_create_general(
fseg_header_t* header = 0; /* remove warning */
rw_lock_t* latch;
ibool success;
+ ulint n_reserved;
page_t* ret = NULL;
ulint i;
@@ -1858,12 +2035,14 @@ fseg_create_general(
/* This thread did not own the latch before this call: free
excess pages from the insert buffer free list */
- ibuf_free_excess_pages(space);
+ if (space == 0) {
+ ibuf_free_excess_pages(space);
+ }
}
if (!has_done_reservation) {
- success = fsp_reserve_free_extents(space, 2, FSP_NORMAL, mtr);
-
+ success = fsp_reserve_free_extents(&n_reserved, space, 2,
+ FSP_NORMAL, mtr);
if (!success) {
return(NULL);
}
@@ -1926,7 +2105,7 @@ fseg_create_general(
funct_exit:
if (!has_done_reservation) {
- fil_space_release_free_extents(space, 2);
+ fil_space_release_free_extents(space, n_reserved);
}
return(ret);
@@ -2144,6 +2323,8 @@ fseg_alloc_free_page_low(
FSP_UP, FSP_NO_DIR */
mtr_t* mtr) /* in: mtr handle */
{
+ fsp_header_t* space_header;
+ ulint space_size;
dulint seg_id;
ulint used;
ulint reserved;
@@ -2154,6 +2335,7 @@ fseg_alloc_free_page_low(
xdes_t* ret_descr; /* the extent of the allocated page */
page_t* page;
ibool frag_page_allocated = FALSE;
+ ibool success;
ulint n;
ut_ad(mtr);
@@ -2166,8 +2348,10 @@ fseg_alloc_free_page_low(
reserved = fseg_n_reserved_pages_low(seg_inode, &used, mtr);
- descr = xdes_get_descriptor(space, hint, mtr);
+ space_header = fsp_get_space_header(space, mtr);
+ descr = xdes_get_descriptor_with_space_hdr(space_header, space,
+ hint, mtr);
if (descr == NULL) {
/* Hint outside space or too high above free limit: reset
hint */
@@ -2297,8 +2481,32 @@ fseg_alloc_free_page_low(
return(FIL_NULL);
}
- if (!frag_page_allocated) {
+ if (space != 0) {
+ space_size = fil_space_get_size(space);
+
+ if (space_size <= ret_page) {
+ /* It must be that we are extending a single-table
+ tablespace whose size is still < 64 pages */
+
+ if (ret_page >= FSP_EXTENT_SIZE) {
+ fprintf(stderr,
+"InnoDB: Error (2): trying to extend a single-table tablespace %lu\n"
+"InnoDB: by single page(s) though the space size %lu. Page no %lu.\n",
+ (ulong) space, (ulong) space_size,
+ (ulong) ret_page);
+ return(FIL_NULL);
+ }
+
+ success = fsp_try_extend_data_file_with_pages(space,
+ ret_page, space_header, mtr);
+ if (!success) {
+ /* No disk space left */
+ return(FIL_NULL);
+ }
+ }
+ }
+ if (!frag_page_allocated) {
/* Initialize the allocated page to buffer pool, so that it
can be obtained immediately with buf_page_get without need
for a disk read */
@@ -2359,6 +2567,7 @@ fseg_alloc_free_page_general(
rw_lock_t* latch;
ibool success;
ulint page_no;
+ ulint n_reserved;
space = buf_frame_get_space_id(seg_header);
@@ -2375,14 +2584,16 @@ fseg_alloc_free_page_general(
/* This thread did not own the latch before this call: free
excess pages from the insert buffer free list */
- ibuf_free_excess_pages(space);
+ if (space == 0) {
+ ibuf_free_excess_pages(space);
+ }
}
inode = fseg_inode_get(seg_header, mtr);
if (!has_done_reservation) {
- success = fsp_reserve_free_extents(space, 2, FSP_NORMAL, mtr);
-
+ success = fsp_reserve_free_extents(&n_reserved, space, 2,
+ FSP_NORMAL, mtr);
if (!success) {
return(FIL_NULL);
}
@@ -2391,7 +2602,7 @@ fseg_alloc_free_page_general(
page_no = fseg_alloc_free_page_low(buf_frame_get_space_id(inode),
inode, hint, direction, mtr);
if (!has_done_reservation) {
- fil_space_release_free_extents(space, 2);
+ fil_space_release_free_extents(space, n_reserved);
}
return(page_no);
@@ -2421,6 +2632,46 @@ fseg_alloc_free_page(
}
/**************************************************************************
+Checks that we have at least 2 frag pages free in the first extent of a
+single-table tablespace, and they are also physically initialized to the data
+file. That is, we have already extended the data file so that those pages are
+inside the data file. If not, this function extends the tablespace with
+pages. */
+static
+ibool
+fsp_reserve_free_pages(
+/*===================*/
+ /* out: TRUE if size >= n_used + 2 held
+ already, or we were able to extend */
+ ulint space, /* in: space id, must be != 0 */
+ fsp_header_t* space_header, /* in: header of that space,
+ x-latched */
+ ulint size, /* in: size of the tablespace in pages,
+ must be < FSP_EXTENT_SIZE / 2 */
+ mtr_t* mtr) /* in: mtr */
+{
+ xdes_t* descr;
+ ulint n_used;
+
+ ut_a(space != 0);
+ ut_a(size < FSP_EXTENT_SIZE / 2);
+
+ descr = xdes_get_descriptor_with_space_hdr(space_header, space, 0,
+ mtr);
+ n_used = xdes_get_n_used(descr, mtr);
+
+ ut_a(n_used <= size);
+
+ if (size >= n_used + 2) {
+
+ return(TRUE);
+ }
+
+ return(fsp_try_extend_data_file_with_pages(space, n_used + 1,
+ space_header, mtr));
+}
+
+/**************************************************************************
Reserves free pages from a tablespace. All mini-transactions which may
use several pages from the tablespace should call this function beforehand
and reserve enough free extents so that they certainly will be able
@@ -2438,12 +2689,21 @@ two types of allocation: when space is scarce, FSP_NORMAL allocations
will not succeed, but the latter two allocations will succeed, if possible.
The purpose is to avoid dead end where the database is full but the
user cannot free any space because these freeing operations temporarily
-reserve some space. */
+reserve some space.
+
+Single-table tablespaces whose size is < 32 pages are a special case. In this
+function we would liberally reserve several 64 page extents for every page
+split or merge in a B-tree. But we do not want to waste disk space if the table
+only occupies < 32 pages. That is why we apply different rules in that special
+case, just ensuring that there are 3 free pages available. */
ibool
fsp_reserve_free_extents(
/*=====================*/
/* out: TRUE if we were able to make the reservation */
+ ulint* n_reserved,/* out: number of extents actually reserved; if we
+ return TRUE and the tablespace size is < 64 pages,
+ then this can be 0, otherwise it is n_ext */
ulint space, /* in: space id */
ulint n_ext, /* in: number of extents to reserve */
ulint alloc_type,/* in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
@@ -2466,6 +2726,8 @@ fsp_reserve_free_extents(
|| mtr_memo_contains(mtr, fil_space_get_latch(space),
MTR_MEMO_X_LOCK));
#endif /* UNIV_SYNC_DEBUG */
+ *n_reserved = n_ext;
+
latch = fil_space_get_latch(space);
mtr_x_lock(latch, mtr);
@@ -2474,6 +2736,12 @@ fsp_reserve_free_extents(
try_again:
size = mtr_read_ulint(space_header + FSP_SIZE, MLOG_4BYTES, mtr);
+ if (size < FSP_EXTENT_SIZE / 2) {
+ /* Use different rules for small single-table tablespaces */
+ *n_reserved = 0;
+ return(fsp_reserve_free_pages(space, space_header, size, mtr));
+ }
+
n_free_list_ext = flst_get_len(space_header + FSP_FREE, mtr);
free_limit = mtr_read_ulint(space_header + FSP_FREE_LIMIT,
@@ -2523,7 +2791,7 @@ try_again:
return(TRUE);
}
try_to_extend:
- success = fsp_try_extend_last_file(&n_pages_added, space,
+ success = fsp_try_extend_data_file(&n_pages_added, space,
space_header, mtr);
if (success && n_pages_added > 0) {
@@ -2574,6 +2842,13 @@ fsp_get_available_space_in_free_extents(
MLOG_4BYTES, &mtr);
mtr_commit(&mtr);
+ if (size < FSP_EXTENT_SIZE) {
+ ut_a(space != 0); /* This must be a single-table
+ tablespace */
+ return(0); /* TODO: count free frag pages and return
+ a value based on that */
+ }
+
/* Below we play safe when counting free extents above the free limit:
some of them will contain extent descriptor pages, and therefore
will not be free extents */
@@ -2671,14 +2946,10 @@ fseg_free_page_low(
xdes_t* descr;
ulint not_full_n_used;
ulint state;
+ dulint descr_id;
+ dulint seg_id;
ulint i;
- char errbuf[200];
-
-#ifdef __WIN__
- dulint desm;
- dulint segm;
-#endif
-
+ char errbuf[200];
ut_ad(seg_inode && mtr);
ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) ==
@@ -2703,7 +2974,7 @@ fseg_free_page_low(
"InnoDB: though it is already marked as free in the tablespace!\n"
"InnoDB: The tablespace free space info is corrupt.\n"
"InnoDB: You may need to dump your InnoDB tables and recreate the whole\n"
-"InnoDB: database!\n", page);
+"InnoDB: database!\n", (ulong) page);
fprintf(stderr,
"InnoDB: If the InnoDB recovery crashes here, see section 6.1\n"
@@ -2731,26 +3002,22 @@ fseg_free_page_low(
return;
}
+ /* If we get here, the page is in some extent of the segment */
+
+ descr_id = mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr);
+ seg_id = mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr);
/*
fprintf(stderr,
"InnoDB: InnoDB is freeing space %lu page %lu,\n"
"InnoDB: which belongs to descr seg %lu %lu\n"
"InnoDB: segment %lu %lu.\n",
space, page,
- ut_dulint_get_high(
- mtr_read_dulint(descr + XDES_ID, mtr)),
- ut_dulint_get_low(
- mtr_read_dulint(descr + XDES_ID, mtr)),
- ut_dulint_get_high(
- mtr_read_dulint(seg_inode + FSEG_ID, mtr)),
- ut_dulint_get_low(
- mtr_read_dulint(seg_inode + FSEG_ID, mtr)));
+ ut_dulint_get_high(descr_id),
+ ut_dulint_get_low(descr_id),
+ ut_dulint_get_high(seg_id),
+ ut_dulint_get_low(seg_id));
*/
- /* If we get here, the page is in some extent of the segment */
- if (0 != ut_dulint_cmp(
- mtr_read_dulint(descr + XDES_ID, mtr),
- mtr_read_dulint(seg_inode + FSEG_ID, mtr))) {
-
+ if (0 != ut_dulint_cmp(descr_id, seg_id)) {
ut_sprintf_buf(errbuf, descr, 40);
fprintf(stderr,
"InnoDB: Dump of the tablespace extent descriptor: %s\n", errbuf);
@@ -2758,42 +3025,15 @@ fseg_free_page_low(
fprintf(stderr,
"InnoDB: Dump of the segment inode: %s\n", errbuf);
-
-#ifndef __WIN__
-
- fprintf(stderr,
-"InnoDB: Serious error: InnoDB is trying to free space %lu page %lu,\n"
-"InnoDB: which does not belong to segment %lu %lu but belongs\n"
-"InnoDB: to segment %lu %lu.\n",
- space, page,
- ut_dulint_get_high(
- mtr_read_dulint(descr + XDES_ID, mtr)),
- ut_dulint_get_low(
- mtr_read_dulint(descr + XDES_ID, mtr)),
- ut_dulint_get_high(
- mtr_read_dulint(seg_inode + FSEG_ID, mtr)),
- ut_dulint_get_low(
- mtr_read_dulint(seg_inode + FSEG_ID, mtr)));
-
-#else
-
-/* More pedantic usage to avoid VC++ 6.0 compiler errors due to inline
- function expansion issues */
-
- desm = mtr_read_dulint(descr + XDES_ID, mtr);
- segm = mtr_read_dulint(seg_inode + FSEG_ID, mtr);
-
- fprintf(stderr,
+ fprintf(stderr,
"InnoDB: Serious error: InnoDB is trying to free space %lu page %lu,\n"
"InnoDB: which does not belong to segment %lu %lu but belongs\n"
"InnoDB: to segment %lu %lu.\n",
- space, page,
- ut_dulint_get_high(desm),
- ut_dulint_get_low(desm),
- ut_dulint_get_high(segm),
- ut_dulint_get_low(segm));
-
-#endif
+ (ulong) space, (ulong) page,
+ (ulong) ut_dulint_get_high(descr_id),
+ (ulong) ut_dulint_get_low(descr_id),
+ (ulong) ut_dulint_get_high(seg_id),
+ (ulong) ut_dulint_get_low(seg_id));
fprintf(stderr,
"InnoDB: If the InnoDB recovery crashes here, see section 6.1\n"
@@ -3313,11 +3553,13 @@ fseg_print_low(
printf(
"SEGMENT id %lu %lu space %lu; page %lu; res %lu used %lu; full ext %lu\n",
- seg_id_high, seg_id_low, space, page_no, reserved, used,
- n_full);
+ (ulong) seg_id_high, (ulong) seg_id_low, (ulong) space,
+ (ulong) page_no, (ulong) reserved, (ulong) used,
+ (ulong) n_full);
printf(
"fragm pages %lu; free extents %lu; not full extents %lu: pages %lu\n",
- n_frag, n_free, n_not_full, n_used);
+ (ulong) n_frag, (ulong) n_free, (ulong) n_not_full,
+ (ulong) n_used);
}
/***********************************************************************
@@ -3388,7 +3630,7 @@ fsp_validate(
n_full_frag_pages = FSP_EXTENT_SIZE *
flst_get_len(header + FSP_FULL_FRAG, &mtr);
- ut_a(free_limit <= size);
+ ut_a(free_limit <= size || (space != 0 && size < FSP_EXTENT_SIZE));
flst_validate(header + FSP_FREE, &mtr);
flst_validate(header + FSP_FREE_FRAG, &mtr);
@@ -3620,15 +3862,16 @@ fsp_print(
seg_id_low = ut_dulint_get_low(d_var);
seg_id_high = ut_dulint_get_high(d_var);
- printf("FILE SPACE INFO: id %lu\n", space);
+ printf("FILE SPACE INFO: id %lu\n", (ulong) space);
printf("size %lu, free limit %lu, free extents %lu\n",
- size, free_limit, n_free);
+ (ulong) size, (ulong) free_limit, (ulong) n_free);
printf(
"not full frag extents %lu: used pages %lu, full frag extents %lu\n",
- n_free_frag, frag_n_used, n_full_frag);
+ (ulong) n_free_frag, (ulong) frag_n_used, (ulong) n_full_frag);
- printf("first seg id not used %lu %lu\n", seg_id_high, seg_id_low);
+ printf("first seg id not used %lu %lu\n", (ulong) seg_id_high,
+ (ulong) seg_id_low);
mtr_commit(&mtr);
@@ -3707,5 +3950,5 @@ fsp_print(
mtr_commit(&mtr2);
- printf("NUMBER of file segments: %lu\n", n_segs);
+ printf("NUMBER of file segments: %lu\n", (ulong) n_segs);
}
diff --git a/innobase/fut/fut0lst.c b/innobase/fut/fut0lst.c
index 4328fc97b33..79830c36eb5 100644
--- a/innobase/fut/fut0lst.c
+++ b/innobase/fut/fut0lst.c
@@ -511,6 +511,7 @@ flst_print(
printf("FILE-BASED LIST:\n");
printf("Base node in space %lu page %lu byte offset %lu; len %lu\n",
- buf_frame_get_space_id(frame), buf_frame_get_page_no(frame),
- (ulint) (base - frame), len);
+ (ulong) buf_frame_get_space_id(frame),
+ (ulong) buf_frame_get_page_no(frame),
+ (ulong) (base - frame), (ulong) len);
}
diff --git a/innobase/ha/ha0ha.c b/innobase/ha/ha0ha.c
index ad833312963..5e807406ce0 100644
--- a/innobase/ha/ha0ha.c
+++ b/innobase/ha/ha0ha.c
@@ -34,6 +34,12 @@ ha_create(
table = hash_create(n);
+ if (in_btr_search) {
+ table->adaptive = TRUE;
+ } else {
+ table->adaptive = FALSE;
+ }
+
if (n_mutexes == 0) {
if (in_btr_search) {
table->heap = mem_heap_create_in_btr_search(4096);
@@ -79,6 +85,7 @@ ha_insert_for_fold(
hash_cell_t* cell;
ha_node_t* node;
ha_node_t* prev_node;
+ buf_block_t* prev_block;
ulint hash;
ut_ad(table && data);
@@ -93,6 +100,12 @@ ha_insert_for_fold(
while (prev_node != NULL) {
if (prev_node->fold == fold) {
+ if (table->adaptive) {
+ prev_block = buf_block_align(prev_node->data);
+ ut_a(prev_block->n_pointers > 0);
+ prev_block->n_pointers--;
+ buf_block_align(data)->n_pointers++;
+ }
prev_node->data = data;
@@ -116,6 +129,11 @@ ha_insert_for_fold(
}
ha_node_set_data(node, data);
+
+ if (table->adaptive) {
+ buf_block_align(data)->n_pointers++;
+ }
+
node->fold = fold;
node->next = NULL;
@@ -148,6 +166,11 @@ ha_delete_hash_node(
hash_table_t* table, /* in: hash table */
ha_node_t* del_node) /* in: node to be deleted */
{
+ if (table->adaptive) {
+ ut_a(buf_block_align(del_node->data)->n_pointers > 0);
+ buf_block_align(del_node->data)->n_pointers--;
+ }
+
HASH_DELETE_AND_COMPACT(ha_node_t, next, table, del_node);
}
@@ -174,6 +197,35 @@ ha_delete(
ha_delete_hash_node(table, node);
}
+/*************************************************************
+Looks for an element when we know the pointer to the data, and updates
+the pointer to data, if found. */
+
+void
+ha_search_and_update_if_found(
+/*==========================*/
+ hash_table_t* table, /* in: hash table */
+ ulint fold, /* in: folded value of the searched data */
+ void* data, /* in: pointer to the data */
+ void* new_data)/* in: new pointer to the data */
+{
+ ha_node_t* node;
+
+ ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold)));
+
+ node = ha_search_with_data(table, fold, data);
+
+ if (node) {
+ if (table->adaptive) {
+ ut_a(buf_block_align(node->data)->n_pointers > 0);
+ buf_block_align(node->data)->n_pointers--;
+ buf_block_align(new_data)->n_pointers++;
+ }
+
+ node->data = new_data;
+ }
+}
+
/*********************************************************************
Removes from the chain determined by fold all nodes whose data pointer
points to the page given. */
@@ -205,10 +257,10 @@ ha_remove_all_nodes_to_page(
node = ha_chain_get_first(table, fold);
} else {
- node = ha_chain_get_next(table, node);
+ node = ha_chain_get_next(node);
}
}
-
+#ifdef UNIV_DEBUG
/* Check that all nodes really got deleted */
node = ha_chain_get_first(table, fold);
@@ -216,8 +268,9 @@ ha_remove_all_nodes_to_page(
while (node) {
ut_a(buf_frame_align(ha_node_get_data(node)) != page);
- node = ha_chain_get_next(table, node);
+ node = ha_chain_get_next(node);
}
+#endif
}
/*****************************************************************
@@ -246,7 +299,7 @@ ha_validate(
fprintf(stderr,
"InnoDB: Error: hash table node fold value %lu does not\n"
"InnoDB: match with the cell number %lu.\n",
- node->fold, i);
+ (ulong) node->fold, (ulong) i);
ok = FALSE;
}
@@ -269,12 +322,10 @@ ha_print_info(
hash_table_t* table) /* in: hash table */
{
hash_cell_t* cell;
-/*
- ha_node_t* node;
- ulint len = 0;
- ulint max_len = 0;
+/* ha_node_t* node;
ulint nodes = 0;
-*/
+ ulint len = 0;
+ ulint max_len = 0; */
ulint cells = 0;
ulint n_bufs;
ulint i;
@@ -315,7 +366,8 @@ ha_print_info(
}
buf += sprintf(buf,
-"Hash table size %lu, used cells %lu", hash_get_n_cells(table), cells);
+"Hash table size %lu, used cells %lu", (ulong) hash_get_n_cells(table),
+ (ulong) cells);
if (table->heaps == NULL && table->heap != NULL) {
@@ -328,6 +380,6 @@ ha_print_info(
n_bufs++;
}
- buf += sprintf(buf, ", node heap has %lu buffer(s)\n", n_bufs);
+ buf += sprintf(buf, ", node heap has %lu buffer(s)\n", (ulong) n_bufs);
}
}
diff --git a/innobase/ha/hash0hash.c b/innobase/ha/hash0hash.c
index 808aa88da3d..372104e54b3 100644
--- a/innobase/ha/hash0hash.c
+++ b/innobase/ha/hash0hash.c
@@ -61,6 +61,7 @@ hash_create(
array = ut_malloc(sizeof(hash_cell_t) * prime);
+ table->adaptive = FALSE;
table->array = array;
table->n_cells = prime;
table->n_mutexes = 0;
diff --git a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c
index f2c631d88cd..42ca34e7f10 100644
--- a/innobase/ibuf/ibuf0ibuf.c
+++ b/innobase/ibuf/ibuf0ibuf.c
@@ -29,6 +29,35 @@ Created 7/19/1997 Heikki Tuuri
#include "log0recv.h"
#include "que0que.h"
+/* STRUCTURE OF AN INSERT BUFFER RECORD
+
+In versions < 4.1.x:
+
+1. The first field is the page number.
+2. The second field is an array which stores type info for each subsequent
+ field. We store the information which affects the ordering of records, and
+ also the physical storage size of an SQL NULL value. E.g., for CHAR(10) it
+ is 10 bytes.
+3. Next we have the fields of the actual index record.
+
+In versions >= 4.1.x:
+
+Note that contrary to what we planned in the 1990's, there will only be one
+insert buffer tree, and that is in the system tablespace of InnoDB.
+
+1. The first field is the space id.
+2. The second field is a one-byte marker which differentiates records from
+ the < 4.1.x storage format.
+3. The third field is the page number.
+4. The fourth field contains the type info, where we have also added 2 bytes to
+ store the charset. In the compressed table format of 5.0.x we must add more
+ information here so that we can build a dummy 'index' struct which 5.0.x
+ can use in the binary search on the index page in the ibuf merge phase.
+5. The rest of the fields contain the fields of the actual index record.
+
+*/
+
+
/* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM
If an OS thread performs any operation that brings in disk pages from
@@ -45,20 +74,20 @@ because they own x-latches to pages which are on a lower level than the
insert buffer tree latch, its page latches, and the tablespace latch an
insert buffer operation can reserve.
-The solution is the following: We put into each tablespace an insert buffer
-of its own. Let all the tree and page latches connected with the insert buffer
-be later in the latching order than the fsp latch and fsp page latches.
+The solution is the following: Let all the tree and page latches connected
+with the insert buffer be later in the latching order than the fsp latch and
+fsp page latches.
+
Insert buffer pages must be such that the insert buffer is never invoked
when these pages are accessed as this would result in a recursion violating
the latching order. We let a special i/o-handler thread take care of i/o to
the insert buffer pages and the ibuf bitmap pages, as well as the fsp bitmap
pages and the first inode page, which contains the inode of the ibuf tree: let
-us call all these ibuf pages. If the OS does not support asynchronous i/o,
-then there is no special i/o thread, but to prevent deadlocks, we do not let a
-read-ahead access both non-ibuf and ibuf pages.
+us call all these ibuf pages. To prevent deadlocks, we do not let a read-ahead
+access both non-ibuf and ibuf pages.
-Then an i/o-handler for the insert buffer never needs to access the insert
-buffer tree and thus obeys the latching order. On the other hand, other
+Then an i/o-handler for the insert buffer never needs to access recursively the
+insert buffer tree and thus obeys the latching order. On the other hand, other
i/o-handlers for other tablespaces may require access to the insert buffer,
but because all kinds of latches they need to access there are later in the
latching order, no violation of the latching order occurs in this case,
@@ -95,8 +124,8 @@ the B-tree non-leaf pages if it has latches on lower level pages. Read-ahead
is only allowed for level 1 and 2 pages. Dedicated i/o-handler threads handle
exclusively level 1 i/o. A dedicated i/o handler thread handles exclusively
level 2 i/o. However, if an OS thread does the i/o handling for itself, i.e.,
-it uses synchronous aio or the OS does not support aio, it can access any
-pages, as long as it obeys the access order rules. */
+it uses synchronous aio, it can access any pages, as long as it obeys the
+access order rules. */
/* Buffer pool size per the maximum insert buffer size */
#define IBUF_POOL_SIZE_PER_MAX_SIZE 2
@@ -109,8 +138,8 @@ ulint ibuf_rnd = 986058871;
ulint ibuf_flush_count = 0;
/* Dimensions for the ibuf_count array */
-#define IBUF_COUNT_N_SPACES 10
-#define IBUF_COUNT_N_PAGES 10000
+#define IBUF_COUNT_N_SPACES 500
+#define IBUF_COUNT_N_PAGES 2000
/* Buffered entry counts for file pages, used in debugging */
ulint* ibuf_counts[IBUF_COUNT_N_SPACES];
@@ -235,6 +264,8 @@ ibuf_header_page_get(
{
page_t* page;
+ ut_a(space == 0);
+
ut_ad(!ibuf_inside());
page = buf_page_get(space, FSP_IBUF_HEADER_PAGE_NO, RW_X_LATCH, mtr);
@@ -259,6 +290,7 @@ ibuf_tree_root_get(
{
page_t* page;
+ ut_a(space == 0);
ut_ad(ibuf_inside());
mtr_x_lock(dict_tree_get_lock((data->index)->tree), mtr);
@@ -271,7 +303,7 @@ ibuf_tree_root_get(
return(page);
}
-
+
/**********************************************************************
Gets the ibuf count for a given page. */
@@ -294,9 +326,9 @@ ibuf_count_get(
return(*(ibuf_counts[space] + page_no));
}
+#ifdef UNIV_IBUF_DEBUG
/**********************************************************************
Sets the ibuf count for a given page. */
-#ifdef UNIV_IBUF_DEBUG
static
void
ibuf_count_set(
@@ -305,17 +337,17 @@ ibuf_count_set(
ulint page_no,/* in: page number */
ulint val) /* in: value to set */
{
- ut_ad(space < IBUF_COUNT_N_SPACES);
- ut_ad(page_no < IBUF_COUNT_N_PAGES);
- ut_ad(val < UNIV_PAGE_SIZE);
+ ut_a(space < IBUF_COUNT_N_SPACES);
+ ut_a(page_no < IBUF_COUNT_N_PAGES);
+ ut_a(val < UNIV_PAGE_SIZE);
*(ibuf_counts[space] + page_no) = val;
}
#endif
/**********************************************************************
-Creates the insert buffer data structure at a database startup and
-initializes the data structures for the insert buffer of each tablespace. */
+Creates the insert buffer data structure at a database startup and initializes
+the data structures for the insert buffer. */
void
ibuf_init_at_db_start(void)
@@ -407,19 +439,19 @@ ibuf_data_sizes_update(
/* printf("ibuf size %lu, space ibuf size %lu\n", ibuf->size,
data->size); */
-}
+}
/**********************************************************************
Creates the insert buffer data struct for a single tablespace. Reads the
root page of the insert buffer tree in the tablespace. This function can
be called only after the dictionary system has been initialized, as this
-creates also the insert buffer table and index for this tablespace. */
+creates also the insert buffer table and index into this tablespace. */
ibuf_data_t*
ibuf_data_init_for_space(
/*=====================*/
/* out, own: ibuf data struct, linked to the list
- in ibuf control structure. */
+ in ibuf control structure */
ulint space) /* in: space id */
{
ibuf_data_t* data;
@@ -431,6 +463,8 @@ ibuf_data_init_for_space(
dict_index_t* index;
ulint n_used;
+ ut_a(space == 0);
+
#ifdef UNIV_LOG_DEBUG
if (space % 2 == 1) {
@@ -471,14 +505,22 @@ ibuf_data_init_for_space(
data->n_merged_recs = 0;
ibuf_data_sizes_update(data, root, &mtr);
-
+/*
+ if (!data->empty) {
+ fprintf(stderr,
+"InnoDB: index entries found in the insert buffer\n");
+ } else {
+ fprintf(stderr,
+"InnoDB: insert buffer empty\n");
+ }
+*/
mutex_exit(&ibuf_mutex);
mtr_commit(&mtr);
ibuf_exit();
- sprintf(buf, "SYS_IBUF_TABLE_%lu", space);
+ sprintf(buf, "SYS_IBUF_TABLE_%lu", (ulong) space);
table = dict_mem_table_create(buf, space, 2);
@@ -684,7 +726,7 @@ ibuf_bitmap_get_map_page(
mtr_t* mtr) /* in: mtr */
{
page_t* page;
-
+
page = buf_page_get(space, ibuf_bitmap_page_no_calc(page_no),
RW_X_LATCH, mtr);
#ifdef UNIV_SYNC_DEBUG
@@ -897,7 +939,7 @@ UNIV_INLINE
ibool
ibuf_fixed_addr_page(
/*=================*/
- /* out: TRUE if a fixed address ibuf i/o page */
+ /* out: TRUE if a fixed address ibuf i/o page */
ulint page_no)/* in: page number */
{
if ((ibuf_bitmap_page(page_no))
@@ -934,6 +976,12 @@ ibuf_page(
return(TRUE);
}
+ if (space != 0) {
+ /* Currently we only have an ibuf tree in space 0 */
+
+ return(FALSE);
+ }
+
ut_ad(fil_space_get_type(space) == FIL_TABLESPACE);
mtr_start(&mtr);
@@ -998,14 +1046,60 @@ ibuf_rec_get_page_no(
ut_ad(ibuf_inside());
ut_ad(rec_get_n_fields(rec) > 2);
- field = rec_get_nth_field(rec, 0, &len);
+ field = rec_get_nth_field(rec, 1, &len);
- ut_ad(len == 4);
+ if (len == 1) {
+ /* This is of the >= 4.1.x record format */
+ ut_a(trx_sys_multiple_tablespace_format);
+
+ field = rec_get_nth_field(rec, 2, &len);
+ } else {
+ ut_a(trx_doublewrite_must_reset_space_ids);
+ ut_a(!trx_sys_multiple_tablespace_format);
+
+ field = rec_get_nth_field(rec, 0, &len);
+ }
+
+ ut_a(len == 4);
return(mach_read_from_4(field));
}
/************************************************************************
+Returns the space id field of an ibuf record. For < 4.1.x format records
+returns 0. */
+static
+ulint
+ibuf_rec_get_space(
+/*===============*/
+ /* out: space id */
+ rec_t* rec) /* in: ibuf record */
+{
+ byte* field;
+ ulint len;
+
+ ut_ad(ibuf_inside());
+ ut_ad(rec_get_n_fields(rec) > 2);
+
+ field = rec_get_nth_field(rec, 1, &len);
+
+ if (len == 1) {
+ /* This is of the >= 4.1.x record format */
+
+ ut_a(trx_sys_multiple_tablespace_format);
+ field = rec_get_nth_field(rec, 0, &len);
+ ut_a(len == 4);
+
+ return(mach_read_from_4(field));
+ }
+
+ ut_a(trx_doublewrite_must_reset_space_ids);
+ ut_a(!trx_sys_multiple_tablespace_format);
+
+ return(0);
+}
+
+/************************************************************************
Returns the space taken by a stored non-clustered index entry if converted to
an index record. */
static
@@ -1017,6 +1111,7 @@ ibuf_rec_get_volume(
rec_t* ibuf_rec)/* in: ibuf record */
{
dtype_t dtype;
+ ibool new_format = FALSE;
ulint data_size = 0;
ulint n_fields;
byte* types;
@@ -1027,17 +1122,42 @@ ibuf_rec_get_volume(
ut_ad(ibuf_inside());
ut_ad(rec_get_n_fields(ibuf_rec) > 2);
- n_fields = rec_get_n_fields(ibuf_rec) - 2;
+ data = rec_get_nth_field(ibuf_rec, 1, &len);
- types = rec_get_nth_field(ibuf_rec, 1, &len);
+ if (len > 1) {
+ /* < 4.1.x format record */
- ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE);
+ ut_a(trx_doublewrite_must_reset_space_ids);
+ ut_a(!trx_sys_multiple_tablespace_format);
+
+ n_fields = rec_get_n_fields(ibuf_rec) - 2;
+
+ types = rec_get_nth_field(ibuf_rec, 1, &len);
+
+ ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE);
+ } else {
+ /* >= 4.1.x format record */
+
+ ut_a(trx_sys_multiple_tablespace_format);
+ new_format = TRUE;
+
+ n_fields = rec_get_n_fields(ibuf_rec) - 4;
+
+ types = rec_get_nth_field(ibuf_rec, 3, &len);
+ }
for (i = 0; i < n_fields; i++) {
- data = rec_get_nth_field(ibuf_rec, i + 2, &len);
+ if (new_format) {
+ data = rec_get_nth_field(ibuf_rec, i + 4, &len);
+
+ dtype_new_read_for_order_and_null_size(&dtype,
+ types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+ } else {
+ data = rec_get_nth_field(ibuf_rec, i + 2, &len);
- dtype_read_for_order_and_null_size(&dtype,
+ dtype_read_for_order_and_null_size(&dtype,
types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE);
+ }
if (len == UNIV_SQL_NULL) {
data_size += dtype_get_sql_null_size(&dtype);
@@ -1062,6 +1182,7 @@ ibuf_entry_build(
must be kept because we copy pointers to its
fields */
dtuple_t* entry, /* in: entry for a non-clustered index */
+ ulint space, /* in: space id */
ulint page_no,/* in: index page number where entry should
be inserted */
mem_heap_t* heap) /* in: heap into which to build */
@@ -1074,49 +1195,79 @@ ibuf_entry_build(
byte* buf2;
ulint i;
- /* We have to build a tuple whose first field is the page number,
- the second field contains the original type information for entry,
- and the rest of the fields are copied from entry. All fields
- in the tuple are of the type binary. */
+ /* Starting from 4.1.x, we have to build a tuple whose
+ (1) first field is the space id,
+ (2) the second field a single marker byte to tell that this
+ is a new format record,
+ (3) the third contains the page number, and
+ (4) the fourth contains the relevant type information of each data
+ field,
+ (5) and the rest of the fields are copied from entry. All fields
+ in the tuple are ordered like the type binary in our insert buffer
+ tree. */
n_fields = dtuple_get_n_fields(entry);
- tuple = dtuple_create(heap, n_fields + 2);
+ tuple = dtuple_create(heap, n_fields + 4);
- /* Store the page number in tuple */
+ /* Store the space id in tuple */
field = dtuple_get_nth_field(tuple, 0);
buf = mem_heap_alloc(heap, 4);
- mach_write_to_4(buf, page_no);
+ mach_write_to_4(buf, space);
dfield_set_data(field, buf, 4);
- /* Store the type info in tuple */
+ /* Store the marker byte field in tuple */
- buf2 = mem_heap_alloc(heap, n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE);
+ field = dtuple_get_nth_field(tuple, 1);
- for (i = 0; i < n_fields; i++) {
+ buf = mem_heap_alloc(heap, 1);
- field = dtuple_get_nth_field(tuple, i + 2);
+ /* We set the marker byte zero */
- entry_field = dtuple_get_nth_field(entry, i);
+ mach_write_to_1(buf, 0);
+
+ dfield_set_data(field, buf, 1);
+
+ /* Store the page number in tuple */
+
+ field = dtuple_get_nth_field(tuple, 2);
+ buf = mem_heap_alloc(heap, 4);
+
+ mach_write_to_4(buf, page_no);
+
+ dfield_set_data(field, buf, 4);
+
+ /* Store the type info in buf2, and add the fields from entry to
+ tuple */
+ buf2 = mem_heap_alloc(heap, n_fields
+ * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+ for (i = 0; i < n_fields; i++) {
+ /* We add 4 below because we have the 4 extra fields at the
+ start of an ibuf record */
+
+ field = dtuple_get_nth_field(tuple, i + 4);
+ entry_field = dtuple_get_nth_field(entry, i);
dfield_copy(field, entry_field);
- dtype_store_for_order_and_null_size(
- buf2 + i * DATA_ORDER_NULL_TYPE_BUF_SIZE,
+ dtype_new_store_for_order_and_null_size(
+ buf2 + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE,
dfield_get_type(entry_field));
}
- field = dtuple_get_nth_field(tuple, 1);
+ /* Store the type info in buf2 to field 3 of tuple */
- dfield_set_data(field, buf2, n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE);
+ field = dtuple_get_nth_field(tuple, 3);
- /* Set the types in the new tuple binary */
+ dfield_set_data(field, buf2, n_fields
+ * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+ /* Set all the types in the new tuple binary */
- dtuple_set_types_binary(tuple, n_fields + 2);
+ dtuple_set_types_binary(tuple, n_fields + 4);
return(tuple);
}
@@ -1145,35 +1296,73 @@ ibuf_build_entry_from_ibuf_rec(
ulint len;
ulint i;
- n_fields = rec_get_n_fields(ibuf_rec) - 2;
+ data = rec_get_nth_field(ibuf_rec, 1, &len);
+
+ if (len > 1) {
+ /* This is a < 4.1.x format record */
+
+ ut_a(trx_doublewrite_must_reset_space_ids);
+ ut_a(!trx_sys_multiple_tablespace_format);
+
+ n_fields = rec_get_n_fields(ibuf_rec) - 2;
+ tuple = dtuple_create(heap, n_fields);
+ types = rec_get_nth_field(ibuf_rec, 1, &len);
+
+ ut_a(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE);
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = rec_get_nth_field(ibuf_rec, i + 2, &len);
+
+ dfield_set_data(field, data, len);
+
+ dtype_read_for_order_and_null_size(
+ dfield_get_type(field),
+ types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE);
+ }
+
+ return(tuple);
+ }
+
+ /* This is a >= 4.1.x format record */
+
+ ut_a(trx_sys_multiple_tablespace_format);
+
+ ut_a(rec_get_n_fields(ibuf_rec) > 4);
+
+ n_fields = rec_get_n_fields(ibuf_rec) - 4;
tuple = dtuple_create(heap, n_fields);
- types = rec_get_nth_field(ibuf_rec, 1, &len);
+ types = rec_get_nth_field(ibuf_rec, 3, &len);
- ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE);
+ ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
for (i = 0; i < n_fields; i++) {
- field = dtuple_get_nth_field(tuple, i);
+ field = dtuple_get_nth_field(tuple, i);
- data = rec_get_nth_field(ibuf_rec, i + 2, &len);
+ data = rec_get_nth_field(ibuf_rec, i + 4, &len);
dfield_set_data(field, data, len);
- dtype_read_for_order_and_null_size(dfield_get_type(field),
- types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE);
+ dtype_new_read_for_order_and_null_size(
+ dfield_get_type(field),
+ types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
}
return(tuple);
}
/*************************************************************************
-Builds a search tuple used to search buffered inserts for an index page. */
+Builds a search tuple used to search buffered inserts for an index page.
+This is for < 4.1.x format records */
static
dtuple_t*
ibuf_search_tuple_build(
/*====================*/
/* out, own: search tuple */
+ ulint space, /* in: space id */
ulint page_no,/* in: index page number */
mem_heap_t* heap) /* in: heap into which to build */
{
@@ -1181,6 +1370,10 @@ ibuf_search_tuple_build(
dfield_t* field;
byte* buf;
+ ut_a(space == 0);
+ ut_a(trx_doublewrite_must_reset_space_ids);
+ ut_a(!trx_sys_multiple_tablespace_format);
+
tuple = dtuple_create(heap, 1);
/* Store the page number in tuple */
@@ -1199,6 +1392,61 @@ ibuf_search_tuple_build(
}
/*************************************************************************
+Builds a search tuple used to search buffered inserts for an index page.
+This is for >= 4.1.x format records. */
+static
+dtuple_t*
+ibuf_new_search_tuple_build(
+/*========================*/
+ /* out, own: search tuple */
+ ulint space, /* in: space id */
+ ulint page_no,/* in: index page number */
+ mem_heap_t* heap) /* in: heap into which to build */
+{
+ dtuple_t* tuple;
+ dfield_t* field;
+ byte* buf;
+
+ ut_a(trx_sys_multiple_tablespace_format);
+
+ tuple = dtuple_create(heap, 3);
+
+ /* Store the space id in tuple */
+
+ field = dtuple_get_nth_field(tuple, 0);
+
+ buf = mem_heap_alloc(heap, 4);
+
+ mach_write_to_4(buf, space);
+
+ dfield_set_data(field, buf, 4);
+
+ /* Store the new format record marker byte */
+
+ field = dtuple_get_nth_field(tuple, 1);
+
+ buf = mem_heap_alloc(heap, 1);
+
+ mach_write_to_1(buf, 0);
+
+ dfield_set_data(field, buf, 1);
+
+ /* Store the page number in tuple */
+
+ field = dtuple_get_nth_field(tuple, 2);
+
+ buf = mem_heap_alloc(heap, 4);
+
+ mach_write_to_4(buf, page_no);
+
+ dfield_set_data(field, buf, 4);
+
+ dtuple_set_types_binary(tuple, 3);
+
+ return(tuple);
+}
+
+/*************************************************************************
Checks if there are enough pages in the free list of the ibuf tree that we
dare to start a pessimistic insert to the insert buffer. */
UNIV_INLINE
@@ -1267,6 +1515,8 @@ ibuf_add_free_page(
page_t* root;
page_t* bitmap_page;
+ ut_a(space == 0);
+
mtr_start(&mtr);
/* Acquire the fsp latch before the ibuf header, obeying the latching
@@ -1312,7 +1562,7 @@ ibuf_add_free_page(
page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
fil_page_set_type(page, FIL_PAGE_IBUF_FREE_LIST);
-
+
ibuf_data->seg_size++;
ibuf_data->free_list_len++;
@@ -1323,7 +1573,6 @@ ibuf_add_free_page(
ibuf_bitmap_page_set_bits(bitmap_page, page_no, IBUF_BITMAP_IBUF,
TRUE, &mtr);
-
mtr_commit(&mtr);
mutex_exit(&ibuf_mutex);
@@ -1350,6 +1599,8 @@ ibuf_remove_free_page(
page_t* root;
page_t* bitmap_page;
+ ut_a(space == 0);
+
mtr_start(&mtr);
/* Acquire the fsp latch before the ibuf header, obeying the latching
@@ -1461,6 +1712,13 @@ ibuf_free_excess_pages(
{
ibuf_data_t* ibuf_data;
ulint i;
+
+ if (space != 0) {
+ fprintf(stderr,
+"InnoDB: Error: calling ibuf_free_excess_pages for space %lu\n", (ulong) space);
+ return;
+ }
+
#ifdef UNIV_SYNC_DEBUG
ut_ad(rw_lock_own(fil_space_get_latch(space), RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */
@@ -1515,8 +1773,12 @@ ibuf_get_merge_page_nos(
contract the tree, FALSE if this is called
when a single page becomes full and we look
if it pays to read also nearby pages */
- rec_t* first_rec,/* in: record from which we read down and
- up in the chain of records */
+ rec_t* first_rec,/* in: record from which we read up and down
+ in the chain of records */
+ ulint* space_ids,/* in/out: space id's of the pages */
+ ib_longlong* space_versions,/* in/out: tablespace version
+ timestamps; used to prevent reading in old
+ pages after DISCARD + IMPORT tablespace */
ulint* page_nos,/* in/out: buffer for at least
IBUF_MAX_N_PAGES_MERGED many page numbers;
the page numbers are in an ascending order */
@@ -1524,8 +1786,11 @@ ibuf_get_merge_page_nos(
page_nos in this function */
{
ulint prev_page_no;
+ ulint prev_space_id;
ulint first_page_no;
+ ulint first_space_id;
ulint rec_page_no;
+ ulint rec_space_id;
rec_t* rec;
ulint sum_volumes;
ulint volume_for_page;
@@ -1557,49 +1822,70 @@ ibuf_get_merge_page_nos(
rec = first_rec;
first_page_no = ibuf_rec_get_page_no(first_rec);
+ first_space_id = ibuf_rec_get_space(first_rec);
n_pages = 0;
prev_page_no = 0;
+ prev_space_id = 0;
+ /* Go backwards from the first_rec until we reach the border of the
+ 'merge area', or the page start or the limit of storeable pages is
+ reached */
+
while ((rec != page_get_infimum_rec(page)) && (n_pages < limit)) {
rec_page_no = ibuf_rec_get_page_no(rec);
+ rec_space_id = ibuf_rec_get_space(rec);
- ut_ad(rec_page_no != 0);
-
- if (rec_page_no / IBUF_MERGE_AREA
- != first_page_no / IBUF_MERGE_AREA) {
+ if (rec_space_id != first_space_id
+ || rec_page_no / IBUF_MERGE_AREA
+ != first_page_no / IBUF_MERGE_AREA) {
break;
}
- if (rec_page_no != prev_page_no) {
+ if (rec_page_no != prev_page_no
+ || rec_space_id != prev_space_id) {
n_pages++;
}
prev_page_no = rec_page_no;
+ prev_space_id = rec_space_id;
rec = page_rec_get_prev(rec);
}
rec = page_rec_get_next(rec);
+ /* At the loop start there is no prev page; we mark this with a pair
+ of space id, page no (0, 0) for which there can never be entries in
+ the insert buffer */
+
prev_page_no = 0;
+ prev_space_id = 0;
sum_volumes = 0;
volume_for_page = 0;
while (*n_stored < limit) {
if (rec == page_get_supremum_rec(page)) {
+ /* When no more records available, mark this with
+ another 'impossible' pair of space id, page no */
rec_page_no = 1;
+ rec_space_id = 0;
} else {
rec_page_no = ibuf_rec_get_page_no(rec);
+ rec_space_id = ibuf_rec_get_space(rec);
ut_ad(rec_page_no > IBUF_TREE_ROOT_PAGE_NO);
}
#ifdef UNIV_IBUF_DEBUG
ut_a(*n_stored < IBUF_MAX_N_PAGES_MERGED);
#endif
- if (rec_page_no != prev_page_no) {
- if ((prev_page_no == first_page_no)
+ if ((rec_space_id != prev_space_id
+ || rec_page_no != prev_page_no)
+ && (prev_space_id != 0 || prev_page_no != 0)) {
+
+ if ((prev_page_no == first_page_no
+ && prev_space_id == first_space_id)
|| contract
|| (volume_for_page >
((IBUF_MERGE_THRESHOLD - 1)
@@ -1607,6 +1893,10 @@ ibuf_get_merge_page_nos(
/ IBUF_PAGE_SIZE_PER_FREE_SPACE)
/ IBUF_MERGE_THRESHOLD)) {
+ space_ids[*n_stored] = prev_space_id;
+ space_versions[*n_stored]
+ = fil_space_get_version(
+ prev_space_id);
page_nos[*n_stored] = prev_page_no;
(*n_stored)++;
@@ -1614,8 +1904,9 @@ ibuf_get_merge_page_nos(
sum_volumes += volume_for_page;
}
- if (rec_page_no / IBUF_MERGE_AREA
- != first_page_no / IBUF_MERGE_AREA) {
+ if (rec_space_id != first_space_id
+ || rec_page_no / IBUF_MERGE_AREA
+ != first_page_no / IBUF_MERGE_AREA) {
break;
}
@@ -1623,7 +1914,7 @@ ibuf_get_merge_page_nos(
volume_for_page = 0;
}
- if (rec_page_no == 1) {
+ if (rec_page_no == 1 && rec_space_id == 0) {
/* Supremum record */
break;
@@ -1634,6 +1925,7 @@ ibuf_get_merge_page_nos(
volume_for_page += rec_volume;
prev_page_no = rec_page_no;
+ prev_space_id = rec_space_id;
rec = page_rec_get_next(rec);
}
@@ -1666,6 +1958,8 @@ ibuf_contract_ext(
ulint space;
ibool all_trees_empty;
ulint page_nos[IBUF_MAX_N_PAGES_MERGED];
+ ulint space_ids[IBUF_MAX_N_PAGES_MERGED];
+ ib_longlong space_versions[IBUF_MAX_N_PAGES_MERGED];
ulint n_stored;
ulint sum_sizes;
mtr_t mtr;
@@ -1678,7 +1972,8 @@ loop:
ut_ad(ibuf_validate_low());
- /* Choose an ibuf tree at random */
+ /* Choose an ibuf tree at random (though there really is only one tree
+ in the current implementation) */
ibuf_rnd += 865558671;
rnd_pos = ibuf_rnd % ibuf->size;
@@ -1714,8 +2009,10 @@ loop:
ut_ad(data);
- space = (data->index)->space;
+ space = data->index->space;
+ ut_a(space == 0); /* We currently only have an ibuf tree in
+ space 0 */
mtr_start(&mtr);
ibuf_enter();
@@ -1744,8 +2041,8 @@ loop:
mutex_exit(&ibuf_mutex);
sum_sizes = ibuf_get_merge_page_nos(TRUE, btr_pcur_get_rec(&pcur),
- page_nos, &n_stored);
-
+ space_ids, space_versions, page_nos,
+ &n_stored);
#ifdef UNIV_IBUF_DEBUG
/* printf("Ibuf contract sync %lu pages %lu volume %lu\n", sync,
n_stored, sum_sizes); */
@@ -1755,8 +2052,8 @@ loop:
mtr_commit(&mtr);
btr_pcur_close(&pcur);
- buf_read_ibuf_merge_pages(sync, space, page_nos, n_stored);
-
+ buf_read_ibuf_merge_pages(sync, space_ids, space_versions, page_nos,
+ n_stored);
*n_pages = n_stored;
return(sum_sizes + 1);
@@ -1885,6 +2182,8 @@ ibuf_get_volume_buffered(
ulint next_page_no;
page_t* next_page;
+ ut_a(trx_sys_multiple_tablespace_format);
+
ut_ad((pcur->latch_mode == BTR_MODIFY_PREV)
|| (pcur->latch_mode == BTR_MODIFY_TREE));
@@ -1907,7 +2206,8 @@ ibuf_get_volume_buffered(
break;
}
- if (page_no != ibuf_rec_get_page_no(rec)) {
+ if (page_no != ibuf_rec_get_page_no(rec)
+ || space != ibuf_rec_get_space(rec)) {
goto count_later;
}
@@ -1926,7 +2226,7 @@ ibuf_get_volume_buffered(
goto count_later;
}
- prev_page = buf_page_get(space, prev_page_no, RW_X_LATCH, mtr);
+ prev_page = buf_page_get(0, prev_page_no, RW_X_LATCH, mtr);
#ifdef UNIV_SYNC_DEBUG
buf_page_dbg_add_level(prev_page, SYNC_TREE_NODE);
@@ -1945,7 +2245,8 @@ ibuf_get_volume_buffered(
return(UNIV_PAGE_SIZE);
}
- if (page_no != ibuf_rec_get_page_no(rec)) {
+ if (page_no != ibuf_rec_get_page_no(rec)
+ || space != ibuf_rec_get_space(rec)) {
goto count_later;
}
@@ -1968,7 +2269,8 @@ count_later:
break;
}
- if (page_no != ibuf_rec_get_page_no(rec)) {
+ if (page_no != ibuf_rec_get_page_no(rec)
+ || space != ibuf_rec_get_space(rec)) {
return(volume);
}
@@ -1987,7 +2289,7 @@ count_later:
return(volume);
}
- next_page = buf_page_get(space, next_page_no, RW_X_LATCH, mtr);
+ next_page = buf_page_get(0, next_page_no, RW_X_LATCH, mtr);
#ifdef UNIV_SYNC_DEBUG
buf_page_dbg_add_level(next_page, SYNC_TREE_NODE);
@@ -2004,7 +2306,8 @@ count_later:
return(UNIV_PAGE_SIZE);
}
- if (page_no != ibuf_rec_get_page_no(rec)) {
+ if (page_no != ibuf_rec_get_page_no(rec)
+ || space != ibuf_rec_get_space(rec)) {
return(volume);
}
@@ -2016,6 +2319,57 @@ count_later:
}
/*************************************************************************
+Reads the biggest tablespace id from the high end of the insert buffer
+tree and updates the counter in fil_system. */
+
+void
+ibuf_update_max_tablespace_id(void)
+/*===============================*/
+{
+ ulint max_space_id;
+ rec_t* rec;
+ byte* field;
+ ulint len;
+ ibuf_data_t* ibuf_data;
+ dict_index_t* ibuf_index;
+ btr_pcur_t pcur;
+ mtr_t mtr;
+
+ ibuf_data = fil_space_get_ibuf_data(0);
+
+ ibuf_index = ibuf_data->index;
+
+ ibuf_enter();
+
+ mtr_start(&mtr);
+
+ btr_pcur_open_at_index_side(FALSE, ibuf_index, BTR_SEARCH_LEAF,
+ &pcur, TRUE, &mtr);
+ btr_pcur_move_to_prev(&pcur, &mtr);
+
+ if (btr_pcur_is_before_first_on_page(&pcur, &mtr)) {
+ /* The tree is empty */
+
+ max_space_id = 0;
+ } else {
+ rec = btr_pcur_get_rec(&pcur);
+
+ field = rec_get_nth_field(rec, 0, &len);
+
+ ut_a(len == 4);
+
+ max_space_id = mach_read_from_4(field);
+ }
+
+ mtr_commit(&mtr);
+ ibuf_exit();
+
+ /* printf("Maximum space id in insert buffer %lu\n", max_space_id); */
+
+ fil_set_max_space_id_if_bigger(max_space_id);
+}
+
+/*************************************************************************
Makes an index insert to the insert buffer, instead of directly to the disk
page, if this is possible. */
static
@@ -2035,8 +2389,6 @@ ibuf_insert_low(
ulint entry_size;
btr_pcur_t pcur;
btr_cur_t* cursor;
- mtr_t mtr;
- mtr_t bitmap_mtr;
dtuple_t* ibuf_entry;
mem_heap_t* heap;
ulint buffered;
@@ -2048,16 +2400,25 @@ ibuf_insert_low(
page_t* root;
ulint err;
ibool do_merge;
+ ulint space_ids[IBUF_MAX_N_PAGES_MERGED];
+ ib_longlong space_versions[IBUF_MAX_N_PAGES_MERGED];
ulint page_nos[IBUF_MAX_N_PAGES_MERGED];
ulint n_stored;
ulint bits;
+ mtr_t mtr;
+ mtr_t bitmap_mtr;
ut_a(!(index->type & DICT_CLUSTERED));
ut_ad(dtuple_check_typed(entry));
+ ut_a(trx_sys_multiple_tablespace_format);
+
do_merge = FALSE;
-
- ibuf_data = fil_space_get_ibuf_data(space);
+
+ /* Currently the insert buffer of space 0 takes care of inserts to all
+ tablespaces */
+
+ ibuf_data = fil_space_get_ibuf_data(0);
ibuf_index = ibuf_data->index;
@@ -2084,7 +2445,7 @@ ibuf_insert_low(
mutex_enter(&ibuf_pessimistic_insert_mutex);
ibuf_enter();
-
+
mutex_enter(&ibuf_mutex);
while (!ibuf_data_enough_free_for_insert(ibuf_data)) {
@@ -2095,7 +2456,7 @@ ibuf_insert_low(
mutex_exit(&ibuf_pessimistic_insert_mutex);
- err = ibuf_add_free_page(space, ibuf_data);
+ err = ibuf_add_free_page(0, ibuf_data);
if (err == DB_STRONG_FAIL) {
@@ -2120,7 +2481,7 @@ ibuf_insert_low(
the first fields and the type information for other fields, and which
will be inserted to the insert buffer. */
- ibuf_entry = ibuf_entry_build(entry, page_no, heap);
+ ibuf_entry = ibuf_entry_build(entry, space, page_no, heap);
/* Open a cursor to the insert buffer tree to calculate if we can add
the new entry to it without exceeding the free space limit for the
@@ -2145,7 +2506,6 @@ ibuf_insert_low(
if (buf_page_peek(space, page_no)
|| lock_rec_expl_exist_on_page(space, page_no)) {
-
err = DB_STRONG_FAIL;
mtr_commit(&bitmap_mtr);
@@ -2158,7 +2518,6 @@ ibuf_insert_low(
if (buffered + entry_size + page_dir_calc_reserved_space(1)
> ibuf_index_page_calc_free_from_bits(bits)) {
-
mtr_commit(&bitmap_mtr);
/* It may not fit */
@@ -2167,7 +2526,8 @@ ibuf_insert_low(
do_merge = TRUE;
ibuf_get_merge_page_nos(FALSE, btr_pcur_get_rec(&pcur),
- page_nos, &n_stored);
+ space_ids, space_versions, page_nos,
+ &n_stored);
goto function_exit;
}
@@ -2203,10 +2563,10 @@ ibuf_insert_low(
which would cause the x-latching of the root after that to
break the latching order. */
- root = ibuf_tree_root_get(ibuf_data, space, &mtr);
+ root = ibuf_tree_root_get(ibuf_data, 0, &mtr);
err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG
- | BTR_NO_UNDO_LOG_FLAG,
+ | BTR_NO_UNDO_LOG_FLAG,
cursor,
ibuf_entry, &ins_rec,
&dummy_big_rec, thr,
@@ -2223,6 +2583,10 @@ ibuf_insert_low(
function_exit:
#ifdef UNIV_IBUF_DEBUG
if (err == DB_SUCCESS) {
+ printf(
+"Incrementing ibuf count of space %lu page %lu\n"
+"from %lu by 1\n", space, page_no, ibuf_count_get(space, page_no));
+
ibuf_count_set(space, page_no,
ibuf_count_get(space, page_no) + 1);
}
@@ -2257,7 +2621,8 @@ function_exit:
#ifdef UNIV_IBUF_DEBUG
ut_a(n_stored <= IBUF_MAX_N_PAGES_MERGED);
#endif
- buf_read_ibuf_merge_pages(FALSE, space, page_nos, n_stored);
+ buf_read_ibuf_merge_pages(FALSE, space_ids, space_versions,
+ page_nos, n_stored);
}
return(err);
@@ -2280,6 +2645,7 @@ ibuf_insert(
{
ulint err;
+ ut_a(trx_sys_multiple_tablespace_format);
ut_ad(dtuple_check_typed(entry));
ut_a(!(index->type & DICT_CLUSTERED));
@@ -2332,6 +2698,26 @@ ibuf_insert_to_index_page(
ut_ad(ibuf_inside());
ut_ad(dtuple_check_typed(entry));
+ if (rec_get_n_fields(page_rec_get_next(page_get_infimum_rec(page)))
+ != dtuple_get_n_fields(entry)) {
+
+ /* The entry has a different number of fields than the record
+ following the page infimum: report the mismatch, dump the
+ page, and return without inserting. The entry must be
+ formatted into errbuf BEFORE the message that prints errbuf;
+ otherwise stale or uninitialized buffer contents are output
+ and the formatted entry is never shown. */
+
+ dtuple_sprintf(errbuf, 900, entry);
+
+ fprintf(stderr,
+"InnoDB: Trying to insert a record from the insert buffer to an index page\n"
+"InnoDB: but the number of fields does not match!\n%s\n", errbuf);
+
+ buf_page_print(page);
+
+ fprintf(stderr,
+"InnoDB: The table where where this index record belongs\n"
+"InnoDB: is now probably corrupt. Please run CHECK TABLE on\n"
+"InnoDB: your tables.\n"
+"InnoDB: Send a detailed bug report to mysql@lists.mysql.com!\n");
+
+ return;
+ }
+
low_match = page_cur_search(page, entry, PAGE_CUR_LE, &page_cur);
if (low_match == dtuple_get_n_fields(entry)) {
@@ -2355,39 +2741,34 @@ ibuf_insert_to_index_page(
fprintf(stderr,
"InnoDB: Error: Insert buffer insert fails; page free %lu, dtuple size %lu\n",
- page_get_max_insert_size(page, 1),
- rec_get_converted_size(entry));
+ (ulong) page_get_max_insert_size(page, 1),
+ (ulong) rec_get_converted_size(entry));
dtuple_sprintf(errbuf, 900, entry);
fprintf(stderr,
-"InnoDB: Cannot insert index record %s\n", errbuf);
-
- fprintf(stderr,
+"InnoDB: Cannot insert index record %s\n"
"InnoDB: The table where where this index record belongs\n"
"InnoDB: is now probably corrupt. Please run CHECK TABLE on\n"
-"InnoDB: that table.\n");
-
+"InnoDB: that table.\n", errbuf);
bitmap_page = ibuf_bitmap_get_map_page(
buf_frame_get_space_id(page),
buf_frame_get_page_no(page),
mtr);
-
old_bits = ibuf_bitmap_page_get_bits(
bitmap_page,
buf_frame_get_page_no(page),
IBUF_BITMAP_FREE, mtr);
- fprintf(stderr, "Bitmap bits %lu\n", old_bits);
+ fprintf(stderr, "Bitmap bits %lu\n", (ulong) old_bits);
fprintf(stderr,
"InnoDB: Send a detailed bug report to mysql@lists.mysql.com!\n");
-
}
}
}
}
-
+
/*************************************************************************
Deletes from ibuf the record on which pcur is positioned. If we have to
resort to a pessimistic delete, this function commits mtr and closes
@@ -2411,13 +2792,16 @@ ibuf_delete_rec(
ibuf_data_t* ibuf_data;
page_t* root;
ulint err;
-
+
ut_ad(ibuf_inside());
success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur), mtr);
if (success) {
#ifdef UNIV_IBUF_DEBUG
+ printf(
+"Decrementing ibuf count of space %lu page %lu\n"
+"from %lu by 1\n", space, page_no, ibuf_count_get(space, page_no));
ibuf_count_set(space, page_no,
ibuf_count_get(space, page_no) - 1);
#endif
@@ -2429,7 +2813,10 @@ ibuf_delete_rec(
btr_pcur_commit_specify_mtr(pcur, mtr);
- ibuf_data = fil_space_get_ibuf_data(space);
+ /* Currently the insert buffer of space 0 takes care of inserts to all
+ tablespaces */
+
+ ibuf_data = fil_space_get_ibuf_data(0);
mutex_enter(&ibuf_mutex);
@@ -2439,10 +2826,10 @@ ibuf_delete_rec(
if (!success) {
fprintf(stderr,
- "InnoDB: ERROR: Send the output to heikki.tuuri@innodb.com\n");
- fprintf(stderr, "InnoDB: ibuf cursor restoration fails!\n");
- fprintf(stderr, "InnoDB: ibuf record inserted to page %lu\n",
- page_no);
+"InnoDB: ERROR: Send the output to mysql@lists.mysql.com\n"
+"InnoDB: ibuf cursor restoration fails!\n"
+"InnoDB: ibuf record inserted to space %lu page %lu\n", (ulong) space,
+ (ulong) page_no);
fflush(stderr);
rec_print(btr_pcur_get_rec(pcur));
@@ -2452,18 +2839,23 @@ ibuf_delete_rec(
rec_print(page_rec_get_next(btr_pcur_get_rec(pcur)));
fflush(stdout);
- mtr_commit(mtr);
+ btr_pcur_commit_specify_mtr(pcur, mtr);
- fprintf(stderr, "InnoDB: Validating insert buffer tree:\n");
+ fprintf(stderr,
+ "InnoDB: Validating insert buffer tree:\n");
ut_a(btr_validate_tree(ibuf_data->index->tree));
fprintf(stderr, "InnoDB: ibuf tree ok\n");
fflush(stderr);
+
+ btr_pcur_close(pcur);
+
+ mutex_exit(&ibuf_mutex);
+
+ return(TRUE);
}
-
- ut_a(success);
- root = ibuf_tree_root_get(ibuf_data, space, mtr);
+ root = ibuf_tree_root_get(ibuf_data, 0, mtr);
btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur),
FALSE, mtr);
@@ -2499,7 +2891,11 @@ ibuf_merge_or_delete_for_page(
page_t* page, /* in: if page has been read from disk, pointer to
the page x-latched, else NULL */
ulint space, /* in: space id of the index page */
- ulint page_no)/* in: page number of the index page */
+ ulint page_no,/* in: page number of the index page */
+ ibool update_ibuf_bitmap)/* in: normally this is set to TRUE, but if
+ we have deleted or are deleting the tablespace, then we
+ naturally do not want to update a non-existent bitmap
+ page */
{
mem_heap_t* heap;
btr_pcur_t pcur;
@@ -2516,6 +2912,7 @@ ibuf_merge_or_delete_for_page(
ulint old_bits;
ulint new_bits;
dulint max_trx_id;
+ ibool tablespace_being_deleted = FALSE;
ibool corruption_noticed = FALSE;
mtr_t mtr;
char err_buf[500];
@@ -2524,7 +2921,7 @@ ibuf_merge_or_delete_for_page(
return;
}
-
+
#ifdef UNIV_LOG_DEBUG
if (space % 2 != 0) {
@@ -2538,28 +2935,57 @@ ibuf_merge_or_delete_for_page(
return;
}
- mtr_start(&mtr);
+ if (update_ibuf_bitmap) {
+ /* If the following returns FALSE, we get the counter
+ incremented, and must decrement it when we leave this
+ function. When the counter is > 0, that prevents tablespace
+ from being dropped. */
- bitmap_page = ibuf_bitmap_get_map_page(space, page_no, &mtr);
+ tablespace_being_deleted = fil_inc_pending_ibuf_merges(space);
+
+ if (tablespace_being_deleted) {
+ /* Do not try to read the bitmap page from space;
+ just delete the ibuf records for the page */
+
+ page = NULL;
+ update_ibuf_bitmap = FALSE;
+ }
+ }
+
+ if (update_ibuf_bitmap) {
+ mtr_start(&mtr);
+ bitmap_page = ibuf_bitmap_get_map_page(space, page_no, &mtr);
- if (!ibuf_bitmap_page_get_bits(bitmap_page, page_no,
+ if (!ibuf_bitmap_page_get_bits(bitmap_page, page_no,
IBUF_BITMAP_BUFFERED, &mtr)) {
- /* No inserts buffered for this page */
+ /* No inserts buffered for this page */
+ mtr_commit(&mtr);
- mtr_commit(&mtr);
+ if (!tablespace_being_deleted) {
+ fil_decr_pending_ibuf_merges(space);
+ }
- return;
+ return;
+ }
+ mtr_commit(&mtr);
}
- mtr_commit(&mtr);
+ /* Currently the insert buffer of space 0 takes care of inserts to all
+ tablespaces */
- ibuf_data = fil_space_get_ibuf_data(space);
+ ibuf_data = fil_space_get_ibuf_data(0);
ibuf_enter();
heap = mem_heap_create(512);
- search_tuple = ibuf_search_tuple_build(page_no, heap);
+ if (!trx_sys_multiple_tablespace_format) {
+ ut_a(trx_doublewrite_must_reset_space_ids);
+ search_tuple = ibuf_search_tuple_build(space, page_no, heap);
+ } else {
+ search_tuple = ibuf_new_search_tuple_build(space, page_no,
+ heap);
+ }
if (page) {
/* Move the ownership of the x-latch on the page to this OS
@@ -2600,7 +3026,8 @@ ibuf_merge_or_delete_for_page(
"InnoDB: to determine if they are corrupt after this.\n\n"
"InnoDB: Please make a detailed bug report and send it to\n"
"InnoDB: mysql@lists.mysql.com\n\n",
- page_no, fil_page_get_type(page));
+ (ulong) page_no,
+ (ulong) fil_page_get_type(page));
}
}
@@ -2624,7 +3051,6 @@ loop:
index page */
btr_pcur_open_on_user_rec(ibuf_data->index, search_tuple, PAGE_CUR_GE,
BTR_MODIFY_LEAF, &pcur, &mtr);
-
if (!btr_pcur_is_on_user_rec(&pcur, &mtr)) {
ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr));
@@ -2637,29 +3063,18 @@ loop:
ibuf_rec = btr_pcur_get_rec(&pcur);
/* Check if the entry is for this index page */
- if (ibuf_rec_get_page_no(ibuf_rec) != page_no) {
-
+ if (ibuf_rec_get_page_no(ibuf_rec) != page_no
+ || ibuf_rec_get_space(ibuf_rec) != space) {
if (page) {
page_header_reset_last_insert(page, &mtr);
}
-
goto reset_bit;
}
- /* Do NOT merge to the 4.1 code base! */
- if (trx_sys_downgrading_from_4_1_1) {
- fprintf(stderr,
-"InnoDB: Fatal error: you are downgrading from >= 4.1.1 to 4.0, but\n"
-"InnoDB: the insert buffer was not empty.\n");
- ut_error;
- }
-
if (corruption_noticed) {
rec_sprintf(err_buf, 450, ibuf_rec);
-
fprintf(stderr,
"InnoDB: Discarding record\n %s\n from the insert buffer!\n\n", err_buf);
-
} else if (page) {
/* Now we have at pcur a record which should be
inserted to the index page; NOTE that the call below
@@ -2669,14 +3084,12 @@ loop:
max_trx_id = page_get_max_trx_id(
buf_frame_align(ibuf_rec));
-
page_update_max_trx_id(page, max_trx_id);
entry = ibuf_build_entry_from_ibuf_rec(ibuf_rec, heap);
#ifdef UNIV_IBUF_DEBUG
volume += rec_get_converted_size(entry)
+ page_dir_calc_reserved_space(1);
-
ut_a(volume <= 4 * UNIV_PAGE_SIZE
/ IBUF_PAGE_SIZE_PER_FREE_SPACE);
#endif
@@ -2704,43 +3117,38 @@ loop:
}
reset_bit:
-
#ifdef UNIV_IBUF_DEBUG
if (ibuf_count_get(space, page_no) > 0) {
-
/* btr_print_tree(ibuf_data->index->tree, 100);
ibuf_print(); */
}
#endif
- bitmap_page = ibuf_bitmap_get_map_page(space, page_no, &mtr);
-
- ibuf_bitmap_page_set_bits(bitmap_page, page_no,
+ if (update_ibuf_bitmap) {
+ bitmap_page = ibuf_bitmap_get_map_page(space, page_no, &mtr);
+ ibuf_bitmap_page_set_bits(bitmap_page, page_no,
IBUF_BITMAP_BUFFERED, FALSE, &mtr);
- if (page) {
- old_bits = ibuf_bitmap_page_get_bits(bitmap_page, page_no,
- IBUF_BITMAP_FREE, &mtr);
- new_bits = ibuf_index_page_calc_free(page);
-
+ if (page) {
+ old_bits = ibuf_bitmap_page_get_bits(bitmap_page,
+ page_no, IBUF_BITMAP_FREE, &mtr);
+ new_bits = ibuf_index_page_calc_free(page);
#ifdef UNIV_IBUF_DEBUG
- /* printf("Old bits %lu new bits %lu max size %lu\n", old_bits,
- new_bits,
+ /* printf("Old bits %lu new bits %lu max size %lu\n",
+ old_bits, new_bits,
page_get_max_insert_size_after_reorganize(page, 1)); */
#endif
- if (old_bits != new_bits) {
-
- ibuf_bitmap_page_set_bits(bitmap_page, page_no,
+ if (old_bits != new_bits) {
+ ibuf_bitmap_page_set_bits(bitmap_page, page_no,
IBUF_BITMAP_FREE,
new_bits, &mtr);
+ }
}
}
-
#ifdef UNIV_IBUF_DEBUG
/* printf("Ibuf merge %lu records volume %lu to page no %lu\n",
n_inserts, volume, page_no); */
#endif
mtr_commit(&mtr);
btr_pcur_close(&pcur);
-
mem_heap_free(heap);
/* Protect our statistics keeping from race conditions */
@@ -2751,12 +3159,123 @@ reset_bit:
mutex_exit(&ibuf_mutex);
+ if (update_ibuf_bitmap && !tablespace_being_deleted) {
+
+ fil_decr_pending_ibuf_merges(space);
+ }
+
ibuf_exit();
#ifdef UNIV_IBUF_DEBUG
ut_a(ibuf_count_get(space, page_no) == 0);
#endif
}
+/*************************************************************************
+Deletes all entries in the insert buffer for a given space id. This is used
+in DISCARD TABLESPACE and IMPORT TABLESPACE.
+NOTE: this does not update the page free bitmaps in the space. The space will
+become CORRUPT when you call this function!
+The count of records handed to ibuf_delete_rec is added to the merge
+statistics of the space 0 insert buffer, and a summary line is printed to
+stdout. */
+
+void
+ibuf_delete_for_discarded_space(
+/*============================*/
+ ulint space) /* in: space id */
+{
+ mem_heap_t* heap;
+ btr_pcur_t pcur;
+ dtuple_t* search_tuple;
+ rec_t* ibuf_rec;
+ ulint page_no;
+ ibool closed;
+ ibuf_data_t* ibuf_data;
+ ulint n_inserts; /* number of records passed to ibuf_delete_rec */
+ mtr_t mtr;
+
+ /* Currently the insert buffer of space 0 takes care of inserts to all
+ tablespaces */
+
+ ibuf_data = fil_space_get_ibuf_data(0);
+
+ heap = mem_heap_create(512);
+
+ /* Use page number 0 to build the search tuple so that we get the
+ cursor positioned at the first entry for this space id */
+
+ search_tuple = ibuf_new_search_tuple_build(space, 0, heap);
+
+ n_inserts = 0;
+loop:
+ /* Every pass through this label enters the ibuf and starts a fresh mtr;
+ each path that jumps back here has already called ibuf_exit() */
+ ibuf_enter();
+
+ mtr_start(&mtr);
+
+ /* Position pcur in the insert buffer at the first entry for the
+ space */
+ btr_pcur_open_on_user_rec(ibuf_data->index, search_tuple, PAGE_CUR_GE,
+ BTR_MODIFY_LEAF, &pcur, &mtr);
+ if (!btr_pcur_is_on_user_rec(&pcur, &mtr)) {
+ ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr));
+
+ goto leave_loop;
+ }
+
+ for (;;) {
+ ut_ad(btr_pcur_is_on_user_rec(&pcur, &mtr));
+
+ ibuf_rec = btr_pcur_get_rec(&pcur);
+
+ /* Check if the entry is for this space; the first record of
+ another space ends the scan */
+ if (ibuf_rec_get_space(ibuf_rec) != space) {
+
+ goto leave_loop;
+ }
+
+ page_no = ibuf_rec_get_page_no(ibuf_rec);
+
+ n_inserts++; /* counted before the delete call is made */
+
+ /* Delete the record from ibuf */
+ closed = ibuf_delete_rec(space, page_no, &pcur, search_tuple,
+ &mtr);
+ if (closed) {
+ /* Deletion was pessimistic and mtr was committed:
+ we start from the beginning again (ibuf_delete_rec
+ also returns TRUE, with pcur closed, when its cursor
+ restoration fails) */
+
+ ibuf_exit();
+
+ goto loop;
+ }
+
+ if (btr_pcur_is_after_last_on_page(&pcur, &mtr)) {
+ /* The cursor ran past the last record on this page:
+ commit, close, and reposition with a new search */
+ mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ ibuf_exit();
+
+ goto loop;
+ }
+ }
+
+leave_loop:
+ /* Both jumps to this label arrive with mtr active and pcur open */
+ mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ /* Protect our statistics keeping from race conditions */
+ mutex_enter(&ibuf_mutex);
+
+ ibuf_data->n_merges++; /* the whole call counts as one merge */
+ ibuf_data->n_merged_recs += n_inserts;
+
+ mutex_exit(&ibuf_mutex);
+
+ printf("Discarded %lu ibuf entries for space %lu\n", (ulong) n_inserts,
+ (ulong) space);
+
+ ibuf_exit();
+
+ mem_heap_free(heap);
+}
+
/**********************************************************************
Validates the ibuf data structures when the caller owns ibuf_mutex. */
@@ -2788,6 +3307,56 @@ ibuf_validate_low(void)
}
/**********************************************************************
+Looks if the insert buffer is empty. */
+
+ibool
+ibuf_is_empty(void)
+/*===============*/
+ /* out: TRUE if empty, i.e., if the root page of
+ the ibuf tree contains no records */
+{
+ ibuf_data_t* data;
+ ibool is_empty;
+ page_t* root;
+ mtr_t mtr;
+
+ ibuf_enter();
+
+ mutex_enter(&ibuf_mutex);
+
+ data = UT_LIST_GET_FIRST(ibuf->data_list); /* first ibuf data struct in
+ the list; asserted below to belong to space 0 */
+
+ mtr_start(&mtr);
+
+ root = ibuf_tree_root_get(data, 0, &mtr);
+
+ if (page_get_n_recs(root) == 0) {
+
+ is_empty = TRUE;
+
+ if (data->empty == FALSE) {
+ /* The in-memory flag disagrees with the tree; this
+ is only warned about, not asserted */
+ fprintf(stderr,
+"InnoDB: Warning: insert buffer tree is empty but the data struct does not\n"
+"InnoDB: know it. This condition is legal if the master thread has not yet\n"
+"InnoDB: run to completion.\n");
+ }
+ } else {
+ ut_a(data->empty == FALSE); /* a non-empty tree must never be
+ flagged as empty */
+
+ is_empty = FALSE;
+ }
+
+ mtr_commit(&mtr);
+
+ ut_a(data->space == 0);
+
+ mutex_exit(&ibuf_mutex);
+
+ ibuf_exit();
+
+ return(is_empty);
+}
+
+/**********************************************************************
Prints info of ibuf. */
void
@@ -2810,18 +3379,29 @@ ibuf_print(
while (data) {
buf += sprintf(buf,
- "Ibuf for space %lu: size %lu, free list len %lu, seg size %lu,\n",
- data->space, data->size, data->free_list_len, data->seg_size);
+ "Ibuf for space %lu: size %lu, free list len %lu, seg size %lu,",
+ (ulong) data->space, (ulong) data->size,
+ (ulong) data->free_list_len,
+ (ulong) data->seg_size);
+
+ if (data->empty) {
+ buf += sprintf(buf, " is empty\n");
+ } else {
+ buf += sprintf(buf, " is not empty\n");
+ }
buf += sprintf(buf,
"%lu inserts, %lu merged recs, %lu merges\n",
- data->n_inserts, data->n_merged_recs, data->n_merges);
+ (ulong) data->n_inserts,
+ (ulong) data->n_merged_recs,
+ (ulong) data->n_merges);
#ifdef UNIV_IBUF_DEBUG
for (i = 0; i < IBUF_COUNT_N_PAGES; i++) {
if (ibuf_count_get(data->space, i) > 0) {
printf("Ibuf count for page %lu is %lu\n",
- i, ibuf_count_get(data->space, i));
+ (ulong) i,
+ (ulong) ibuf_count_get(data->space, i));
}
}
#endif
diff --git a/innobase/include/btr0btr.ic b/innobase/include/btr0btr.ic
index fd66c7bf2a3..b0aa0756307 100644
--- a/innobase/include/btr0btr.ic
+++ b/innobase/include/btr0btr.ic
@@ -188,6 +188,7 @@ btr_node_ptr_get_child_page_no(
ulint n_fields;
byte* field;
ulint len;
+ ulint page_no;
n_fields = rec_get_n_fields(rec);
@@ -196,7 +197,16 @@ btr_node_ptr_get_child_page_no(
ut_ad(len == 4);
- return(mach_read_from_4(field));
+ page_no = mach_read_from_4(field);
+
+ if (page_no == 0) {
+ fprintf(stderr,
+"InnoDB: a nonsensical page number 0 in a node ptr record at offset %lu\n",
+ (unsigned long)(rec - buf_frame_align(rec)));
+ buf_page_print(buf_frame_align(rec));
+ }
+
+ return(page_no);
}
/******************************************************************
diff --git a/innobase/include/btr0pcur.h b/innobase/include/btr0pcur.h
index 9d07dd0de18..81f19af4d40 100644
--- a/innobase/include/btr0pcur.h
+++ b/innobase/include/btr0pcur.h
@@ -466,6 +466,9 @@ struct btr_pcur_struct{
BTR_PCUR_AFTER, depending on whether
cursor was on, before, or after the
old_rec record */
+ buf_block_t* block_when_stored;/* buffer block when the position was
+ stored; note that if AWE is on, frames
+ may move */
dulint modify_clock; /* the modify clock value of the
buffer block when the cursor position
was stored */
diff --git a/innobase/include/btr0pcur.ic b/innobase/include/btr0pcur.ic
index a1db2cc52dd..b553a569bda 100644
--- a/innobase/include/btr0pcur.ic
+++ b/innobase/include/btr0pcur.ic
@@ -564,7 +564,7 @@ btr_pcur_open_at_index_side(
}
btr_cur_open_at_index_side(from_left, index, latch_mode,
- btr_pcur_get_btr_cur(pcur), mtr);
+ btr_pcur_get_btr_cur(pcur), mtr);
pcur->pos_state = BTR_PCUR_IS_POSITIONED;
pcur->old_stored = BTR_PCUR_OLD_NOT_STORED;
diff --git a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h
index 72cedafa7e1..3cab717546a 100644
--- a/innobase/include/buf0buf.h
+++ b/innobase/include/buf0buf.h
@@ -30,6 +30,7 @@ Created 11/5/1995 Heikki Tuuri
#include "sync0rw.h"
#include "hash0hash.h"
#include "ut0byte.h"
+#include "os0proc.h"
/* Flags for flush types */
#define BUF_FLUSH_LRU 1
@@ -58,23 +59,34 @@ extern ibool buf_debug_prints;/* If this is set TRUE, the program
occurs */
/************************************************************************
-Initializes the buffer pool of the database. */
+Creates the buffer pool. */
-void
+buf_pool_t*
buf_pool_init(
/*==========*/
- ulint max_size, /* in: maximum size of the pool in blocks */
- ulint curr_size); /* in: current size to use, must be <=
+ /* out, own: buf_pool object, NULL if not
+ enough memory or error */
+ ulint max_size, /* in: maximum size of the buf_pool in
+ blocks */
+ ulint curr_size, /* in: current size to use, must be <=
+ max_size, currently must be equal to
max_size */
+ ulint n_frames); /* in: number of frames; if AWE is used,
+ this is the size of the address space window
+ where physical memory pages are mapped; if
+ AWE is not used then this must be the same
+ as max_size */
/*************************************************************************
-Gets the current size of buffer pool in bytes. */
+Gets the current size of buffer buf_pool in bytes. In the case of AWE, the
+size of AWE window (= the frames). */
UNIV_INLINE
ulint
buf_pool_get_curr_size(void);
/*========================*/
/* out: size in bytes */
/*************************************************************************
-Gets the maximum size of buffer pool in bytes. */
+Gets the maximum size of buffer pool in bytes. In the case of AWE, the
+size of AWE window (= the frames). */
UNIV_INLINE
ulint
buf_pool_get_max_size(void);
@@ -138,8 +150,8 @@ improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed as LA! */
NOTE! The following macros should be used instead of
buf_page_optimistic_get_func, to improve debugging. Only values RW_S_LATCH and
RW_X_LATCH are allowed as LA! */
-#define buf_page_optimistic_get(LA, G, MC, MTR) buf_page_optimistic_get_func(\
- LA, G, MC, IB__FILE__, __LINE__, MTR)
+#define buf_page_optimistic_get(LA, BL, G, MC, MTR) buf_page_optimistic_get_func(\
+ LA, BL, G, MC, IB__FILE__, __LINE__, MTR)
/************************************************************************
This is the general function used to get optimistic access to a database
page. */
@@ -149,7 +161,9 @@ buf_page_optimistic_get_func(
/*=========================*/
/* out: TRUE if success */
ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */
- buf_frame_t* guess, /* in: guessed frame */
+ buf_block_t* block, /* in: guessed block */
+ buf_frame_t* guess, /* in: guessed frame; note that AWE may move
+ frames */
dulint modify_clock,/* in: modify clock value if mode is
..._GUESS_ON_CLOCK */
char* file, /* in: file name */
@@ -350,6 +364,16 @@ buf_frame_modify_clock_inc(
/* out: new value */
buf_frame_t* frame); /* in: pointer to a frame */
/************************************************************************
+Increments the modify clock of a frame by 1. The caller must (1) own the
+buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock
+on the block. */
+UNIV_INLINE
+dulint
+buf_block_modify_clock_inc(
+/*=======================*/
+ /* out: new value */
+ buf_block_t* block); /* in: block */
+/************************************************************************
Returns the value of the modify clock. The caller must have an s-lock
or x-lock on the block. */
UNIV_INLINE
@@ -441,7 +465,7 @@ UNIV_INLINE
buf_frame_t*
buf_frame_align(
/*============*/
- /* out: pointer to block */
+ /* out: pointer to frame */
byte* ptr); /* in: pointer to a frame */
/***********************************************************************
Checks if a pointer points to the block array of the buffer pool (blocks, not
@@ -525,6 +549,19 @@ buf_pool_invalidate(void);
--------------------------- LOWER LEVEL ROUTINES -------------------------
=========================================================================*/
+/************************************************************************
+Maps the page of block to a frame, if not mapped yet. Unmaps some page
+from the end of the awe_LRU_free_mapped. */
+
+void
+buf_awe_map_page_to_frame(
+/*======================*/
+ buf_block_t* block, /* in: block whose page should be
+ mapped to a frame */
+ ibool add_to_mapped_list);/* in: TRUE if, in the case
+ that we need to map the page, we
+ should also add the block to the
+ awe_LRU_free_mapped list */
#ifdef UNIV_SYNC_DEBUG
/*************************************************************************
Adds latch level info for the rw-lock protecting the buffer frame. This
@@ -590,19 +627,27 @@ buf_pool_get_nth_block(
ulint i); /* in: index of the block */
/************************************************************************
Function which inits a page for read to the buffer buf_pool. If the page is
-already in buf_pool, does nothing. Sets the io_fix flag to BUF_IO_READ and
-sets a non-recursive exclusive lock on the buffer frame. The io-handler must
-take care that the flag is cleared and the lock released later. This is one
-of the functions which perform the state transition NOT_USED => FILE_PAGE to
-a block (the other is buf_page_create). */
+(1) already in buf_pool, or
+(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
+(3) if the space is deleted or being deleted,
+then this function does nothing.
+Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
+on the buffer frame. The io-handler must take care that the flag is cleared
+and the lock released later. This is one of the functions which perform the
+state transition NOT_USED => FILE_PAGE to a block (the other is
+buf_page_create). */
buf_block_t*
buf_page_init_for_read(
/*===================*/
- /* out: pointer to the block */
- ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... */
- ulint space, /* in: space id */
- ulint offset);/* in: page number */
+ /* out: pointer to the block or NULL */
+ ulint* err, /* out: DB_SUCCESS or DB_TABLESPACE_DELETED */
+ ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... */
+ ulint space, /* in: space id */
+ ib_longlong tablespace_version,/* in: prevents reading from a wrong
+ version of the tablespace in case we have done
+ DISCARD + IMPORT */
+ ulint offset);/* in: page number */
/************************************************************************
Completes an asynchronous read or write request of a file page to or from
the buffer pool. */
@@ -659,7 +704,16 @@ struct buf_block_struct{
byte* frame; /* pointer to buffer frame which
is of size UNIV_PAGE_SIZE, and
aligned to an address divisible by
- UNIV_PAGE_SIZE */
+ UNIV_PAGE_SIZE; if AWE is used, this
+ will be NULL for the pages which are
+ currently not mapped into the virtual
+ address space window of the buffer
+ pool */
+ os_awe_t* awe_info; /* if AWE is used, then an array of
+ awe page infos for
+ UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE
+ (normally = 4) physical memory
+ pages; otherwise NULL */
ulint space; /* space id of the page */
ulint offset; /* page number within the space */
ulint lock_hash_val; /* hashed value of the page address
@@ -668,14 +722,6 @@ struct buf_block_struct{
record lock hash table */
rw_lock_t lock; /* read-write lock of the buffer
frame */
- rw_lock_t read_lock; /* rw-lock reserved when a page read
- to the frame is requested; a thread
- can wait for this rw-lock if it wants
- to wait for the read to complete;
- the usual way is to wait for lock,
- but if the thread just wants a
- bufferfix and no latch on the page,
- then it can wait for this rw-lock */
buf_block_t* hash; /* node used in chaining to the page
hash table */
ibool check_index_page_at_flush;
@@ -710,8 +756,16 @@ struct buf_block_struct{
UT_LIST_NODE_T(buf_block_t) free;
/* node of the free block list */
+ ibool in_free_list; /* TRUE if in the free list; used in
+ debugging */
UT_LIST_NODE_T(buf_block_t) LRU;
/* node of the LRU list */
+ UT_LIST_NODE_T(buf_block_t) awe_LRU_free_mapped;
+ /* in the AWE version node in the
+ list of free and LRU blocks which are
+ mapped to a frame */
+ ibool in_LRU_list; /* TRUE if the page is in the LRU list;
+ used in debugging */
ulint LRU_position; /* value which monotonically
decreases (or may stay constant if
the block is in the old blocks) toward
@@ -772,6 +826,9 @@ struct buf_block_struct{
complete, though: there may have been
hash collisions, record deletions,
etc. */
+ ulint n_pointers; /* used in debugging: the number of
+ pointers in the adaptive hash index
+ pointing to this frame */
ulint curr_n_fields; /* prefix length for hash indexing:
number of full fields */
ulint curr_n_bytes; /* number of bytes in hash indexing */
@@ -803,16 +860,36 @@ struct buf_pool_struct{
struct and control blocks, except the
read-write lock in them */
byte* frame_mem; /* pointer to the memory area which
- was allocated for the frames */
+ was allocated for the frames; in AWE
+ this is the virtual address space
+ window where we map pages stored
+ in physical memory */
byte* frame_zero; /* pointer to the first buffer frame:
this may differ from frame_mem, because
this is aligned by the frame size */
- byte* high_end; /* pointer to the end of the
- buffer pool */
+ byte* high_end; /* pointer to the end of the buffer
+ frames */
+ ulint n_frames; /* number of frames */
buf_block_t* blocks; /* array of buffer control blocks */
+ buf_block_t** blocks_of_frames;/* inverse mapping which can be used
+ to retrieve the buffer control block
+ of a frame; this is an array which
+ lists the blocks of frames in the
+ order frame_zero,
+ frame_zero + UNIV_PAGE_SIZE, ...
+ a control block is always assigned
+ for each frame, even if the frame does
+ not contain any data; note that in AWE
+ there are more control blocks than
+ buffer frames */
+ os_awe_t* awe_info; /* if AWE is used, AWE info for the
+ physical 4 kB memory pages associated
+ with buffer frames */
ulint max_size; /* number of control blocks ==
maximum pool size in pages */
- ulint curr_size; /* current pool size in pages */
+ ulint curr_size; /* current pool size in pages;
+ currently always the same as
+ max_size */
hash_table_t* page_hash; /* hash table of the file pages */
ulint n_pend_reads; /* number of pending read operations */
@@ -829,6 +906,9 @@ struct buf_pool_struct{
counted as page gets; this field
is NOT protected by the buffer
pool mutex */
+ ulint n_pages_awe_remapped; /* if AWE is enabled, the
+ number of remaps of blocks to
+ buffer frames */
ulint n_page_gets_old;/* n_page_gets when buf_print was
last time called: used to calculate
hit rate */
@@ -837,6 +917,7 @@ struct buf_pool_struct{
ulint n_pages_written_old;/* number write operations */
ulint n_pages_created_old;/* number of pages created in
the pool with no read */
+ ulint n_pages_awe_remapped_old;
/* 2. Page flushing algorithm fields */
UT_LIST_BASE_NODE_T(buf_block_t) flush_list;
@@ -869,7 +950,10 @@ struct buf_pool_struct{
/* 3. LRU replacement algorithm fields */
UT_LIST_BASE_NODE_T(buf_block_t) free;
- /* base node of the free block list */
+ /* base node of the free block list;
+ in the case of AWE, at the start are
+ always free blocks for which the
+ physical memory is mapped to a frame */
UT_LIST_BASE_NODE_T(buf_block_t) LRU;
/* base node of the LRU list */
buf_block_t* LRU_old; /* pointer to the about 3/8 oldest
@@ -881,6 +965,12 @@ struct buf_pool_struct{
see buf0lru.c for the restrictions
on this value; not defined if
LRU_old == NULL */
+ UT_LIST_BASE_NODE_T(buf_block_t) awe_LRU_free_mapped;
+ /* list of those blocks which are
+ in the LRU list or the free list, and
+ where the page is mapped to a frame;
+ thus, frames allocated, e.g., to the
+ lock table, are not in this list */
};
/* States of a control block */
diff --git a/innobase/include/buf0buf.ic b/innobase/include/buf0buf.ic
index 5a4c56b0c30..cb54785128f 100644
--- a/innobase/include/buf0buf.ic
+++ b/innobase/include/buf0buf.ic
@@ -28,7 +28,6 @@ buf_block_peek_if_too_old(
{
if (buf_pool->freed_page_clock >= block->freed_page_clock
+ 1 + (buf_pool->curr_size / 1024)) {
-
return(TRUE);
}
@@ -36,25 +35,27 @@ buf_block_peek_if_too_old(
}
/*************************************************************************
-Gets the current size of buffer buf_pool in bytes. */
+Gets the current size of buffer buf_pool in bytes. In the case of AWE, the
+size of AWE window (= the frames). */
UNIV_INLINE
ulint
buf_pool_get_curr_size(void)
/*========================*/
/* out: size in bytes */
{
- return((buf_pool->curr_size) * UNIV_PAGE_SIZE);
+ return((buf_pool->n_frames) * UNIV_PAGE_SIZE);
}
/*************************************************************************
-Gets the maximum size of buffer buf_pool in bytes. */
+Gets the maximum size of buffer buf_pool in bytes. In the case of AWE, the
+size of AWE window (= the frames). */
UNIV_INLINE
ulint
buf_pool_get_max_size(void)
/*=======================*/
/* out: size in bytes */
{
- return((buf_pool->max_size) * UNIV_PAGE_SIZE);
+ return((buf_pool->n_frames) * UNIV_PAGE_SIZE);
}
/***********************************************************************
@@ -169,7 +170,7 @@ buf_block_get_space(
ut_ad(block);
ut_ad(block >= buf_pool->blocks);
ut_ad(block < buf_pool->blocks + buf_pool->max_size);
- ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
ut_ad(block->buf_fix_count > 0);
return(block->space);
@@ -187,7 +188,7 @@ buf_block_get_page_no(
ut_ad(block);
ut_ad(block >= buf_pool->blocks);
ut_ad(block < buf_pool->blocks + buf_pool->max_size);
- ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
ut_ad(block->buf_fix_count > 0);
return(block->offset);
@@ -209,54 +210,24 @@ buf_block_align(
frame_zero = buf_pool->frame_zero;
- ut_ad((ulint)ptr >= (ulint)frame_zero);
-
- block = buf_pool_get_nth_block(buf_pool, ((ulint)(ptr - frame_zero))
- >> UNIV_PAGE_SIZE_SHIFT);
- if (block < buf_pool->blocks
- || block >= buf_pool->blocks + buf_pool->max_size) {
+ if ((ulint)ptr < (ulint)frame_zero
+ || (ulint)ptr > (ulint)(buf_pool->high_end)) {
+ ut_print_timestamp(stderr);
fprintf(stderr,
-"InnoDB: Error: trying to access a stray pointer %lx\n"
-"InnoDB: buf pool start is at %lx, number of pages %lu\n", (ulint)ptr,
- (ulint)frame_zero, buf_pool->max_size);
+" InnoDB: Error: trying to access a stray pointer %lx\n"
+"InnoDB: buf pool start is at %lx, end at %lx\n"
+"InnoDB: Probable reason is database corruption or memory\n"
+"InnoDB: corruption. If this happens in an InnoDB database recovery,\n"
+"InnoDB: you can look from section 6.1 at http://www.innodb.com/ibman.html\n"
+"InnoDB: how to force recovery.\n",
+ (long)ptr, (long)frame_zero,
+ (long)(buf_pool->high_end));
ut_error;
}
-
- return(block);
-}
-
-/***********************************************************************
-Gets the block to whose frame the pointer is pointing to. Does not
-require a file page to be bufferfixed. */
-UNIV_INLINE
-buf_block_t*
-buf_block_align_low(
-/*================*/
- /* out: pointer to block */
- byte* ptr) /* in: pointer to a frame */
-{
- buf_block_t* block;
- buf_frame_t* frame_zero;
-
- ut_ad(ptr);
-
- frame_zero = buf_pool->frame_zero;
-
- ut_ad((ulint)ptr >= (ulint)frame_zero);
-
- block = buf_pool_get_nth_block(buf_pool, ((ulint)(ptr - frame_zero))
- >> UNIV_PAGE_SIZE_SHIFT);
- if (block < buf_pool->blocks
- || block >= buf_pool->blocks + buf_pool->max_size) {
-
- fprintf(stderr,
-"InnoDB: Error: trying to access a stray pointer %lx\n"
-"InnoDB: buf pool start is at %lx, number of pages %lu\n", (ulint)ptr,
- (ulint)frame_zero, buf_pool->max_size);
- ut_error;
- }
-
+
+ block = *(buf_pool->blocks_of_frames + (((ulint)(ptr - frame_zero))
+ >> UNIV_PAGE_SIZE_SHIFT));
return(block);
}
@@ -266,7 +237,7 @@ UNIV_INLINE
buf_frame_t*
buf_frame_align(
/*============*/
- /* out: pointer to block */
+ /* out: pointer to frame */
byte* ptr) /* in: pointer to a frame */
{
buf_frame_t* frame;
@@ -275,14 +246,19 @@ buf_frame_align(
frame = ut_align_down(ptr, UNIV_PAGE_SIZE);
- if (((ulint)frame
- < (ulint)(buf_pool->frame_zero))
- || ((ulint)frame > (ulint)(buf_pool_get_nth_block(buf_pool,
- buf_pool->max_size - 1)->frame))) {
+ if (((ulint)frame < (ulint)(buf_pool->frame_zero))
+ || (ulint)frame >= (ulint)(buf_pool->high_end)) {
+
+ ut_print_timestamp(stderr);
fprintf(stderr,
-"InnoDB: Error: trying to access a stray pointer %lx\n"
-"InnoDB: buf pool start is at %lx, number of pages %lu\n", (ulint)ptr,
- (ulint)(buf_pool->frame_zero), buf_pool->max_size);
+" InnoDB: Error: trying to access a stray pointer %lx\n"
+"InnoDB: buf pool start is at %lx, end at %lx\n"
+"InnoDB: Probable reason is database corruption or memory\n"
+"InnoDB: corruption. If this happens in an InnoDB database recovery,\n"
+"InnoDB: you can look from section 6.1 at http://www.innodb.com/ibman.html\n"
+"InnoDB: how to force recovery.\n",
+ (long)ptr, (long)(buf_pool->frame_zero),
+ (long)(buf_pool->high_end));
ut_error;
}
@@ -471,8 +447,29 @@ buf_frame_modify_clock_inc(
ut_ad(frame);
- block = buf_block_align_low(frame);
+ block = buf_block_align(frame);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad((mutex_own(&(buf_pool->mutex)) && (block->buf_fix_count == 0))
+ || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE));
+#endif /*UNIV_SYNC_DEBUG */
+
+ UT_DULINT_INC(block->modify_clock);
+ return(block->modify_clock);
+}
+
+/************************************************************************
+Increments the modify clock of a block by 1. The caller must (1) own the
+buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock
+on the block. */
+UNIV_INLINE
+dulint
+buf_block_modify_clock_inc(
+/*=======================*/
+ /* out: new value */
+ buf_block_t* block) /* in: block */
+{
#ifdef UNIV_SYNC_DEBUG
ut_ad((mutex_own(&(buf_pool->mutex)) && (block->buf_fix_count == 0))
|| rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE));
@@ -515,15 +512,16 @@ void
buf_block_buf_fix_inc_debug(
/*========================*/
buf_block_t* block, /* in: block to bufferfix */
- char* file, /* in: file name */
- ulint line) /* in: line */
+ char* file __attribute__ ((unused)), /* in: file name */
+ ulint line __attribute__ ((unused))) /* in: line */
{
+#ifdef UNIV_SYNC_DEBUG
ibool ret;
-
+
ret = rw_lock_s_lock_func_nowait(&(block->debug_latch), file, line);
ut_ad(ret == TRUE);
-
+#endif
block->buf_fix_count++;
}
#else /* UNIV_SYNC_DEBUG */
@@ -562,6 +560,8 @@ buf_page_hash_get(
HASH_SEARCH(hash, buf_pool->page_hash, fold, block,
(block->space == space) && (block->offset == offset));
+ ut_a(block == NULL || block->state == BUF_BLOCK_FILE_PAGE);
+
return(block);
}
@@ -629,8 +629,8 @@ buf_page_release(
mutex_enter_fast(&(buf_pool->mutex));
- ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
- ut_ad(block->buf_fix_count > 0);
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_a(block->buf_fix_count > 0);
if (rw_latch == RW_X_LATCH && mtr->modifications) {
diff --git a/innobase/include/buf0lru.h b/innobase/include/buf0lru.h
index eb9d43d3b93..69a376f8cab 100644
--- a/innobase/include/buf0lru.h
+++ b/innobase/include/buf0lru.h
@@ -37,6 +37,16 @@ These are low-level functions
#define BUF_LRU_FREE_SEARCH_LEN (5 + 2 * BUF_READ_AHEAD_AREA)
/**********************************************************************
+Invalidates all pages belonging to a given tablespace when we are deleting
+the data file(s) of that tablespace. A PROBLEM: if readahead is being started,
+what guarantees that it will not try to read in pages after this operation has
+completed? */
+
+void
+buf_LRU_invalidate_tablespace(
+/*==========================*/
+ ulint id); /* in: space id */
+/**********************************************************************
Gets the minimum LRU_position field for the blocks in an initial segment
(determined by BUF_LRU_INITIAL_RATIO) of the LRU list. The limit is not
guaranteed to be precise, because the ulint_clock may wrap around. */
@@ -67,7 +77,9 @@ LRU list to the free list. */
buf_block_t*
buf_LRU_get_free_block(void);
/*=========================*/
- /* out: the free control block */
+ /* out: the free control block; also if AWE is
+ used, it is guaranteed that the block has its
+ page mapped to a frame when we return */
/**********************************************************************
Puts a block back to the free list. */
diff --git a/innobase/include/buf0rea.h b/innobase/include/buf0rea.h
index aed965a6b21..380a42f4b80 100644
--- a/innobase/include/buf0rea.h
+++ b/innobase/include/buf0rea.h
@@ -59,7 +59,7 @@ buf_read_ahead_linear(
must want access to this page (see NOTE 3 above) */
/************************************************************************
Issues read requests for pages which the ibuf module wants to read in, in
-order to contract insert buffer trees. Technically, this function is like
+order to contract the insert buffer tree. Technically, this function is like
a read-ahead function. */
void
@@ -68,9 +68,14 @@ buf_read_ibuf_merge_pages(
ibool sync, /* in: TRUE if the caller wants this function
to wait for the highest address page to get
read in, before this function returns */
- ulint space, /* in: space id */
- ulint* page_nos, /* in: array of page numbers to read, with
- the highest page number last in the array */
+ ulint* space_ids, /* in: array of space ids */
+ ib_longlong* space_versions,/* in: the spaces must have this version
+ number (timestamp), otherwise we discard the
+ read; we use this to cancel reads if
+ DISCARD + IMPORT may have changed the
+ tablespace size */
+ ulint* page_nos, /* in: array of page numbers to read, with the
+ highest page number the last in the array */
ulint n_stored); /* in: number of page numbers in the array */
/************************************************************************
Issues read requests for pages which recovery wants to read in. */
diff --git a/innobase/include/data0type.h b/innobase/include/data0type.h
index 4da686bf2e1..2b27ead5fac 100644
--- a/innobase/include/data0type.h
+++ b/innobase/include/data0type.h
@@ -11,6 +11,9 @@ Created 1/16/1996 Heikki Tuuri
#include "univ.i"
+extern ulint data_mysql_default_charset_coll;
+extern ulint data_mysql_latin1_swedish_charset_coll;
+
/* SQL data type struct */
typedef struct dtype_struct dtype_t;
@@ -18,31 +21,79 @@ typedef struct dtype_struct dtype_t;
data type */
extern dtype_t* dtype_binary;
-/* Data main types of SQL data */
-#define DATA_VARCHAR 1 /* character varying */
-#define DATA_CHAR 2 /* fixed length character */
+/*-------------------------------------------*/
+/* The 'MAIN TYPE' of a column */
+#define DATA_VARCHAR 1 /* character varying of the
+ latin1_swedish_ci charset-collation */
+#define DATA_CHAR 2 /* fixed length character of the
+ latin1_swedish_ci charset-collation */
#define DATA_FIXBINARY 3 /* binary string of fixed length */
#define DATA_BINARY 4 /* binary string */
-#define DATA_BLOB 5 /* binary large object, or a TEXT type; if
- prtype & DATA_NONLATIN1 != 0 the data must
- be compared by MySQL as a whole field; if
- prtype & DATA_BINARY_TYPE == 0, then this is
- actually a TEXT column */
+#define DATA_BLOB 5 /* binary large object, or a TEXT type;
+ if prtype & DATA_BINARY_TYPE == 0, then this is
+ actually a TEXT column (or a BLOB created
+ with < 4.0.14) */
#define DATA_INT 6 /* integer: can be any size 1 - 8 bytes */
#define DATA_SYS_CHILD 7 /* address of the child page in node pointer */
#define DATA_SYS 8 /* system column */
+
/* Data types >= DATA_FLOAT must be compared using the whole field, not as
binary strings */
+
#define DATA_FLOAT 9
#define DATA_DOUBLE 10
#define DATA_DECIMAL 11 /* decimal number stored as an ASCII string */
-#define DATA_VARMYSQL 12 /* non-latin1 varying length char */
-#define DATA_MYSQL 13 /* non-latin1 fixed length char */
+#define DATA_VARMYSQL 12 /* any charset varying length char */
+#define DATA_MYSQL 13 /* any charset fixed length char */
+ /* NOTE that 4.1.1 used DATA_MYSQL and
+ DATA_VARMYSQL for all character sets, and the
+ charset-collation for tables created with it
+ can also be latin1_swedish_ci */
#define DATA_MTYPE_MAX 63 /* dtype_store_for_order_and_null_size()
requires the values are <= 63 */
/*-------------------------------------------*/
-/* In the lowest byte in the precise type we store the MySQL type code
-(not applicable for system columns). */
+/* The 'PRECISE TYPE' of a column */
+/*
+Tables created by a MySQL user have the following convention:
+
+- In the least significant byte in the precise type we store the MySQL type
+code (not applicable for system columns).
+
+- In the second least significant byte we OR flags DATA_NOT_NULL,
+DATA_UNSIGNED, DATA_BINARY_TYPE.
+
+- In the third least significant byte of the precise type of string types we
+store the MySQL charset-collation code. In DATA_BLOB columns created with
+< 4.0.14 we do not actually know if it is a BLOB or a TEXT column. Since there
+are no indexes on prefixes of BLOB or TEXT columns in < 4.0.14, this is no
+problem, though.
+
+Note that versions < 4.1.2 or < 5.0.1 did not store the charset code to the
+precise type, since the charset was always the default charset of the MySQL
+installation. If the stored charset code is 0 in the system table SYS_COLUMNS
+of InnoDB, that means that the default charset of this MySQL installation
+should be used.
+
+When loading a table definition from the system tables to the InnoDB data
+dictionary cache in main memory, InnoDB versions >= 4.1.2 and >= 5.0.1 check
+if the stored charset-collation is 0, and if that is the case and the type is
+a non-binary string, replace that 0 by the default charset-collation code of
+this MySQL installation. In short, in old tables, the charset-collation code
+in the system tables on disk can be 0, but in in-memory data structures
+(dtype_t), the charset-collation code is always != 0 for non-binary string
+types.
+
+In new tables, in binary string types, the charset-collation code is the
+MySQL code for the 'binary charset', that is, != 0.
+
+For binary string types and for DATA_CHAR, DATA_VARCHAR, and for those
+DATA_BLOB which are binary or have the charset-collation latin1_swedish_ci,
+InnoDB performs all comparisons internally, without resorting to the MySQL
+comparison functions. This is to save CPU time.
+
+InnoDB's own internal system tables have different precise types for their
+columns, and for them the precise type is usually not used at all.
+*/
#define DATA_ENGLISH 4 /* English language character string: this
is a relic from pre-MySQL time and only used
@@ -69,7 +120,7 @@ be less than 256 */
#define DATA_MIX_ID_LEN 9 /* maximum stored length for mix id (in a
compressed dulint form) */
#define DATA_N_SYS_COLS 4 /* number of system columns defined above */
-/*-------------------------------------------*/
+
/* Flags ORed to the precise data type */
#define DATA_NOT_NULL 256 /* this is ORed to the precise type when
the column is declared as NOT NULL */
@@ -79,18 +130,53 @@ be less than 256 */
string, this is ORed to the precise type:
this only holds for tables created with
>= MySQL-4.0.14 */
-#define DATA_NONLATIN1 2048 /* if the data type is a DATA_BLOB (actually
- TEXT) of a non-latin1 type, this is ORed to
- the precise type: this only holds for tables
- created with >= MySQL-4.0.14 */
+/* #define DATA_NONLATIN1 2048 This is a relic from < 4.1.2 and < 5.0.1.
+ In earlier versions this was set for some
+ BLOB columns.
+*/
/*-------------------------------------------*/
/* This many bytes we need to store the type information affecting the
alphabetical order for a single field and decide the storage size of an
SQL null*/
-#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4
+#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4
+/* In the >= 4.1.x storage format we add 2 bytes more so that we can also
+store the charset-collation number; one byte is left unused, though */
+#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6
/*************************************************************************
+Checks if a data main type is a string type. Also a BLOB is considered a
+string type. */
+
+ibool
+dtype_is_string_type(
+/*=================*/
+ /* out: TRUE if string type */
+ ulint mtype); /* in: InnoDB main data type code: DATA_CHAR, ... */
+/*************************************************************************
+Checks if a type is a binary string type. Note that for tables created with
+< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. For
+those DATA_BLOB columns this function currently returns FALSE. */
+
+ibool
+dtype_is_binary_string_type(
+/*========================*/
+ /* out: TRUE if binary string type */
+ ulint mtype, /* in: main data type */
+ ulint prtype);/* in: precise type */
+/*************************************************************************
+Checks if a type is a non-binary string type. That is, dtype_is_string_type is
+TRUE and dtype_is_binary_string_type is FALSE. Note that for tables created
+with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column.
+For those DATA_BLOB columns this function currently returns TRUE. */
+
+ibool
+dtype_is_non_binary_string_type(
+/*============================*/
+ /* out: TRUE if non-binary string type */
+ ulint mtype, /* in: main data type */
+ ulint prtype);/* in: precise type */
+/*************************************************************************
Sets a data type structure. */
UNIV_INLINE
void
@@ -124,6 +210,23 @@ dtype_get_prtype(
/*=============*/
dtype_t* type);
/*************************************************************************
+Gets the MySQL charset-collation code for MySQL string types. */
+UNIV_INLINE
+ulint
+dtype_get_charset_coll(
+/*===================*/
+ ulint prtype);/* in: precise data type */
+/*************************************************************************
+Forms a precise type from the < 4.1.2 format precise type plus the
+charset-collation code. */
+
+ulint
+dtype_form_prtype(
+/*==============*/
+ ulint old_prtype, /* in: the MySQL type code and the flags
+ DATA_BINARY_TYPE etc. */
+ ulint charset_coll); /* in: MySQL charset-collation code */
+/*************************************************************************
Gets the type length. */
UNIV_INLINE
ulint
@@ -172,24 +275,36 @@ dtype_is_fixed_size(
/* out: TRUE if fixed size */
dtype_t* type); /* in: type */
/**************************************************************************
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+ dtype_t* type, /* in: type struct */
+ byte* buf); /* in: buffer for the stored order info */
+/**************************************************************************
Stores for a type the information which determines its alphabetical ordering
-and the storage size of an SQL NULL value. */
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
UNIV_INLINE
void
-dtype_store_for_order_and_null_size(
-/*================================*/
- byte* buf, /* in: buffer for DATA_ORDER_NULL_TYPE_BUF_SIZE
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+ byte* buf, /* in: buffer for
+ DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
bytes where we store the info */
dtype_t* type); /* in: type struct */
/**************************************************************************
Reads to a type the stored information which determines its alphabetical
-ordering and the storage size of an SQL NULL value. */
+ordering and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
UNIV_INLINE
void
-dtype_read_for_order_and_null_size(
-/*===============================*/
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
dtype_t* type, /* in: type struct */
- byte* buf); /* in: buffer for the stored order info */
+ byte* buf); /* in: buffer for stored type order info */
/*************************************************************************
Validates a data type structure. */
@@ -212,7 +327,7 @@ struct dtype_struct{
ulint mtype; /* main data type */
ulint prtype; /* precise type; MySQL data type */
- /* remaining two fields do not affect alphabetical ordering: */
+ /* the remaining two fields do not affect alphabetical ordering: */
ulint len; /* length */
ulint prec; /* precision */
diff --git a/innobase/include/data0type.ic b/innobase/include/data0type.ic
index dbc5b6615f6..946b646ffbf 100644
--- a/innobase/include/data0type.ic
+++ b/innobase/include/data0type.ic
@@ -72,6 +72,17 @@ dtype_get_prtype(
}
/*************************************************************************
+Gets the MySQL charset-collation code for MySQL string types. */
+UNIV_INLINE
+ulint
+dtype_get_charset_coll(
+/*===================*/
+ ulint prtype) /* in: precise data type */
+{
+ return((prtype >> 16) & 0xFFUL);
+}
+
+/*************************************************************************
Gets the type length. */
UNIV_INLINE
ulint
@@ -127,35 +138,44 @@ dtype_get_pad_char(
/**************************************************************************
Stores for a type the information which determines its alphabetical ordering
-and the storage size of an SQL NULL value. */
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
UNIV_INLINE
void
-dtype_store_for_order_and_null_size(
-/*================================*/
- byte* buf, /* in: buffer for DATA_ORDER_NULL_TYPE_BUF_SIZE
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+ byte* buf, /* in: buffer for
+ DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
bytes where we store the info */
dtype_t* type) /* in: type struct */
{
- ut_ad(4 == DATA_ORDER_NULL_TYPE_BUF_SIZE);
+ ut_ad(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
- buf[0] = (byte)(type->mtype & 0xFF);
+ buf[0] = (byte)(type->mtype & 0xFFUL);
if (type->prtype & DATA_BINARY_TYPE) {
buf[0] = buf[0] | 128;
}
- if (type->prtype & DATA_NONLATIN1) {
- buf[0] = buf[0] | 64;
- }
+ /* In versions < 4.1.2 we had: if (type->prtype & DATA_NONLATIN1) {
+ buf[0] = buf[0] | 64;
+ }
+ */
+
+ buf[1] = (byte)(type->prtype & 0xFFUL);
- buf[1] = (byte)(type->prtype & 0xFF);
+ mach_write_to_2(buf + 2, type->len & 0xFFFFUL);
- mach_write_to_2(buf + 2, type->len & 0xFFFF);
+ mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype));
+
+ /* Note that the second last byte is left unused, because the
+ charset-collation code is always < 256 */
}
/**************************************************************************
Reads to a type the stored information which determines its alphabetical
-ordering and the storage size of an SQL NULL value. */
+ordering and the storage size of an SQL NULL value. This is the < 4.1.x
+storage format. */
UNIV_INLINE
void
dtype_read_for_order_and_null_size(
@@ -172,12 +192,56 @@ dtype_read_for_order_and_null_size(
type->prtype = type->prtype | DATA_BINARY_TYPE;
}
- if (buf[0] & 64) {
- type->prtype = type->prtype | DATA_NONLATIN1;
+ type->len = mach_read_from_2(buf + 2);
+
+ type->prtype = dtype_form_prtype(type->prtype,
+ data_mysql_default_charset_coll);
+}
+
+/**************************************************************************
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the >= 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+ dtype_t* type, /* in: type struct */
+ byte* buf) /* in: buffer for stored type order info */
+{
+ ulint charset_coll;
+
+ ut_ad(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ type->mtype = buf[0] & 63;
+ type->prtype = buf[1];
+
+ if (buf[0] & 128) {
+ type->prtype = type->prtype | DATA_BINARY_TYPE;
}
type->len = mach_read_from_2(buf + 2);
-}
+
+ mach_read_from_2(buf + 4);
+
+ charset_coll = mach_read_from_2(buf + 4);
+
+ if (dtype_is_string_type(type->mtype)) {
+ ut_a(charset_coll < 256);
+
+ if (charset_coll == 0) {
+ /* This insert buffer record was inserted with MySQL
+ version < 4.1.2, and the charset-collation code was not
+ explicitly stored to dtype->prtype at that time. It
+ must be the default charset-collation of this MySQL
+ installation. */
+
+ charset_coll = data_mysql_default_charset_coll;
+ }
+
+ type->prtype = dtype_form_prtype(type->prtype, charset_coll);
+ }
+}
/***************************************************************************
Returns the size of a fixed size data type, 0 if not a fixed size type. */
diff --git a/innobase/include/db0err.h b/innobase/include/db0err.h
index 854b9794c00..be7667bfd0c 100644
--- a/innobase/include/db0err.h
+++ b/innobase/include/db0err.h
@@ -48,6 +48,11 @@ Created 5/24/1996 Heikki Tuuri
from a table failed */
#define DB_NO_SAVEPOINT 42 /* no savepoint exists with the given
name */
+#define DB_TABLESPACE_ALREADY_EXISTS 43 /* we cannot create a new single-table
+ tablespace because a file of the same
+ name already exists */
+#define DB_TABLESPACE_DELETED 44 /* tablespace does not exist or is
+ being dropped right now */
/* The following are partial failure codes */
#define DB_FAIL 1000
diff --git a/innobase/include/dict0boot.h b/innobase/include/dict0boot.h
index cb631be7e35..35eff5af29a 100644
--- a/innobase/include/dict0boot.h
+++ b/innobase/include/dict0boot.h
@@ -93,7 +93,7 @@ dict_create(void);
indexes; ibuf tables and indexes are
assigned as the id the number
DICT_IBUF_ID_MIN plus the space id */
-#define DICT_IBUF_ID_MIN ut_dulint_create(0xFFFFFFFF, 0)
+#define DICT_IBUF_ID_MIN ut_dulint_create(0xFFFFFFFFUL, 0)
/* The offset of the dictionary header on the page */
#define DICT_HDR FSEG_PAGE_DATA
diff --git a/innobase/include/dict0dict.h b/innobase/include/dict0dict.h
index be5d3b5b465..534c9e380b8 100644
--- a/innobase/include/dict0dict.h
+++ b/innobase/include/dict0dict.h
@@ -59,6 +59,16 @@ Inits the data dictionary module. */
void
dict_init(void);
/*===========*/
+/************************************************************************
+Gets the space id of every table of the data dictionary and makes a linear
+list and a hash table of them to the data dictionary cache. This function
+can be called at database startup if we did not need to do a crash recovery.
+In crash recovery we must scan the space id's from the .ibd files in MySQL
+database directories. */
+
+void
+dict_load_space_id_list(void);
+/*=========================*/
/**************************************************************************
Returns a stored procedure object and memoryfixes it. */
UNIV_INLINE
@@ -195,6 +205,15 @@ dict_table_rename_in_cache(
to preserve the original table name
in constraints which reference it */
/**************************************************************************
+Change the id of a table object in the dictionary cache. This is used in
+DISCARD TABLESPACE. */
+
+void
+dict_table_change_id_in_cache(
+/*==========================*/
+ dict_table_t* table, /* in: table object already in cache */
+ dulint new_id);/* in: new id to set */
+/**************************************************************************
Adds a foreign key constraint object to the dictionary cache. May free
the object if there already is an object with the same identifier in.
At least one of foreign table or referenced table must already be in
@@ -744,7 +763,8 @@ dict_tree_build_node_ptr(
/*=====================*/
/* out, own: node pointer */
dict_tree_t* tree, /* in: index tree */
- rec_t* rec, /* in: record for which to build node pointer */
+ rec_t* rec, /* in: record for which to build node
+ pointer */
ulint page_no,/* in: page number to put in node pointer */
mem_heap_t* heap, /* in: memory heap where pointer created */
ulint level); /* in: level of rec in tree: 0 means leaf
@@ -912,7 +932,7 @@ struct dict_sys_struct{
dict_table_t* sys_columns; /* SYS_COLUMNS table */
dict_table_t* sys_indexes; /* SYS_INDEXES table */
dict_table_t* sys_fields; /* SYS_FIELDS table */
-};
+};
#ifndef UNIV_NONINL
#include "dict0dict.ic"
diff --git a/innobase/include/dict0load.h b/innobase/include/dict0load.h
index b60996a8dab..f7168a0f45f 100644
--- a/innobase/include/dict0load.h
+++ b/innobase/include/dict0load.h
@@ -15,6 +15,17 @@ Created 4/24/1996 Heikki Tuuri
#include "ut0byte.h"
/************************************************************************
+In a crash recovery we already have all the tablespace objects created.
+This function compares the space id information in the InnoDB data dictionary
+to what we already read with fil_load_single_table_tablespaces().
+In a normal startup we just scan the biggest space id, and store it to
+fil_system. */
+
+void
+dict_check_tablespaces_or_store_max_id(
+/*===================================*/
+ ibool in_crash_recovery); /* in: are we doing a crash recovery */
+/************************************************************************
Finds the first table name in the given database. */
char*
@@ -32,7 +43,10 @@ a foreign key references columns in this table. */
dict_table_t*
dict_load_table(
/*============*/
- /* out: table, NULL if does not exist */
+ /* out: table, NULL if does not exist; if the table is
+ stored in an .ibd file, but the file does not exist,
+ then we set the ibd_file_missing flag TRUE in the table
+ object we return */
char* name); /* in: table name */
/***************************************************************************
Loads a table object based on the table id. */
diff --git a/innobase/include/dict0mem.h b/innobase/include/dict0mem.h
index 1930825f601..23753df4079 100644
--- a/innobase/include/dict0mem.h
+++ b/innobase/include/dict0mem.h
@@ -310,6 +310,13 @@ struct dict_table_struct{
char* name; /* table name */
ulint space; /* space where the clustered index of the
table is placed */
+ ibool ibd_file_missing;/* TRUE if this is in a single-table
+ tablespace and the .ibd file is missing; then
+ we must return in ha_innodb.cc an error if the
+ user tries to query such an orphaned table */
+ ibool tablespace_discarded;/* this flag is set TRUE when the
+ user calls DISCARD TABLESPACE on this table,
+ and reset to FALSE in IMPORT TABLESPACE */
hash_node_t name_hash; /* hash chain node */
hash_node_t id_hash; /* hash chain node */
ulint n_def; /* number of columns defined so far */
diff --git a/innobase/include/dyn0dyn.ic b/innobase/include/dyn0dyn.ic
index 787615cae09..b6c4808398b 100644
--- a/innobase/include/dyn0dyn.ic
+++ b/innobase/include/dyn0dyn.ic
@@ -7,7 +7,7 @@ Created 2/5/1996 Heikki Tuuri
*******************************************************/
#define DYN_BLOCK_MAGIC_N 375767
-#define DYN_BLOCK_FULL_FLAG 0x1000000
+#define DYN_BLOCK_FULL_FLAG 0x1000000UL
/****************************************************************
Adds a new block to a dyn array. */
diff --git a/innobase/include/fil0fil.h b/innobase/include/fil0fil.h
index ad3149f0b36..310336af38e 100644
--- a/innobase/include/fil0fil.h
+++ b/innobase/include/fil0fil.h
@@ -16,6 +16,14 @@ Created 10/25/1995 Heikki Tuuri
#include "ut0byte.h"
#include "os0file.h"
+/* When mysqld is run, the default directory "." is the mysqld datadir, but in
+ibbackup we must set it explicitly; the path must NOT contain the trailing
+'/' or '\' */
+extern char* fil_path_to_mysql_datadir;
+
+/* Initial size of a single-table tablespace in pages */
+#define FIL_IBD_FILE_INITIAL_SIZE 4
+
/* 'null' (undefined) page offset in the context of file spaces */
#define FIL_NULL ULINT32_UNDEFINED
@@ -60,10 +68,8 @@ extern fil_addr_t fil_addr_null;
first page in a data file: the file
has been flushed to disk at least up
to this lsn */
-#define FIL_PAGE_ARCH_LOG_NO 34 /* this is only defined for the
- first page in a data file: the latest
- archived log file number when the
- flush lsn above was written */
+#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 /* starting from 4.1.x this
+ contains the space id of the page */
#define FIL_PAGE_DATA 38 /* start of the data on the page */
/* File page trailer */
@@ -86,50 +92,51 @@ extern fil_addr_t fil_addr_null;
extern ulint fil_n_pending_log_flushes;
extern ulint fil_n_pending_tablespace_flushes;
+
/***********************************************************************
-Reserves a right to open a single file. The right must be released with
-fil_release_right_to_open. */
+Returns the version number of a tablespace, -1 if not found. */
-void
-fil_reserve_right_to_open(void);
-/*===========================*/
+ib_longlong
+fil_space_get_version(
+/*==================*/
+ /* out: version number, -1 if the tablespace does not
+ exist in the memory cache */
+ ulint id); /* in: space id */
/***********************************************************************
-Releases a right to open a single file. */
+Returns the latch of a file space. */
-void
-fil_release_right_to_open(void);
-/*===========================*/
-/************************************************************************
-Returns TRUE if file address is undefined. */
-ibool
-fil_addr_is_null(
-/*=============*/
- /* out: TRUE if undefined */
- fil_addr_t addr); /* in: address */
-/********************************************************************
-Initializes the file system of this module. */
+rw_lock_t*
+fil_space_get_latch(
+/*================*/
+ /* out: latch protecting storage allocation */
+ ulint id); /* in: space id */
+/***********************************************************************
+Returns the type of a file space. */
-void
-fil_init(
-/*=====*/
- ulint max_n_open); /* in: max number of open files */
-/********************************************************************
-Initializes the ibuf indexes at a database start. This can be called
-after the file space headers have been created and the dictionary system
-has been initialized. */
+ulint
+fil_space_get_type(
+/*===============*/
+ /* out: FIL_TABLESPACE or FIL_LOG */
+ ulint id); /* in: space id */
+/***********************************************************************
+Returns the ibuf data of a file space. */
-void
-fil_ibuf_init_at_db_start(void);
-/*===========================*/
+ibuf_data_t*
+fil_space_get_ibuf_data(
+/*====================*/
+ /* out: ibuf data for this space */
+ ulint id); /* in: space id */
/***********************************************************************
-Creates a space object and puts it to the file system. */
+Appends a new file to the chain of files of a space. File must be closed. */
void
-fil_space_create(
-/*=============*/
- char* name, /* in: space name */
- ulint id, /* in: space id */
- ulint purpose);/* in: FIL_TABLESPACE, or FIL_LOG if log */
+fil_node_create(
+/*============*/
+ char* name, /* in: file name (file must be closed) */
+ ulint size, /* in: file size in database blocks, rounded downwards
+ to an integer */
+ ulint id, /* in: space id where to append */
+ ibool is_raw);/* in: TRUE if a raw device or a raw disk partition */
/********************************************************************
Drops files from the start of a file space, so that its size is cut by
the amount given. */
@@ -141,48 +148,88 @@ fil_space_truncate_start(
ulint trunc_len); /* in: truncate by this much; it is an error
if this does not equal to the combined size of
some initial files in the space */
-/**************************************************************************
-Tries to extend a data file by the number of pages given. Any fractions of a
-megabyte are ignored. */
+/***********************************************************************
+Creates a space memory object and puts it to the 'fil system' hash table. If
+there is an error, prints an error message to the .err log. */
ibool
-fil_extend_last_data_file(
-/*======================*/
- /* out: TRUE if success, also if we run
- out of disk space we may return TRUE */
- ulint* actual_increase,/* out: number of pages we were able to
- extend, here the orginal size of the file and
- the resulting size of the file are rounded
- downwards to a full megabyte, and the
- difference expressed in pages is returned */
- ulint size_increase); /* in: try to extend this many pages */
+fil_space_create(
+/*=============*/
+ /* out: TRUE if success */
+ char* name, /* in: space name */
+ ulint id, /* in: space id */
+ ulint purpose);/* in: FIL_TABLESPACE, or FIL_LOG if log */
/***********************************************************************
-Frees a space object from a file system. Closes the files in the chain
-but does not delete them. */
+Frees a space object from the tablespace memory cache. Closes the files in
+the chain but does not delete them. */
-void
+ibool
fil_space_free(
/*===========*/
+ /* out: TRUE if success */
ulint id); /* in: space id */
/***********************************************************************
-Returns the latch of a file space. */
-
-rw_lock_t*
-fil_space_get_latch(
-/*================*/
- /* out: latch protecting storage allocation */
- ulint id); /* in: space id */
-/***********************************************************************
-Returns the type of a file space. */
+Returns the size of the space in pages. The tablespace must be cached in the
+memory cache. */
ulint
-fil_space_get_type(
+fil_space_get_size(
/*===============*/
- /* out: FIL_TABLESPACE or FIL_LOG */
+ /* out: space size, 0 if space not found */
ulint id); /* in: space id */
+/***********************************************************************
+Checks if the pair space, page_no refers to an existing page in a tablespace
+file space. The tablespace must be cached in the memory cache. */
+
+ibool
+fil_check_adress_in_tablespace(
+/*===========================*/
+ /* out: TRUE if the address is meaningful */
+ ulint id, /* in: space id */
+ ulint page_no);/* in: page number */
+/********************************************************************
+Initializes the tablespace memory cache. */
+
+void
+fil_init(
+/*=====*/
+ ulint max_n_open); /* in: max number of open files */
+/***********************************************************************
+Opens all log files and system tablespace data files. They stay open until the
+database server shutdown. This should be called at a server startup after the
+space objects for the log and the system tablespace have been created. The
+purpose of this operation is to make sure we never run out of file descriptors
+if we need to read from the insert buffer or to write to the log. */
+
+void
+fil_open_log_and_system_tablespace_files(void);
+/*==========================================*/
+/***********************************************************************
+Closes all open files. There must not be any pending i/o's or not flushed
+modifications in the files. */
+
+void
+fil_close_all_files(void);
+/*=====================*/
+/***********************************************************************
+Sets the max tablespace id counter if the given number is bigger than the
+previous value. */
+
+void
+fil_set_max_space_id_if_bigger(
+/*===========================*/
+ ulint max_id);/* in: maximum known id */
+/********************************************************************
+Initializes the ibuf data structure for space 0 == the system tablespace.
+This can be called after the file space headers have been created and the
+dictionary system has been initialized. */
+
+void
+fil_ibuf_init_at_db_start(void);
+/*===========================*/
/********************************************************************
Writes the flushed lsn and the latest archived log number to the page
-header of the first page of each data file. */
+header of the first page of each data file in the system tablespace. */
ulint
fil_write_flushed_lsn_to_data_files(
@@ -205,48 +252,266 @@ fil_read_flushed_lsn_and_arch_log_no(
dulint* max_flushed_lsn, /* in/out: */
ulint* max_arch_log_no); /* in/out: */
/***********************************************************************
-Returns the ibuf data of a file space. */
+Increments the count of pending insert buffer page merges, if space is not
+being deleted. */
-ibuf_data_t*
-fil_space_get_ibuf_data(
-/*====================*/
- /* out: ibuf data for this space */
+ibool
+fil_inc_pending_ibuf_merges(
+/*========================*/
+ /* out: TRUE if being deleted, and ibuf merges should
+ be skipped */
+ ulint id); /* in: space id */
+/***********************************************************************
+Decrements the count of pending insert buffer page merges. */
+
+void
+fil_decr_pending_ibuf_merges(
+/*========================*/
+ ulint id); /* in: space id */
+/***********************************************************************
+Parses the body of a log record written about an .ibd file operation. That is,
+the log record part after the standard (type, space id, page no) header of the
+log record.
+
+If desired, also replays the delete or rename operation if the .ibd file
+exists and the space id in it matches. Replays the create operation if a file
+at that path does not exist yet. If the database directory for the file to be
+created does not exist, then we create the directory, too.
+
+Note that ibbackup --apply-log sets fil_path_to_mysql_datadir to point to the
+datadir that we should use in replaying the file operations. */
+
+byte*
+fil_op_log_parse_or_replay(
+/*=======================*/
+ /* out: end of log record, or NULL if the
+ record was not completely contained between
+ ptr and end_ptr */
+ byte* ptr, /* in: buffer containing the log record body,
+ or an initial segment of it, if the record does
+ not fit completely between ptr and end_ptr */
+ byte* end_ptr, /* in: buffer end */
+ ulint type, /* in: the type of this log record */
+ ibool do_replay, /* in: TRUE if we want to replay the
+ operation, and not just parse the log record */
+ ulint space_id); /* in: if do_replay is TRUE, the space id of
+ the tablespace in question; otherwise
+ ignored */
+/***********************************************************************
+Deletes a single-table tablespace. The tablespace must be cached in the
+memory cache. */
+
+ibool
+fil_delete_tablespace(
+/*==================*/
+ /* out: TRUE if success */
+ ulint id); /* in: space id */
+/***********************************************************************
+Discards a single-table tablespace. The tablespace must be cached in the
+memory cache. Discarding is like deleting a tablespace, but
+1) we do not drop the table from the data dictionary;
+2) we remove all insert buffer entries for the tablespace immediately; in DROP
+TABLE they are only removed gradually in the background;
+3) when the user does IMPORT TABLESPACE, the tablespace will have the same id
+as it originally had. */
+
+ibool
+fil_discard_tablespace(
+/*===================*/
+ /* out: TRUE if success */
ulint id); /* in: space id */
/***********************************************************************
-Returns the size of the space in pages. */
+Renames a single-table tablespace. The tablespace must be cached in the
+tablespace memory cache. */
+
+ibool
+fil_rename_tablespace(
+/*==================*/
+ /* out: TRUE if success */
+ char* old_name, /* in: old table name in the standard
+ databasename/tablename format of InnoDB, or
+ NULL if we do the rename based on the space
+ id only */
+ ulint id, /* in: space id */
+ char* new_name); /* in: new table name in the standard
+ databasename/tablename format of InnoDB */
+/***********************************************************************
+Creates a new single-table tablespace to a database directory of MySQL.
+Database directories are under the 'datadir' of MySQL. The datadir is the
+directory of a running mysqld program. We can refer to it by simply the
+path '.'. */
ulint
-fil_space_get_size(
-/*===============*/
- /* out: space size */
+fil_create_new_single_table_tablespace(
+/*===================================*/
+ /* out: DB_SUCCESS or error code */
+ ulint* space_id, /* in/out: space id; if this is != 0, then
+ this is an input parameter, otherwise
+ output */
+ char* tablename, /* in: the table name in the usual
+ databasename/tablename format of InnoDB */
+ ulint size); /* in: the initial size of the tablespace file
+ in pages, must be > 0 */
+/************************************************************************
+Tries to open a single-table tablespace and checks the space id is right in
+it. If it does not succeed, prints an error message to the .err log. This
+function is used to open the tablespace when we load a table definition
+to the dictionary cache. NOTE that we assume this operation is used under the
+protection of the dictionary mutex, so that two users cannot race here. */
+
+ibool
+fil_open_single_table_tablespace(
+/*=============================*/
+ /* out: TRUE if success */
+ ulint id, /* in: space id */
+ char* name); /* in: table name in the databasename/tablename
+ format */
+/************************************************************************
+It is possible, though very improbable, that the lsn's in the tablespace to be
+imported have risen above the current system lsn, if a lengthy purge, ibuf
+merge, or rollback was performed on a backup taken with ibbackup. If that is
+the case, reset page lsn's in the file. We assume that mysqld was shut down
+after it performed these cleanup operations on the .ibd file, so that it at
+the shutdown stamped the latest lsn to the FIL_PAGE_FILE_FLUSH_LSN in the
+first page of the .ibd file, and we can determine whether we need to reset the
+lsn's just by looking at that flush lsn. */
+
+ibool
+fil_reset_too_high_lsns(
+/*====================*/
+ /* out: TRUE if success */
+ char* name, /* in: table name in the databasename/tablename
+ format */
+ dulint current_lsn); /* in: reset lsn's if the lsn stamped to
+ FIL_PAGE_FILE_FLUSH_LSN in the first page is
+ too high */
+/************************************************************************
+At the server startup, if we need crash recovery, scans the database
+directories under the MySQL datadir, looking for .ibd files. Those files are
+single-table tablespaces. We need to know the space id in each of them so that
+we know into which file we should look to check the contents of a page stored
+in the doublewrite buffer, also to know where to apply log records where the
+space id is != 0. */
+
+ulint
+fil_load_single_table_tablespaces(void);
+/*===================================*/
+ /* out: DB_SUCCESS or error number */
+/************************************************************************
+If we need crash recovery, and we have called
+fil_load_single_table_tablespaces() and dict_load_single_table_tablespaces(),
+we can call this function to print an error message of orphaned .ibd files
+for which there is not a data dictionary entry with a matching table name
+and space id. */
+
+void
+fil_print_orphaned_tablespaces(void);
+/*================================*/
+/***********************************************************************
+Returns TRUE if a single-table tablespace does not exist in the memory cache,
+or is being deleted there. */
+
+ibool
+fil_tablespace_deleted_or_being_deleted_in_mem(
+/*===========================================*/
+ /* out: TRUE if does not exist or is being
+ deleted */
+ ulint id, /* in: space id */
+ ib_longlong version);/* in: tablespace_version should be this; if
+ you pass -1 as the value of this, then this
+ parameter is ignored */
+/***********************************************************************
+Returns TRUE if a single-table tablespace exists in the memory cache. */
+
+ibool
+fil_tablespace_exists_in_mem(
+/*=========================*/
+ /* out: TRUE if exists */
ulint id); /* in: space id */
/***********************************************************************
-Checks if the pair space, page_no refers to an existing page in a
-tablespace file space. */
+Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory
+cache. Note that if we have not done a crash recovery at the database startup,
+there may be many tablespaces which are not yet in the memory cache. */
ibool
-fil_check_adress_in_tablespace(
+fil_space_for_table_exists_in_mem(
+/*==============================*/
+ /* out: TRUE if a matching tablespace
+ exists in the memory cache */
+ ulint id, /* in: space id */
+ char* name, /* in: table name in the standard
+ 'databasename/tablename' format */
+ ibool mark_space, /* in: in crash recovery, at database startup
+ we mark all spaces which have an associated
+ table in the InnoDB data dictionary, so that
+ we can print a warning about orphaned
+ tablespaces */
+ ibool print_error_if_does_not_exist);
+ /* in: print detailed error information to
+ the .err log if a matching tablespace is
+ not found from memory */
+/**************************************************************************
+Tries to extend a data file so that it would accommodate the number of pages
+given. The tablespace must be cached in the memory cache. If the space is big
+enough already, does nothing. */
+
+ibool
+fil_extend_space_to_desired_size(
+/*=============================*/
+ /* out: TRUE if success */
+ ulint* actual_size, /* out: size of the space after extension;
+ if we ran out of disk space this may be lower
+ than the desired size */
+ ulint space_id, /* in: space id, must be != 0 */
+ ulint size_after_extend);/* in: desired size in pages after the
+ extension; if the current space size is bigger
+ than this already, the function does nothing */
+#ifdef UNIV_HOTBACKUP
+/************************************************************************
+Extends all tablespaces to the size stored in the space header. During the
+ibbackup --apply-log phase we extended the spaces on-demand so that log records
+could be applied, but that may have left spaces still too small compared to
+the size stored in the space header. */
+
+void
+fil_extend_tablespaces_to_stored_len(void);
+/*======================================*/
+#endif
+/***********************************************************************
+Tries to reserve free extents in a file space. */
+
+ibool
+fil_space_reserve_free_extents(
/*===========================*/
- /* out: TRUE if the address is meaningful */
- ulint id, /* in: space id */
- ulint page_no);/* in: page number */
+ /* out: TRUE if succeed */
+ ulint id, /* in: space id */
+ ulint n_free_now, /* in: number of free extents now */
+ ulint n_to_reserve); /* in: how many one wants to reserve */
/***********************************************************************
-Appends a new file to the chain of files of a space.
-File must be closed. */
+Releases free extents in a file space. */
void
-fil_node_create(
-/*============*/
- char* name, /* in: file name (file must be closed) */
- ulint size, /* in: file size in database blocks, rounded downwards
- to an integer */
- ulint id); /* in: space id where to append */
+fil_space_release_free_extents(
+/*===========================*/
+ ulint id, /* in: space id */
+ ulint n_reserved); /* in: how many one reserved */
+/***********************************************************************
+Gets the number of reserved extents. If the database is silent, this number
+should be zero. */
+
+ulint
+fil_space_get_n_reserved_extents(
+/*=============================*/
+ ulint id); /* in: space id */
/************************************************************************
Reads or writes data. This operation is asynchronous (aio). */
-void
+ulint
fil_io(
/*===*/
+ /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED
+ if we are trying to do i/o on a tablespace
+ which does not exist */
ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE,
ORed to OS_FILE_LOG, if a log i/o
and ORed to OS_AIO_SIMULATED_WAKE_LATER
@@ -262,9 +527,9 @@ fil_io(
ulint byte_offset, /* in: remainder of offset in bytes; in
aio this must be divisible by the OS block
size */
- ulint len, /* in: how many bytes to read; this must
- not cross a file boundary; in aio this must
- be a block size multiple */
+ ulint len, /* in: how many bytes to read or write; this
+ must not cross a file boundary; in aio this
+ must be a block size multiple */
void* buf, /* in/out: buffer where to store read data
or from where to write; in aio this must be
appropriately aligned */
@@ -272,12 +537,15 @@ fil_io(
aio used, else ignored */
/************************************************************************
Reads data from a space to a buffer. Remember that the possible incomplete
-blocks at the end of a file are ignored: they are not taken into account when
+blocks at the end of file are ignored: they are not taken into account when
calculating the byte offset within a space. */
-void
+ulint
fil_read(
/*=====*/
+ /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED
+ if we are trying to do i/o on a tablespace
+ which does not exist */
ibool sync, /* in: TRUE if synchronous aio is desired */
ulint space_id, /* in: space id */
ulint block_offset, /* in: offset in number of blocks */
@@ -292,12 +560,15 @@ fil_read(
aio used, else ignored */
/************************************************************************
Writes data to a space from a buffer. Remember that the possible incomplete
-blocks at the end of a file are ignored: they are not taken into account when
+blocks at the end of file are ignored: they are not taken into account when
calculating the byte offset within a space. */
-void
+ulint
fil_write(
/*======*/
+ /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED
+ if we are trying to do i/o on a tablespace
+ which does not exist */
ibool sync, /* in: TRUE if synchronous aio is desired */
ulint space_id, /* in: space id */
ulint block_offset, /* in: offset in number of blocks */
@@ -322,7 +593,8 @@ fil_aio_wait(
ulint segment); /* in: the number of the segment in the aio
array to wait for */
/**************************************************************************
-Flushes to disk possible writes cached by the OS. */
+Flushes to disk possible writes cached by the OS. If the space does not exist
+or is being dropped, does not do anything. */
void
fil_flush(
@@ -338,13 +610,21 @@ fil_flush_file_spaces(
/*==================*/
ulint purpose); /* in: FIL_TABLESPACE, FIL_LOG */
/**********************************************************************
-Checks the consistency of the file system. */
+Checks the consistency of the tablespace cache. */
ibool
fil_validate(void);
/*==============*/
/* out: TRUE if ok */
/************************************************************************
+Returns TRUE if file address is undefined. */
+
+ibool
+fil_addr_is_null(
+/*=============*/
+ /* out: TRUE if undefined */
+ fil_addr_t addr); /* in: address */
+/************************************************************************
Accessor functions for a file page */
ulint
@@ -368,32 +648,7 @@ fil_page_get_type(
/* out: type; NOTE that if the type has not been
written to page, the return value not defined */
byte* page); /* in: file page */
-/***********************************************************************
-Tries to reserve free extents in a file space. */
-ibool
-fil_space_reserve_free_extents(
-/*===========================*/
- /* out: TRUE if succeed */
- ulint id, /* in: space id */
- ulint n_free_now, /* in: number of free extents now */
- ulint n_to_reserve); /* in: how many one wants to reserve */
-/***********************************************************************
-Releases free extents in a file space. */
-
-void
-fil_space_release_free_extents(
-/*===========================*/
- ulint id, /* in: space id */
- ulint n_reserved); /* in: how many one reserved */
-/***********************************************************************
-Gets the number of reserved extents. If the database is silent, this number
-should be zero. */
-
-ulint
-fil_space_get_n_reserved_extents(
-/*=============================*/
- ulint id); /* in: space id */
typedef struct fil_space_struct fil_space_t;
diff --git a/innobase/include/fsp0fsp.h b/innobase/include/fsp0fsp.h
index 3494f336b1e..2fcde882df7 100644
--- a/innobase/include/fsp0fsp.h
+++ b/innobase/include/fsp0fsp.h
@@ -55,7 +55,7 @@ ulint
fsp_header_get_free_limit(
/*======================*/
/* out: free limit in megabytes */
- ulint space); /* in: space id */
+ ulint space); /* in: space id, must be 0 */
/**************************************************************************
Gets the size of the tablespace from the tablespace header. If we do not
have an auto-extending data file, this should be equal to the size of the
@@ -65,9 +65,35 @@ ulint
fsp_header_get_tablespace_size(
/*===========================*/
/* out: size in pages */
- ulint space); /* in: space id */
+ ulint space); /* in: space id, must be 0 */
/**************************************************************************
-Initializes the space header of a new created space. */
+Reads the file space size stored in the header page. */
+
+ulint
+fsp_get_size_low(
+/*=============*/
+ /* out: tablespace size stored in the space header */
+ page_t* page); /* in: header page (page 0 in the tablespace) */
+/**************************************************************************
+Reads the space id from the first page of a tablespace. */
+
+ulint
+fsp_header_get_space_id(
+/*====================*/
+ /* out: space id, ULINT_UNDEFINED if error */
+ page_t* page); /* in: first page of a tablespace */
+/**************************************************************************
+Writes the space id to a tablespace header. This function is used past the
+buffer pool when we in fil0fil.c create a new single-table tablespace. */
+
+void
+fsp_header_write_space_id(
+/*======================*/
+ page_t* page, /* in: first page in the space */
+ ulint space_id); /* in: space id */
+/**************************************************************************
+Initializes the space header of a new created space and creates also the
+insert buffer tree root if space == 0. */
void
fsp_header_init(
@@ -117,12 +143,12 @@ fseg_create_general(
will belong to the created segment */
ulint byte_offset, /* in: byte offset of the created segment header
on the page */
- ibool has_done_reservation, /* in: TRUE if the caller has
- already done the reservation for the pages
- with fsp_reserve_free_extents (at least 2 extents:
- one for the inode and, then there other for the
- segment) is no need to do the check for this
- individual operation */
+ ibool has_done_reservation, /* in: TRUE if the caller has already
+ done the reservation for the pages with
+ fsp_reserve_free_extents (at least 2 extents: one for
+ the inode and the other for the segment) then there is
+ no need to do the check for this individual
+ operation */
mtr_t* mtr); /* in: mtr */
/**************************************************************************
Calculates the number of pages reserved by a segment, and how many pages are
@@ -194,12 +220,21 @@ two types of allocation: when space is scarce, FSP_NORMAL allocations
will not succeed, but the latter two allocations will succeed, if possible.
The purpose is to avoid dead end where the database is full but the
user cannot free any space because these freeing operations temporarily
-reserve some space. */
+reserve some space.
+
+Single-table tablespaces whose size is < 32 pages are a special case. In this
+function we would liberally reserve several 64 page extents for every page
+split or merge in a B-tree. But we do not want to waste disk space if the table
+only occupies < 32 pages. That is why we apply different rules in that special
+case, just ensuring that there are 3 free pages available. */
ibool
fsp_reserve_free_extents(
/*=====================*/
/* out: TRUE if we were able to make the reservation */
+ ulint* n_reserved,/* out: number of extents actually reserved; if we
+ return TRUE and the tablespace size is < 64 pages,
+ then this can be 0, otherwise it is n_ext */
ulint space, /* in: space id */
ulint n_ext, /* in: number of extents to reserve */
ulint alloc_type,/* in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
@@ -337,8 +372,8 @@ pages: */
#define FSP_FIRST_INODE_PAGE_NO 2
#define FSP_IBUF_HEADER_PAGE_NO 3
#define FSP_IBUF_TREE_ROOT_PAGE_NO 4
- /* The ibuf tree root page number in each
- tablespace; its fseg inode is on the page
+ /* The ibuf tree root page number in
+ tablespace 0; its fseg inode is on the page
number FSP_FIRST_INODE_PAGE_NO */
#define FSP_TRX_SYS_PAGE_NO 5
#define FSP_FIRST_RSEG_PAGE_NO 6
diff --git a/innobase/include/fut0lst.ic b/innobase/include/fut0lst.ic
index d2e79cf7640..c0d61833b48 100644
--- a/innobase/include/fut0lst.ic
+++ b/innobase/include/fut0lst.ic
@@ -23,7 +23,7 @@ Created 11/28/1995 Heikki Tuuri
#define FLST_FIRST 4 /* 6-byte address of the first element
of the list; undefined if empty list */
#define FLST_LAST (4 + FIL_ADDR_SIZE) /* 6-byte address of the
- first element of the list; undefined
+ last element of the list; undefined
if empty list */
/************************************************************************
diff --git a/innobase/include/ha0ha.h b/innobase/include/ha0ha.h
index 0beac928b7e..c3fc04b47bb 100644
--- a/innobase/include/ha0ha.h
+++ b/innobase/include/ha0ha.h
@@ -28,7 +28,7 @@ ha_search_and_get_data(
/*************************************************************
Looks for an element when we know the pointer to the data and updates
the pointer to data if found. */
-UNIV_INLINE
+
void
ha_search_and_update_if_found(
/*==========================*/
diff --git a/innobase/include/ha0ha.ic b/innobase/include/ha0ha.ic
index f6faf84b9f5..5369ca7f273 100644
--- a/innobase/include/ha0ha.ic
+++ b/innobase/include/ha0ha.ic
@@ -49,11 +49,8 @@ ha_node_t*
ha_chain_get_next(
/*==============*/
/* out: next node, NULL if none */
- hash_table_t* table __attribute__((unused)), /* in: hash table */
ha_node_t* node) /* in: hash chain node */
{
- ut_ad(table);
-
return(node->next);
}
@@ -96,7 +93,7 @@ ha_search(
return(node);
}
- node = ha_chain_get_next(table, node);
+ node = ha_chain_get_next(node);
}
return(NULL);
@@ -128,7 +125,7 @@ ha_search_and_get_data(
return(node->data);
}
- node = ha_chain_get_next(table, node);
+ node = ha_chain_get_next(node);
}
return(NULL);
@@ -143,18 +140,13 @@ ha_next(
/* out: pointer to the next hash table node
in chain with the fold value, NULL if not
found */
- hash_table_t* table, /* in: hash table */
ha_node_t* node) /* in: hash table node */
{
ulint fold;
fold = node->fold;
-#ifdef UNIV_SYNC_DEBUG
- ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold)));
-#endif /* UNIV_SYNC_DEBUG */
-
- node = ha_chain_get_next(table, node);
+ node = ha_chain_get_next(node);
while (node) {
if (node->fold == fold) {
@@ -162,7 +154,7 @@ ha_next(
return(node);
}
- node = ha_chain_get_next(table, node);
+ node = ha_chain_get_next(node);
}
return(NULL);
@@ -194,38 +186,13 @@ ha_search_with_data(
return(node);
}
- node = ha_chain_get_next(table, node);
+ node = ha_chain_get_next(node);
}
return(NULL);
}
/*************************************************************
-Looks for an element when we know the pointer to the data, and updates
-the pointer to data, if found. */
-UNIV_INLINE
-void
-ha_search_and_update_if_found(
-/*==========================*/
- hash_table_t* table, /* in: hash table */
- ulint fold, /* in: folded value of the searched data */
- void* data, /* in: pointer to the data */
- void* new_data)/* in: new pointer to the data */
-{
- ha_node_t* node;
-
-#ifdef UNIV_SYNC_DEBUG
- ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold)));
-#endif /* UNIV_SYNC_DEBUG */
-
- node = ha_search_with_data(table, fold, data);
-
- if (node) {
- node->data = new_data;
- }
-}
-
-/*************************************************************
Looks for an element when we know the pointer to the data, and deletes
it from the hash table, if found. */
UNIV_INLINE
diff --git a/innobase/include/hash0hash.h b/innobase/include/hash0hash.h
index d325636f511..79efe016324 100644
--- a/innobase/include/hash0hash.h
+++ b/innobase/include/hash0hash.h
@@ -109,7 +109,7 @@ do {\
\
while (struct3333->NAME != DATA) {\
\
- ut_ad(struct3333);\
+ ut_a(struct3333);\
struct3333 = struct3333->NAME;\
}\
\
@@ -290,6 +290,8 @@ struct hash_cell_struct{
/* The hash table structure */
struct hash_table_struct {
+ ibool adaptive;/* TRUE if this is the hash table of the
+ adaptive hash index */
ulint n_cells;/* number of cells in the hash table */
hash_cell_t* array; /* pointer to cell array */
ulint n_mutexes;/* if mutexes != NULL, then the number of
diff --git a/innobase/include/ibuf0ibuf.h b/innobase/include/ibuf0ibuf.h
index a64eb53bd19..8ef67df26f8 100644
--- a/innobase/include/ibuf0ibuf.h
+++ b/innobase/include/ibuf0ibuf.h
@@ -40,6 +40,13 @@ void
ibuf_init_at_db_start(void);
/*=======================*/
/*************************************************************************
+Reads the biggest tablespace id from the high end of the insert buffer
+tree and updates the counter in fil_system. */
+
+void
+ibuf_update_max_tablespace_id(void);
+/*===============================*/
+/*************************************************************************
Initializes an ibuf bitmap page. */
void
@@ -198,8 +205,8 @@ When an index page is read from a disk to the buffer pool, this function
inserts to the page the possible index entries buffered in the insert buffer.
The entries are deleted from the insert buffer. If the page is not read, but
created in the buffer pool, this function deletes its buffered entries from
-the insert buffer; note that there can exist entries if the page belonged to
-an index which was dropped. */
+the insert buffer; there can exist entries for such a page if the page
+belonged to an index which subsequently was dropped. */
void
ibuf_merge_or_delete_for_page(
@@ -207,7 +214,21 @@ ibuf_merge_or_delete_for_page(
page_t* page, /* in: if page has been read from disk, pointer to
the page x-latched, else NULL */
ulint space, /* in: space id of the index page */
- ulint page_no);/* in: page number of the index page */
+ ulint page_no,/* in: page number of the index page */
+ ibool update_ibuf_bitmap);/* in: normally this is set to TRUE, but if
+ we have deleted or are deleting the tablespace, then we
+ naturally do not want to update a non-existent bitmap
+ page */
+/*************************************************************************
+Deletes all entries in the insert buffer for a given space id. This is used
+in DISCARD TABLESPACE and IMPORT TABLESPACE.
+NOTE: this does not update the page free bitmaps in the space. The space will
+become CORRUPT when you call this function! */
+
+void
+ibuf_delete_for_discarded_space(
+/*============================*/
+ ulint space); /* in: space id */
/*************************************************************************
Contracts insert buffer trees by reading pages to the buffer pool. */
@@ -257,6 +278,13 @@ ibuf_count_get(
ulint space, /* in: space id */
ulint page_no);/* in: page number */
/**********************************************************************
+Looks if the insert buffer is empty. */
+
+ibool
+ibuf_is_empty(void);
+/*===============*/
+ /* out: TRUE if empty */
+/**********************************************************************
Prints info of ibuf. */
void
diff --git a/innobase/include/ibuf0ibuf.ic b/innobase/include/ibuf0ibuf.ic
index 0886c8c02cc..68f7ce9c1d0 100644
--- a/innobase/include/ibuf0ibuf.ic
+++ b/innobase/include/ibuf0ibuf.ic
@@ -218,7 +218,7 @@ ibuf_update_free_bits_if_full(
}
if (after == 0) {
- /* We move the page to front of the buffer pool LRU list:
+ /* We move the page to the front of the buffer pool LRU list:
the purpose of this is to prevent those pages to which we
cannot make inserts using the insert buffer from slipping
out of the buffer pool */
diff --git a/innobase/include/lock0lock.h b/innobase/include/lock0lock.h
index 0fd1696b882..103d28cd130 100644
--- a/innobase/include/lock0lock.h
+++ b/innobase/include/lock0lock.h
@@ -526,12 +526,12 @@ extern lock_sys_t* lock_sys;
#define LOCK_X 5 /* exclusive */
#define LOCK_AUTO_INC 6 /* locks the auto-inc counter of a table
in an exclusive mode */
-#define LOCK_MODE_MASK 0xF /* mask used to extract mode from the
+#define LOCK_MODE_MASK 0xFUL /* mask used to extract mode from the
type_mode field in a lock */
/* Lock types */
#define LOCK_TABLE 16 /* these type values should be so high that */
#define LOCK_REC 32 /* they can be ORed to the lock mode */
-#define LOCK_TYPE_MASK 0xF0 /* mask used to extract lock type from the
+#define LOCK_TYPE_MASK 0xF0UL /* mask used to extract lock type from the
type_mode field in a lock */
/* Waiting lock flag */
#define LOCK_WAIT 256 /* this wait bit should be so high that
diff --git a/innobase/include/log0log.h b/innobase/include/log0log.h
index 24ec28a56e6..dc44429d636 100644
--- a/innobase/include/log0log.h
+++ b/innobase/include/log0log.h
@@ -519,9 +519,9 @@ Peeks the current lsn. */
ibool
log_peek_lsn(
/*=========*/
- /* out: TRUE if success, FALSE if could not get the
- log system mutex */
- dulint* lsn); /* out: if returns TRUE, current lsn is here */
+ /* out: TRUE if success, FALSE if could not get the
+ log system mutex */
+ dulint* lsn); /* out: if returns TRUE, current lsn is here */
/**************************************************************************
Refreshes the statistics used to print per-second averages. */
@@ -549,7 +549,7 @@ extern log_t* log_sys;
highest bit is set to 1 if this is the
first log block in a log flush write
segment */
-#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000
+#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000UL
/* mask used to get the highest bit in
the preceding field */
#define LOG_BLOCK_HDR_DATA_LEN 4 /* number of bytes of log written to
@@ -600,12 +600,18 @@ extern log_t* log_sys;
#define LOG_CHECKPOINT_CHECKSUM_1 LOG_CHECKPOINT_ARRAY_END
#define LOG_CHECKPOINT_CHECKSUM_2 (4 + LOG_CHECKPOINT_ARRAY_END)
#define LOG_CHECKPOINT_FSP_FREE_LIMIT (8 + LOG_CHECKPOINT_ARRAY_END)
- /* current fsp free limit in the
- tablespace, in units of one megabyte */
+ /* current fsp free limit in
+ tablespace 0, in units of one
+ megabyte; this information is only used
+ by ibbackup to decide if it can
+ truncate unused ends of
+ non-auto-extending data files in space
+ 0 */
#define LOG_CHECKPOINT_FSP_MAGIC_N (12 + LOG_CHECKPOINT_ARRAY_END)
/* this magic number tells if the
checkpoint contains the above field:
- the field was added to InnoDB-3.23.50 */
+ the field was added to
+ InnoDB-3.23.50 */
#define LOG_CHECKPOINT_SIZE (16 + LOG_CHECKPOINT_ARRAY_END)
#define LOG_CHECKPOINT_FSP_MAGIC_N_VAL 1441231243
@@ -794,11 +800,11 @@ struct log_struct{
called */
/* Fields involved in checkpoints */
- ulint log_group_capacity; /* capacity of the log group; if
- the checkpoint age exceeds this, it is
- a serious error because it is possible
- we will then overwrite log and spoil
- crash recovery */
+ ulint log_group_capacity; /* capacity of the log group; if
+ the checkpoint age exceeds this, it is
+ a serious error because it is possible
+ we will then overwrite log and spoil
+ crash recovery */
ulint max_modified_age_async;
/* when this recommended value for lsn
- buf_pool_get_oldest_modification()
@@ -840,7 +846,8 @@ struct log_struct{
/* Fields involved in archiving */
ulint archiving_state;/* LOG_ARCH_ON, LOG_ARCH_STOPPING
LOG_ARCH_STOPPED, LOG_ARCH_OFF */
- dulint archived_lsn; /* archiving has advanced to this lsn */
+ dulint archived_lsn; /* archiving has advanced to this
+ lsn */
ulint max_archived_lsn_age_async;
/* recommended maximum age of
archived_lsn, before we start
diff --git a/innobase/include/log0log.ic b/innobase/include/log0log.ic
index 587291883f7..16423286f6d 100644
--- a/innobase/include/log0log.ic
+++ b/innobase/include/log0log.ic
@@ -182,9 +182,9 @@ log_block_convert_lsn_to_no(
no = ut_dulint_get_low(lsn) / OS_FILE_LOG_BLOCK_SIZE;
no += (ut_dulint_get_high(lsn) % OS_FILE_LOG_BLOCK_SIZE)
- * 2 * (0x80000000 / OS_FILE_LOG_BLOCK_SIZE);
+ * 2 * (0x80000000UL / OS_FILE_LOG_BLOCK_SIZE);
- no = no & 0x3FFFFFFF;
+ no = no & 0x3FFFFFFFUL;
return(no + 1);
}
@@ -206,7 +206,7 @@ log_block_calc_checksum(
sh = 0;
for (i = 0; i < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; i++) {
- sum = sum & 0x7FFFFFFF;
+ sum = sum & 0x7FFFFFFFUL;
sum += (((ulint)(*(block + i))) << sh) + (ulint)(*(block + i));
sh++;
if (sh > 24) {
@@ -350,7 +350,7 @@ log_reserve_and_write_fast(
#ifdef UNIV_LOG_DEBUG
log_check_log_recs(log->buf + log->old_buf_free,
- log->buf_free - log->old_buf_free, log->old_lsn);
+ log->buf_free - log->old_buf_free, log->old_lsn);
#endif
return(lsn);
}
diff --git a/innobase/include/log0recv.h b/innobase/include/log0recv.h
index e5a5bc05563..c972c3ce977 100644
--- a/innobase/include/log0recv.h
+++ b/innobase/include/log0recv.h
@@ -15,6 +15,8 @@ Created 9/20/1997 Heikki Tuuri
#include "hash0hash.h"
#include "log0log.h"
+extern ibool recv_replay_file_ops;
+
/***********************************************************************
Reads the checkpoint info needed in hot backup. */
@@ -25,8 +27,8 @@ recv_read_cp_info_for_backup(
byte* hdr, /* in: buffer containing the log group header */
dulint* lsn, /* out: checkpoint lsn */
ulint* offset, /* out: checkpoint offset in the log group */
- ulint* fsp_limit,/* out: fsp limit, 1000000000 if the database
- is running with < version 3.23.50 of InnoDB */
+ ulint* fsp_limit,/* out: fsp limit of space 0, 1000000000 if the
+ database is running with < version 3.23.50 of InnoDB */
dulint* cp_no, /* out: checkpoint number */
dulint* first_header_lsn);
/* out: lsn of of the start of the first log file */
@@ -175,17 +177,14 @@ recv_apply_hashed_log_recs(
disk and invalidated in buffer pool: this
alternative means that no new log records
can be generated during the application */
+#ifdef UNIV_HOTBACKUP
/***********************************************************************
Applies log records in the hash table to a backup. */
void
-recv_apply_log_recs_for_backup(
-/*===========================*/
- ulint n_data_files, /* in: number of data files */
- char** data_files, /* in: array containing the paths to the
- data files */
- ulint* file_sizes); /* in: sizes of the data files in database
- pages */
+recv_apply_log_recs_for_backup(void);
+/*================================*/
+#endif
/************************************************************
Recovers from archived log files, and also from log files, if they exist. */
@@ -334,7 +333,6 @@ extern ibool recv_no_ibuf_operations;
extern ibool recv_needed_recovery;
extern ibool recv_lsn_checks_on;
-
extern ibool recv_is_making_a_backup;
extern ulint recv_max_parsed_page_no;
@@ -357,12 +355,7 @@ in the debug version: spaces with an odd number as the id are replicate
spaces */
#define RECV_REPLICA_SPACE_ADD 1
-/* This many blocks must be left free in the buffer pool when we scan
-the log and store the scanned log records in the buffer pool: we will
-use these free blocks to read in pages when we start applying the
-log records to the database. */
-
-#define RECV_POOL_N_FREE_BLOCKS (ut_min(256, buf_pool_get_curr_size() / 8))
+extern ulint recv_n_pool_free_frames;
#ifndef UNIV_NONINL
#include "log0recv.ic"
diff --git a/innobase/include/mach0data.ic b/innobase/include/mach0data.ic
index 65e5df2178e..3ccdcf1dc0a 100644
--- a/innobase/include/mach0data.ic
+++ b/innobase/include/mach0data.ic
@@ -17,7 +17,7 @@ mach_write_to_1(
ulint n) /* in: ulint integer to be stored, >= 0, < 256 */
{
ut_ad(b);
- ut_ad(n <= 0xFF);
+ ut_ad(n <= 0xFFUL);
b[0] = (byte)n;
}
@@ -46,7 +46,7 @@ mach_write_to_2(
ulint n) /* in: ulint integer to be stored */
{
ut_ad(b);
- ut_ad(n <= 0xFFFF);
+ ut_ad(n <= 0xFFFFUL);
b[0] = (byte)(n >> 8);
b[1] = (byte)(n);
@@ -79,7 +79,7 @@ mach_write_to_3(
ulint n) /* in: ulint integer to be stored */
{
ut_ad(b);
- ut_ad(n <= 0xFFFFFF);
+ ut_ad(n <= 0xFFFFFFUL);
b[0] = (byte)(n >> 16);
b[1] = (byte)(n >> 8);
@@ -184,20 +184,20 @@ mach_write_compressed(
{
ut_ad(b);
- if (n < 0x80) {
+ if (n < 0x80UL) {
mach_write_to_1(b, n);
return(1);
- } else if (n < 0x4000) {
- mach_write_to_2(b, n | 0x8000);
+ } else if (n < 0x4000UL) {
+ mach_write_to_2(b, n | 0x8000UL);
return(2);
- } else if (n < 0x200000) {
- mach_write_to_3(b, n | 0xC00000);
+ } else if (n < 0x200000UL) {
+ mach_write_to_3(b, n | 0xC00000UL);
return(3);
- } else if (n < 0x10000000) {
- mach_write_to_4(b, n | 0xE0000000);
+ } else if (n < 0x10000000UL) {
+ mach_write_to_4(b, n | 0xE0000000UL);
return(4);
} else {
- mach_write_to_1(b, 0xF0);
+ mach_write_to_1(b, 0xF0UL);
mach_write_to_4(b + 1, n);
return(5);
}
@@ -212,13 +212,13 @@ mach_get_compressed_size(
/* out: compressed size in bytes */
ulint n) /* in: ulint integer (< 2^32) to be stored */
{
- if (n < 0x80) {
+ if (n < 0x80UL) {
return(1);
- } else if (n < 0x4000) {
+ } else if (n < 0x4000UL) {
return(2);
- } else if (n < 0x200000) {
+ } else if (n < 0x200000UL) {
return(3);
- } else if (n < 0x10000000) {
+ } else if (n < 0x10000000UL) {
return(4);
} else {
return(5);
@@ -240,16 +240,16 @@ mach_read_compressed(
flag = mach_read_from_1(b);
- if (flag < 0x80) {
+ if (flag < 0x80UL) {
return(flag);
- } else if (flag < 0xC0) {
- return(mach_read_from_2(b) & 0x7FFF);
- } else if (flag < 0xE0) {
- return(mach_read_from_3(b) & 0x3FFFFF);
- } else if (flag < 0xF0) {
- return(mach_read_from_4(b) & 0x1FFFFFFF);
+ } else if (flag < 0xC0UL) {
+ return(mach_read_from_2(b) & 0x7FFFUL);
+ } else if (flag < 0xE0UL) {
+ return(mach_read_from_3(b) & 0x3FFFFFUL);
+ } else if (flag < 0xF0UL) {
+ return(mach_read_from_4(b) & 0x1FFFFFFFUL);
} else {
- ut_ad(flag == 0xF0);
+ ut_ad(flag == 0xF0UL);
return(mach_read_from_4(b + 1));
}
}
@@ -439,7 +439,7 @@ mach_dulint_write_much_compressed(
return(mach_write_compressed(b, ut_dulint_get_low(n)));
}
- *b = 0xFF;
+ *b = (byte)0xFF;
size = 1 + mach_write_compressed(b + 1, ut_dulint_get_high(n));
size += mach_write_compressed(b + size, ut_dulint_get_low(n));
@@ -479,7 +479,7 @@ mach_dulint_read_much_compressed(
ut_ad(b);
- if (*b != 0xFF) {
+ if (*b != (byte)0xFF) {
high = 0;
size = 0;
} else {
@@ -679,11 +679,10 @@ mach_write_to_2_little_endian(
{
ut_ad(n < 256 * 256);
- *dest = (byte)(n & 0xFF);
+ *dest = (byte)(n & 0xFFUL);
n = n >> 8;
dest++;
- *dest = (byte)(n & 0xFF);
+ *dest = (byte)(n & 0xFFUL);
}
-
diff --git a/innobase/include/mem0pool.h b/innobase/include/mem0pool.h
index 43707bd5f61..51c53afe788 100644
--- a/innobase/include/mem0pool.h
+++ b/innobase/include/mem0pool.h
@@ -19,6 +19,8 @@ typedef struct mem_pool_struct mem_pool_t;
/* The common memory pool */
extern mem_pool_t* mem_comm_pool;
+extern ulint mem_out_of_mem_err_msg_count;
+
/* Memory area header */
struct mem_area_struct{
diff --git a/innobase/include/mtr0log.h b/innobase/include/mtr0log.h
index 367c9a00651..f50c1dfcb6a 100644
--- a/innobase/include/mtr0log.h
+++ b/innobase/include/mtr0log.h
@@ -57,6 +57,19 @@ mlog_write_initial_log_record(
byte type, /* in: log item type: MLOG_1BYTE, ... */
mtr_t* mtr); /* in: mini-transaction handle */
/************************************************************
+Writes a log record about an .ibd file create/delete/rename. */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_for_file_op(
+/*======================================*/
+ /* out: new value of log_ptr */
+ ulint type, /* in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or
+ MLOG_FILE_RENAME */
+ ulint space_id,/* in: space id, if applicable */
+ ulint page_no,/* in: page number (not relevant currently) */
+ byte* log_ptr,/* in: pointer to mtr log which has been opened */
+ mtr_t* mtr); /* in: mtr */
+/************************************************************
Catenates 1 - 4 bytes to the mtr log. */
UNIV_INLINE
void
diff --git a/innobase/include/mtr0log.ic b/innobase/include/mtr0log.ic
index 54f15779078..60a5b390be9 100644
--- a/innobase/include/mtr0log.ic
+++ b/innobase/include/mtr0log.ic
@@ -163,13 +163,6 @@ mlog_write_initial_log_record_fast(
space = buf_block_get_space(block);
offset = buf_block_get_page_no(block);
- if (space != 0 || offset > 0x8FFFFFFF) {
- fprintf(stderr,
- "InnoDB: error: buffer page pointer %lx has nonsensical space id %lu\n"
- "InnoDB: or page no %lu\n", (ulint)ptr, space, offset);
- ut_error;
- }
-
mach_write_to_1(log_ptr, type);
log_ptr++;
log_ptr += mach_write_compressed(log_ptr, space);
@@ -192,3 +185,31 @@ mlog_write_initial_log_record_fast(
#endif
return(log_ptr);
}
+
+/************************************************************
+Writes a log record about an .ibd file create/delete/rename. */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_for_file_op(
+/*======================================*/
+ /* out: new value of log_ptr */
+ ulint type, /* in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or
+ MLOG_FILE_RENAME */
+ ulint space_id,/* in: space id, if applicable */
+ ulint page_no,/* in: page number (not relevant currently) */
+ byte* log_ptr,/* in: pointer to mtr log which has been opened */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(log_ptr);
+
+ mach_write_to_1(log_ptr, type);
+ log_ptr++;
+
+ /* We write dummy space id and page number */
+ log_ptr += mach_write_compressed(log_ptr, space_id);
+ log_ptr += mach_write_compressed(log_ptr, page_no);
+
+ mtr->n_log_recs++;
+
+ return(log_ptr);
+}
diff --git a/innobase/include/mtr0mtr.h b/innobase/include/mtr0mtr.h
index d999b7cc5b7..9cf592f71e1 100644
--- a/innobase/include/mtr0mtr.h
+++ b/innobase/include/mtr0mtr.h
@@ -96,7 +96,13 @@ flag value must give the length also! */
sequence of these records */
#define MLOG_DUMMY_RECORD ((byte)32) /* dummy log record used to
pad a log block full */
-#define MLOG_BIGGEST_TYPE ((byte)32) /* biggest value (used in
+#define MLOG_FILE_CREATE ((byte)33) /* log record about an .ibd
+ file creation */
+#define MLOG_FILE_RENAME ((byte)34) /* log record about an .ibd
+ file rename */
+#define MLOG_FILE_DELETE ((byte)35) /* log record about an .ibd
+ file deletion */
+#define MLOG_BIGGEST_TYPE ((byte)35) /* biggest value (used in
asserts) */
/*******************************************************************
diff --git a/innobase/include/os0file.h b/innobase/include/os0file.h
index b221bf7aef9..cf2dbd68fb1 100644
--- a/innobase/include/os0file.h
+++ b/innobase/include/os0file.h
@@ -11,9 +11,11 @@ Created 10/21/1995 Heikki Tuuri
#include "univ.i"
+#ifndef __WIN__
+#include <dirent.h>
+#include <sys/stat.h>
+#endif
-/* If the following is set to TRUE, we do not call os_file_flush in every
-os_file_write */
extern ibool os_do_not_call_flush_at_each_write;
extern ibool os_has_said_disk_full;
extern ibool os_aio_print_debug;
@@ -60,9 +62,11 @@ log. */
#define OS_FILE_OPEN 51
#define OS_FILE_CREATE 52
#define OS_FILE_OVERWRITE 53
+#define OS_FILE_OPEN_RAW 54
#define OS_FILE_READ_ONLY 333
#define OS_FILE_READ_WRITE 444
+#define OS_FILE_READ_ALLOW_DELETE 555 /* for ibbackup */
/* Options for file_create */
#define OS_FILE_AIO 61
@@ -120,6 +124,36 @@ extern ulint os_n_file_reads;
extern ulint os_n_file_writes;
extern ulint os_n_fsyncs;
+/* File types for directory entry data type */
+
+enum os_file_type_enum{
+ OS_FILE_TYPE_UNKNOWN = 0,
+ OS_FILE_TYPE_FILE, /* regular file */
+ OS_FILE_TYPE_DIR, /* directory */
+ OS_FILE_TYPE_LINK /* symbolic link */
+};
+typedef enum os_file_type_enum os_file_type_t;
+
+/* Maximum path string length in bytes when referring to tables within the
+'./databasename/tablename.ibd' path format; we can allocate at least 2 buffers
+of this size from the thread stack; that is why this should not be made much
+bigger than 4000 bytes */
+#define OS_FILE_MAX_PATH 4000
+
+/* Struct used in fetching information of a file in a directory */
+typedef struct os_file_stat_struct os_file_stat_t;
+struct os_file_stat_struct{
+ char name[OS_FILE_MAX_PATH]; /* path to a file */
+ os_file_type_t type; /* file type */
+ ib_longlong size; /* file size */
+};
+
+#ifdef __WIN__
+typedef HANDLE os_file_dir_t; /* directory stream */
+#else
+typedef DIR* os_file_dir_t; /* directory stream */
+#endif
+
/***************************************************************************
Gets the operating system version. Currently works only on Windows. */
@@ -133,6 +167,57 @@ Creates the seek mutexes used in positioned reads and writes. */
void
os_io_init_simple(void);
/*===================*/
+/***************************************************************************
+The os_file_opendir() function opens a directory stream corresponding to the
+directory named by the dirname argument. The directory stream is positioned
+at the first entry. In both Unix and Windows we automatically skip the '.'
+and '..' items at the start of the directory listing. */
+
+os_file_dir_t
+os_file_opendir(
+/*============*/
+ /* out: directory stream, NULL if error */
+ char* dirname, /* in: directory name; it must not contain
+ a trailing '\' or '/' */
+ ibool error_is_fatal);/* in: TRUE if we should treat an error as a
+ fatal error; if we try to open symlinks then
+ we do not wish a fatal error if it happens
+ not to be a directory */
+/***************************************************************************
+Closes a directory stream. */
+
+int
+os_file_closedir(
+/*=============*/
+ /* out: 0 if success, -1 if failure */
+ os_file_dir_t dir); /* in: directory stream */
+/***************************************************************************
+This function returns information of the next file in the directory. We jump
+over the '.' and '..' entries in the directory. */
+
+int
+os_file_readdir_next_file(
+/*======================*/
+ /* out: 0 if ok, -1 if error, 1 if at the end
+ of the directory */
+ char* dirname,/* in: directory name or path */
+ os_file_dir_t dir, /* in: directory stream */
+ os_file_stat_t* info); /* in/out: buffer where the info is returned */
+/*********************************************************************
+This function attempts to create a directory named pathname. The new directory
+gets default permissions. On Unix, the permissions are (0770 & ~umask). If the
+directory exists already, nothing is done and the call succeeds, unless the
+fail_if_exists argument is true. */
+
+ibool
+os_file_create_directory(
+/*=====================*/
+ /* out: TRUE if call succeeds, FALSE on
+ error */
+ char* pathname, /* in: directory name as null-terminated
+ string */
+ ibool fail_if_exists);/* in: if TRUE, pre-existing directory is
+ treated as an error. */
/********************************************************************
A simple function to open or create a file. */
@@ -140,7 +225,8 @@ os_file_t
os_file_create_simple(
/*==================*/
/* out, own: handle to the file, not defined if error,
- error number can be retrieved with os_get_last_error */
+ error number can be retrieved with
+ os_file_get_last_error */
char* name, /* in: name of the file or path as a null-terminated
string */
ulint create_mode,/* in: OS_FILE_OPEN if an existing file is opened
@@ -155,13 +241,16 @@ os_file_t
os_file_create_simple_no_error_handling(
/*====================================*/
/* out, own: handle to the file, not defined if error,
- error number can be retrieved with os_get_last_error */
+ error number can be retrieved with
+ os_file_get_last_error */
char* name, /* in: name of the file or path as a null-terminated
string */
ulint create_mode,/* in: OS_FILE_OPEN if an existing file is opened
(if does not exist, error), or OS_FILE_CREATE if a new
file is created (if exists, error) */
- ulint access_type,/* in: OS_FILE_READ_ONLY or OS_FILE_READ_WRITE */
+ ulint access_type,/* in: OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option is used by
+ a backup program reading the file */
ibool* success);/* out: TRUE if succeed, FALSE if error */
/********************************************************************
Opens an existing file or creates a new. */
@@ -170,13 +259,16 @@ os_file_t
os_file_create(
/*===========*/
/* out, own: handle to the file, not defined if error,
- error number can be retrieved with os_get_last_error */
+ error number can be retrieved with
+ os_file_get_last_error */
char* name, /* in: name of the file or path as a null-terminated
string */
ulint create_mode,/* in: OS_FILE_OPEN if an existing file is opened
(if does not exist, error), or OS_FILE_CREATE if a new
file is created (if exists, error), OS_FILE_OVERWRITE
- if a new file is created or an old overwritten */
+ if a new file is created or an old overwritten;
+ OS_FILE_OPEN_RAW, if a raw device or disk partition
+ should be opened */
ulint purpose,/* in: OS_FILE_AIO, if asynchronous, non-buffered i/o
is desired, OS_FILE_NORMAL, if any normal file;
NOTE that it also depends on type, os_aio_.. and srv_..
@@ -186,6 +278,34 @@ os_file_create(
ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */
ibool* success);/* out: TRUE if succeed, FALSE if error */
/***************************************************************************
+Deletes a file. The file has to be closed before calling this. */
+
+ibool
+os_file_delete(
+/*===========*/
+ /* out: TRUE if success */
+ char* name); /* in: file path as a null-terminated string */
+
+/***************************************************************************
+Deletes a file if it exists. The file has to be closed before calling this. */
+
+ibool
+os_file_delete_if_exists(
+/*=====================*/
+ /* out: TRUE if success */
+ char* name); /* in: file path as a null-terminated string */
+/***************************************************************************
+Renames a file (can also move it to another directory). It is safest that the
+file is closed before calling this function. */
+
+ibool
+os_file_rename(
+/*===========*/
+ /* out: TRUE if success */
+ char* oldpath, /* in: old file path as a null-terminated
+ string */
+ char* newpath); /* in: new file path */
+/***************************************************************************
Closes a file handle. In case of error, error number can be retrieved with
os_file_get_last_error. */
@@ -214,6 +334,14 @@ os_file_get_size(
size */
ulint* size_high);/* out: most significant 32 bits of size */
/***************************************************************************
+Gets file size as a 64-bit integer ib_longlong. */
+
+ib_longlong
+os_file_get_size_as_iblonglong(
+/*===========================*/
+ /* out: size in bytes, -1 if error */
+ os_file_t file); /* in: handle to a file */
+/***************************************************************************
Sets a file size. This function can be used to extend or truncate a file. */
ibool
@@ -241,9 +369,12 @@ overwrite the error number). If the number is not known to this program,
the OS error number + 100 is returned. */
ulint
-os_file_get_last_error(void);
-/*========================*/
- /* out: error number, or OS error number + 100 */
+os_file_get_last_error(
+/*===================*/
+ /* out: error number, or OS error
+ number + 100 */
+ ibool report_all_errors); /* in: TRUE if we want an error message
+ printed of all errors */
/***********************************************************************
Requests a synchronous read operation. */
@@ -260,6 +391,23 @@ os_file_read(
offset */
ulint n); /* in: number of bytes to read */
/***********************************************************************
+Requests a synchronous positioned read operation. This function does not do
+any error handling. In case of error it returns FALSE. */
+
+ibool
+os_file_read_no_error_handling(
+/*===========================*/
+ /* out: TRUE if request was
+ successful, FALSE if fail */
+ os_file_t file, /* in: handle to a file */
+ void* buf, /* in: buffer where to read */
+ ulint offset, /* in: least significant 32 bits of file
+ offset where to read */
+ ulint offset_high,/* in: most significant 32 bits of
+ offset */
+ ulint n); /* in: number of bytes to read */
+
+/***********************************************************************
Requests a synchronous write operation. */
ibool
diff --git a/innobase/include/os0proc.h b/innobase/include/os0proc.h
index 7618032a11f..d0d3cf82e38 100644
--- a/innobase/include/os0proc.h
+++ b/innobase/include/os0proc.h
@@ -15,6 +15,76 @@ Created 9/30/1995 Heikki Tuuri
typedef void* os_process_t;
typedef unsigned long int os_process_id_t;
+/* The cell type in os_awe_allocate_mem page info */
+#if defined(__WIN2000__) && defined(ULONG_PTR)
+typedef ULONG_PTR os_awe_t;
+#else
+typedef ulint os_awe_t;
+#endif
+
+/* Physical page size when Windows AWE is used. This is the normal
+page size of an Intel x86 processor. We cannot use AWE with 2 MB or 4 MB
+pages. */
+#define OS_AWE_X86_PAGE_SIZE 4096
+
+/********************************************************************
+Windows AWE support. Tries to enable the "lock pages in memory" privilege for
+the current process so that the current process can allocate memory-locked
+virtual address space to act as the window where AWE maps physical memory. */
+
+ibool
+os_awe_enable_lock_pages_in_mem(void);
+/*=================================*/
+ /* out: TRUE if success, FALSE if error;
+ prints error info to stderr if no success */
+/********************************************************************
+Allocates physical RAM memory up to 64 GB in an Intel 32-bit x86
+processor. */
+
+ibool
+os_awe_allocate_physical_mem(
+/*=========================*/
+ /* out: TRUE if success */
+ os_awe_t** page_info, /* out, own: array of opaque data containing
+ the info for allocated physical memory pages;
+ each allocated 4 kB physical memory page has
+ one slot of type os_awe_t in the array */
+ ulint n_megabytes); /* in: number of megabytes to allocate */
+/********************************************************************
+Allocates a window in the virtual address space where we can then map
+pages of physical memory. */
+
+byte*
+os_awe_allocate_virtual_mem_window(
+/*===============================*/
+ /* out, own: allocated memory, or NULL if did not
+ succeed */
+ ulint size); /* in: virtual memory allocation size in bytes, must
+ be < 2 GB */
+/********************************************************************
+With this function you can map parts of physical memory allocated with
+the ..._allocate_physical_mem to the virtual address space allocated with
+the previous function. Intel implements this so that the process page
+tables are updated accordingly. A test on a 1.5 GHz AMD processor and XP
+showed that this takes < 1 microsecond, much better than the estimated 80 us
+for copying a 16 kB page memory to memory. But, the operation will at least
+partially invalidate the translation lookaside buffer (TLB) of all
+processors. Under a real-world load the performance hit may be bigger. */
+
+ibool
+os_awe_map_physical_mem_to_window(
+/*==============================*/
+ /* out: TRUE if success; the function
+ calls exit(1) in case of an error */
+ byte* ptr, /* in: a page-aligned pointer to
+ somewhere in the virtual address
+ space window; we map the physical mem
+ pages here */
+ ulint n_mem_pages, /* in: number of 4 kB mem pages to
+ map */
+ os_awe_t* page_info); /* in: array of page infos for those
+ pages; each page has one slot in the
+ array */
/********************************************************************
Converts the current process id to a number. It is not guaranteed that the
number is unique. In Linux returns the 'process number' of the current
diff --git a/innobase/include/page0page.h b/innobase/include/page0page.h
index 04f771c3abd..969313614e3 100644
--- a/innobase/include/page0page.h
+++ b/innobase/include/page0page.h
@@ -596,7 +596,8 @@ byte*
page_parse_delete_rec_list(
/*=======================*/
/* out: end of log record or NULL */
- byte type, /* in: MLOG_LIST_END_DELETE or MLOG_LIST_START_DELETE */
+ byte type, /* in: MLOG_LIST_END_DELETE or
+ MLOG_LIST_START_DELETE */
byte* ptr, /* in: buffer */
byte* end_ptr,/* in: buffer end */
page_t* page, /* in: page or NULL */
diff --git a/innobase/include/que0types.h b/innobase/include/que0types.h
index c7ce09db40b..e59c2313a5a 100644
--- a/innobase/include/que0types.h
+++ b/innobase/include/que0types.h
@@ -36,7 +36,8 @@ struct que_common_struct{
if the buffer has been allocated dynamically:
if this field is != 0, and the node is a
symbol node or a function node, then we
- have to free the data field in val explicitly */
+ have to free the data field in val
+ explicitly */
};
#endif
diff --git a/innobase/include/rem0rec.h b/innobase/include/rem0rec.h
index b28f39925c1..ebdd3c1ac81 100644
--- a/innobase/include/rem0rec.h
+++ b/innobase/include/rem0rec.h
@@ -21,7 +21,7 @@ Created 5/30/1994 Heikki Tuuri
/* Flag denoting the predefined minimum record: this bit is ORed in the 4
info bits of a record */
-#define REC_INFO_MIN_REC_FLAG 0x10
+#define REC_INFO_MIN_REC_FLAG 0x10UL
/* Number of extra bytes in a record, in addition to the data and the
offsets */
@@ -406,8 +406,8 @@ rec_sprintf(
/* Maximum lengths for the data in a physical record if the offsets
are given in one byte (resp. two byte) format. */
-#define REC_1BYTE_OFFS_LIMIT 0x7F
-#define REC_2BYTE_OFFS_LIMIT 0x7FFF
+#define REC_1BYTE_OFFS_LIMIT 0x7FUL
+#define REC_2BYTE_OFFS_LIMIT 0x7FFFUL
/* The data size of record must be smaller than this because we reserve
two upmost bits in a two byte offset for special purposes */
diff --git a/innobase/include/rem0rec.ic b/innobase/include/rem0rec.ic
index 9dfd4faeec8..f4acd8547db 100644
--- a/innobase/include/rem0rec.ic
+++ b/innobase/include/rem0rec.ic
@@ -29,41 +29,41 @@ significant bytes and bits are written below less significant.
and the shift needed to obtain each bit-field of the record. */
#define REC_NEXT 2
-#define REC_NEXT_MASK 0xFFFF
+#define REC_NEXT_MASK 0xFFFFUL
#define REC_NEXT_SHIFT 0
#define REC_SHORT 3 /* This is single byte bit-field */
-#define REC_SHORT_MASK 0x1
+#define REC_SHORT_MASK 0x1UL
#define REC_SHORT_SHIFT 0
#define REC_N_FIELDS 4
-#define REC_N_FIELDS_MASK 0x7FE
+#define REC_N_FIELDS_MASK 0x7FEUL
#define REC_N_FIELDS_SHIFT 1
#define REC_HEAP_NO 5
-#define REC_HEAP_NO_MASK 0xFFF8
+#define REC_HEAP_NO_MASK 0xFFF8UL
#define REC_HEAP_NO_SHIFT 3
#define REC_N_OWNED 6 /* This is single byte bit-field */
-#define REC_N_OWNED_MASK 0xF
+#define REC_N_OWNED_MASK 0xFUL
#define REC_N_OWNED_SHIFT 0
-#define REC_INFO_BITS_MASK 0xF0
+#define REC_INFO_BITS_MASK 0xF0UL
#define REC_INFO_BITS_SHIFT 0
/* The deleted flag in info bits */
-#define REC_INFO_DELETED_FLAG 0x20 /* when bit is set to 1, it means the
+#define REC_INFO_DELETED_FLAG 0x20UL /* when bit is set to 1, it means the
record has been delete marked */
/* The following masks are used to filter the SQL null bit from
one-byte and two-byte offsets */
-#define REC_1BYTE_SQL_NULL_MASK 0x80
-#define REC_2BYTE_SQL_NULL_MASK 0x8000
+#define REC_1BYTE_SQL_NULL_MASK 0x80UL
+#define REC_2BYTE_SQL_NULL_MASK 0x8000UL
/* In a 2-byte offset the second most significant bit denotes
a field stored to another page: */
-#define REC_2BYTE_EXTERN_MASK 0x4000
+#define REC_2BYTE_EXTERN_MASK 0x4000UL
/****************************************************************
Return field length or UNIV_SQL_NULL. */
@@ -133,7 +133,7 @@ rec_set_bit_field_1(
ut_ad(rec);
ut_ad(offs <= REC_N_EXTRA_BYTES);
ut_ad(mask);
- ut_ad(mask <= 0xFF);
+ ut_ad(mask <= 0xFFUL);
ut_ad(((mask >> shift) << shift) == mask);
ut_ad(((val << shift) & mask) == (val << shift));
@@ -172,8 +172,8 @@ rec_set_bit_field_2(
{
ut_ad(rec);
ut_ad(offs <= REC_N_EXTRA_BYTES);
- ut_ad(mask > 0xFF);
- ut_ad(mask <= 0xFFFF);
+ ut_ad(mask > 0xFFUL);
+ ut_ad(mask <= 0xFFFFUL);
ut_ad((mask >> shift) & 1);
ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1)));
ut_ad(((mask >> shift) << shift) == mask);
@@ -188,8 +188,8 @@ rec_set_bit_field_2(
+ (REC_N_FIELDS_MASK << (8 * (REC_N_FIELDS - 4)))
+ (REC_HEAP_NO_MASK << (8 * (REC_HEAP_NO - 4)))
+ (REC_N_OWNED_MASK << (8 * (REC_N_OWNED - 3)))
- + (REC_INFO_BITS_MASK << (8 * (REC_INFO_BITS - 3))));
- if (m != ut_dbg_zero + 0xFFFFFFFF) {
+ + (REC_INFO_BITS_MASK << (8 * (REC_INFO_BITS - 3))));
+ if (m != ut_dbg_zero + 0xFFFFFFFFUL) {
printf("Sum of masks %lx\n", m);
ut_error;
}
diff --git a/innobase/include/row0mysql.h b/innobase/include/row0mysql.h
index 940b4c61b2f..fade3709631 100644
--- a/innobase/include/row0mysql.h
+++ b/innobase/include/row0mysql.h
@@ -52,6 +52,14 @@ row_mysql_read_var_ref_noninline(
ulint* len, /* out: variable-length field length */
byte* field); /* in: field */
/***********************************************************************
+Frees the blob heap in prebuilt when no longer needed. */
+
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+ row_prebuilt_t* prebuilt); /* in: prebuilt struct of a
+ ha_innobase:: table handle */
+/***********************************************************************
Stores a reference to a BLOB in the MySQL format. */
void
@@ -331,6 +339,45 @@ row_drop_table_for_mysql(
char* name, /* in: table name */
trx_t* trx); /* in: transaction handle */
/*************************************************************************
+Discards the tablespace of a table which is stored in an .ibd file. Discarding
+means that this function deletes the .ibd file and assigns a new table id for
+the table. Also the flag table->ibd_file_missing is set TRUE.
+
+How do we prevent crashes caused by ongoing operations on the table? Old
+operations could try to access non-existent pages.
+
+1) SQL queries, INSERT, SELECT, ...: we must get an exclusive MySQL table lock
+on the table before we can do DISCARD TABLESPACE. Then there are no running
+queries on the table.
+2) Purge and rollback: we assign a new table id for the table. Since purge and
+rollback look for the table based on the table id, they see the table as
+'dropped' and discard their operations.
+3) Insert buffer: we remove all entries for the tablespace in the insert
+buffer tree; as long as the tablespace mem object does not exist, ongoing
+insert buffer page merges are discarded in buf0rea.c. If we recreate the
+tablespace mem object with IMPORT TABLESPACE later, then the tablespace will
+have the same id, but the tablespace_version field in the mem object is
+different, and ongoing old insert buffer page merges get discarded.
+4) Linear readahead and random readahead: we use the same method as in 3) to
+discard ongoing operations. */
+
+int
+row_discard_tablespace_for_mysql(
+/*=============================*/
+ /* out: error code or DB_SUCCESS */
+ char* name, /* in: table name */
+ trx_t* trx); /* in: transaction handle */
+/*********************************************************************
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary. */
+
+int
+row_import_tablespace_for_mysql(
+/*============================*/
+ /* out: error code or DB_SUCCESS */
+ char* name, /* in: table name */
+ trx_t* trx); /* in: transaction handle */
+/*************************************************************************
Drops a database for MySQL. */
int
diff --git a/innobase/include/row0sel.h b/innobase/include/row0sel.h
index 5ef7ff9399a..a35d588ad08 100644
--- a/innobase/include/row0sel.h
+++ b/innobase/include/row0sel.h
@@ -118,7 +118,8 @@ row_search_for_mysql(
/*=================*/
/* out: DB_SUCCESS,
DB_RECORD_NOT_FOUND,
- DB_END_OF_INDEX, or DB_DEADLOCK */
+ DB_END_OF_INDEX, DB_DEADLOCK,
+ or DB_TOO_BIG_RECORD */
byte* buf, /* in/out: buffer for the fetched
row in the MySQL format */
ulint mode, /* in: search mode PAGE_CUR_L, ... */
diff --git a/innobase/include/row0sel.ic b/innobase/include/row0sel.ic
index 9005624b6ca..994638790c0 100644
--- a/innobase/include/row0sel.ic
+++ b/innobase/include/row0sel.ic
@@ -77,7 +77,7 @@ open_step(
if (err != DB_SUCCESS) {
/* SQL error detected */
- printf("SQL error %lu\n", err);
+ printf("SQL error %lu\n", (unsigned long) err);
ut_error;
que_thr_handle_error(thr, err, NULL, 0);
diff --git a/innobase/include/row0upd.ic b/innobase/include/row0upd.ic
index 3e00978be2f..6b9deeac5e3 100644
--- a/innobase/include/row0upd.ic
+++ b/innobase/include/row0upd.ic
@@ -86,8 +86,8 @@ upd_field_set_field_no(
fprintf(stderr,
"InnoDB: Error: trying to access field %lu in table %s\n"
"InnoDB: index %s, but index has only %lu fields\n",
- field_no, index->table_name, index->name,
- dict_index_get_n_fields(index));
+ (unsigned long) field_no, index->table_name, index->name,
+ (unsigned long) dict_index_get_n_fields(index));
}
dtype_copy(dfield_get_type(&(upd_field->new_val)),
diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h
index 769d55fb66c..8aac71de2a9 100644
--- a/innobase/include/srv0srv.h
+++ b/innobase/include/srv0srv.h
@@ -37,6 +37,8 @@ extern ibool srv_lower_case_table_names;
extern char* srv_data_home;
extern char* srv_arch_dir;
+extern ibool srv_file_per_table;
+
extern ulint srv_n_data_files;
extern char** srv_data_file_names;
extern ulint* srv_data_file_sizes;
@@ -62,6 +64,7 @@ extern ulint srv_flush_log_at_trx_commit;
extern byte srv_latin1_ordering[256];/* The sort order table of the latin1
character set */
extern ulint srv_pool_size;
+extern ulint srv_awe_window_size;
extern ulint srv_mem_pool_size;
extern ulint srv_lock_table_size;
@@ -76,12 +79,14 @@ extern char* srv_file_flush_method_str;
extern ulint srv_unix_file_flush_method;
extern ulint srv_win_file_flush_method;
+extern ulint srv_max_n_open_files;
+
extern ulint srv_max_dirty_pages_pct;
extern ulint srv_force_recovery;
extern ulint srv_thread_concurrency;
-extern ulint srv_max_n_threads;
+extern ulint srv_max_n_threads;
extern lint srv_conc_n_threads;
@@ -92,6 +97,8 @@ extern ibool srv_use_doublewrite_buf;
extern ibool srv_set_thread_priorities;
extern int srv_query_thread_priority;
+extern ibool srv_use_awe;
+extern ibool srv_use_adaptive_hash_indexes;
/*-------------------------------------------*/
extern ulint srv_n_rows_inserted;
diff --git a/innobase/include/srv0start.h b/innobase/include/srv0start.h
index c4c8dac5d7a..0074de537c3 100644
--- a/innobase/include/srv0start.h
+++ b/innobase/include/srv0start.h
@@ -11,6 +11,7 @@ Created 10/10/1995 Heikki Tuuri
#define srv0start_h
#include "univ.i"
+#include "ut0byte.h"
/*************************************************************************
Normalizes a directory path for Windows: converts slashes to backslashes. */
@@ -69,12 +70,17 @@ innobase_shutdown_for_mysql(void);
/*=============================*/
/* out: DB_SUCCESS or error code */
+extern dulint srv_shutdown_lsn;
+extern dulint srv_start_lsn;
+
extern ulint srv_sizeof_trx_t_in_ha_innodb_cc;
extern ibool srv_is_being_started;
extern ibool srv_startup_is_before_trx_rollback_phase;
extern ibool srv_is_being_shut_down;
+extern ibool srv_start_raw_disk_in_use;
+
/* At a shutdown the value first climbs from 0 to SRV_SHUTDOWN_CLEANUP
and then to SRV_SHUTDOWN_LAST_PHASE, and so on */
@@ -84,4 +90,7 @@ extern ulint srv_shutdown_state;
#define SRV_SHUTDOWN_LAST_PHASE 2
#define SRV_SHUTDOWN_EXIT_THREADS 3
+/* Log 'spaces' have id's >= this */
+#define SRV_LOG_SPACE_FIRST_ID 0xFFFFFFF0UL
+
#endif
diff --git a/innobase/include/sync0sync.h b/innobase/include/sync0sync.h
index 3acf3415889..3a7203bbb56 100644
--- a/innobase/include/sync0sync.h
+++ b/innobase/include/sync0sync.h
@@ -376,8 +376,8 @@ or row lock! */
#define SYNC_IBUF_HEADER 914
#define SYNC_IBUF_PESS_INSERT_MUTEX 912
#define SYNC_IBUF_MUTEX 910 /* ibuf mutex is really below
- SYNC_FSP_PAGE: we assign value this
- high only to get the program to pass
+ SYNC_FSP_PAGE: we assign a value this
+ high only to make the program pass
the debug checks */
/*-------------------------------*/
#define SYNC_INDEX_TREE 900
@@ -396,7 +396,7 @@ or row lock! */
#define SYNC_FSP_PAGE 395
/*------------------------------------- Insert buffer headers */
/*------------------------------------- ibuf_mutex */
-/*------------------------------------- Insert buffer trees */
+/*------------------------------------- Insert buffer tree */
#define SYNC_IBUF_BITMAP_MUTEX 351
#define SYNC_IBUF_BITMAP 350
/*-------------------------------*/
diff --git a/innobase/include/trx0rseg.ic b/innobase/include/trx0rseg.ic
index 9a6137eb2e5..35e927f5e79 100644
--- a/innobase/include/trx0rseg.ic
+++ b/innobase/include/trx0rseg.ic
@@ -67,7 +67,7 @@ trx_rsegf_get_nth_undo(
{
if (n >= TRX_RSEG_N_SLOTS) {
fprintf(stderr,
- "InnoDB: Error: trying to get slot %lu of rseg\n", n);
+ "InnoDB: Error: trying to get slot %lu of rseg\n", (unsigned long) n);
ut_error;
}
@@ -88,7 +88,7 @@ trx_rsegf_set_nth_undo(
{
if (n >= TRX_RSEG_N_SLOTS) {
fprintf(stderr,
- "InnoDB: Error: trying to set slot %lu of rseg\n", n);
+ "InnoDB: Error: trying to set slot %lu of rseg\n", (unsigned long) n);
ut_error;
}
diff --git a/innobase/include/trx0sys.h b/innobase/include/trx0sys.h
index a8ed675a8a5..0005c4a1711 100644
--- a/innobase/include/trx0sys.h
+++ b/innobase/include/trx0sys.h
@@ -24,18 +24,6 @@ Created 3/26/1996 Heikki Tuuri
#include "fsp0fsp.h"
#include "read0types.h"
-/* Do NOT merge this to the 4.1 code base! */
-extern ibool trx_sys_downgrading_from_4_1_1;
-
-/********************************************************************
-Do NOT merge this to the 4.1 code base!
-Marks the trx sys header when we have successfully downgraded from the >= 4.1.1
-multiple tablespace format back to the 4.0 format. */
-
-void
-trx_sys_mark_downgraded_from_4_1_1(void);
-/*====================================*/
-
/* In a MySQL replication slave, in crash recovery we store the master log
file name and position here. We have successfully got the updates to InnoDB
up to this position. If .._pos is -1, it means no crash recovery was needed,
@@ -49,21 +37,35 @@ extern trx_sys_t* trx_sys;
/* Doublewrite system */
extern trx_doublewrite_t* trx_doublewrite;
+extern ibool trx_doublewrite_must_reset_space_ids;
+extern ibool trx_sys_multiple_tablespace_format;
/********************************************************************
-Creates the doublewrite buffer at a database start. The header of the
+Creates the doublewrite buffer for a new InnoDB installation. The header of the
doublewrite buffer is placed on the trx system header page. */
void
trx_sys_create_doublewrite_buf(void);
/*================================*/
/********************************************************************
-At a database startup uses a possible doublewrite buffer to restore
+At a database startup initializes the doublewrite buffer memory structure if
+we already have a doublewrite buffer created in the data files. If we are
+upgrading to an InnoDB version which supports multiple tablespaces, then this
+function performs the necessary update operations. If we are in a crash
+recovery, this function uses a possible doublewrite buffer to restore
half-written pages in the data files. */
void
-trx_sys_doublewrite_restore_corrupt_pages(void);
-/*===========================================*/
+trx_sys_doublewrite_init_or_restore_pages(
+/*======================================*/
+ ibool restore_corrupt_pages);
+/********************************************************************
+Marks the trx sys header when we have successfully upgraded to the >= 4.1.x
+multiple tablespace format. */
+
+void
+trx_sys_mark_upgraded_to_multiple_tablespaces(void);
+/*===============================================*/
/********************************************************************
Determines if a page number is located inside the doublewrite buffer. */
@@ -367,14 +369,17 @@ this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */
to disk, we still may be able
to recover the information */
#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE)
- /* If this is set to
- .._N, then we are
- DOWNGRADING from >= 4.1.1 to
- 4.0 */
+ /* If this is not yet set to
+ .._N, we must reset the
+ doublewrite buffer, because
+ starting from 4.1.x the space
+ id of a data page is stored to
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_NO */
/*-------------------------------------------------------------*/
#define TRX_SYS_DOUBLEWRITE_MAGIC_N 536853855
#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N 1783657386
+
#define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE
/* Doublewrite control struct */
diff --git a/innobase/include/univ.i b/innobase/include/univ.i
index 4854e5a7b78..cd471a89607 100644
--- a/innobase/include/univ.i
+++ b/innobase/include/univ.i
@@ -65,13 +65,7 @@ Microsoft Visual C++ */
#define HAVE_PWRITE
#endif
-/* Apparently in some old SCO Unixes the return type of sprintf is not
-an integer as it should be according to the modern Posix standard. Because
-of that we define sprintf inside InnoDB code as our own function ut_sprintf */
-#undef sprintf
-#define sprintf ut_sprintf
-
-#endif
+#endif /* #if (defined(WIN32) || ... */
/* DEBUG VERSION CONTROL
===================== */
@@ -88,10 +82,9 @@ memory is read outside the allocated blocks. */
/*
#define UNIV_DEBUG
-#define UNIV_SYNC_DEBUG
#define UNIV_MEM_DEBUG
-
#define UNIV_IBUF_DEBUG
+#define UNIV_SYNC_DEBUG
#define UNIV_SEARCH_DEBUG
#define UNIV_SYNC_PERF_STAT
#define UNIV_SEARCH_PERF_STAT
@@ -182,27 +175,37 @@ management to ensure correct alignment for doubles etc. */
*/
/* Note that inside MySQL 'byte' is defined as char on Linux! */
-#define byte unsigned char
+#define byte unsigned char
-/* Another basic type we use is unsigned long integer which is intended to be
-equal to the word size of the machine. */
+/* Another basic type we use is unsigned long integer which should be equal to
+the word size of the machine, that is on a 32-bit platform 32 bits, and on a
+64-bit platform 64 bits. We also give the printf format for the type as a
+macro ULINTPF. */
#ifdef _WIN64
typedef unsigned __int64 ulint;
+#define ULINTPF "%I64u"
+typedef __int64 lint;
#else
typedef unsigned long int ulint;
-#endif
-
+#define ULINTPF "%lu"
typedef long int lint;
+#endif
#ifdef __WIN__
-typedef __int64 ib_longlong;
+typedef __int64 ib_longlong;
#else
-typedef longlong ib_longlong;
+typedef longlong ib_longlong;
+#endif
+
+#ifndef __WIN__
+#if SIZEOF_LONG != SIZEOF_VOIDP
+#error "Error: InnoDB's ulint must be of the same size as void*"
+#endif
#endif
/* The following type should be at least a 64-bit floating point number */
-typedef double utfloat;
+typedef double utfloat;
/* The 'undefined' value for a ulint */
#define ULINT_UNDEFINED ((ulint)(-1))
@@ -215,7 +218,7 @@ typedef double utfloat;
/* This 'ibool' type is used within Innobase. Remember that different included
headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */
-#define ibool ulint
+#define ibool ulint
#ifndef TRUE
diff --git a/innobase/include/ut0byte.h b/innobase/include/ut0byte.h
index 4fb45221899..4274956421e 100644
--- a/innobase/include/ut0byte.h
+++ b/innobase/include/ut0byte.h
@@ -152,7 +152,7 @@ ut_dulint_align_up(
Increments a dulint variable by 1. */
#define UT_DULINT_INC(D)\
{\
- if ((D).low == 0xFFFFFFFF) {\
+ if ((D).low == 0xFFFFFFFFUL) {\
(D).high = (D).high + 1;\
(D).low = 0;\
} else {\
diff --git a/innobase/include/ut0byte.ic b/innobase/include/ut0byte.ic
index f0df9cc35a3..5a70dcf12a8 100644
--- a/innobase/include/ut0byte.ic
+++ b/innobase/include/ut0byte.ic
@@ -152,13 +152,13 @@ ut_dulint_add(
dulint a, /* in: dulint */
ulint b) /* in: ulint */
{
- if (0xFFFFFFFF - b >= a.low) {
+ if (0xFFFFFFFFUL - b >= a.low) {
a.low += b;
return(a);
}
- a.low = a.low - (0xFFFFFFFF - b) - 1;
+ a.low = a.low - (0xFFFFFFFFUL - b) - 1;
a.high++;
@@ -183,7 +183,7 @@ ut_dulint_subtract(
b -= a.low + 1;
- a.low = 0xFFFFFFFF - b;
+ a.low = 0xFFFFFFFFUL - b;
ut_ad(a.high > 0);
@@ -214,7 +214,7 @@ ut_dulint_minus(
ut_ad(a.high == b.high + 1);
- diff = (ulint)(0xFFFFFFFF - b.low);
+ diff = (ulint)(0xFFFFFFFFUL - b.low);
diff += 1 + a.low;
ut_ad(diff > a.low);
diff --git a/innobase/include/ut0dbg.h b/innobase/include/ut0dbg.h
index bec9cdd42b5..085b4811a73 100644
--- a/innobase/include/ut0dbg.h
+++ b/innobase/include/ut0dbg.h
@@ -27,7 +27,7 @@ extern const char* ut_dbg_msg_stop;
if (!((ulint)(EXPR) + ut_dbg_zero)) {\
ut_print_timestamp(stderr);\
fprintf(stderr, ut_dbg_msg_assert_fail,\
- os_thread_pf(os_thread_get_curr_id()), __FILE__,\
+ os_thread_pf(os_thread_get_curr_id()), IB__FILE__,\
(ulint)__LINE__);\
fputs("InnoDB: Failing assertion: " #EXPR "\n", stderr);\
fputs(ut_dbg_msg_trap, stderr);\
@@ -36,7 +36,7 @@ extern const char* ut_dbg_msg_stop;
}\
if (ut_dbg_stop_threads) {\
fprintf(stderr, ut_dbg_msg_stop,\
- os_thread_pf(os_thread_get_curr_id()), __FILE__, (ulint)__LINE__);\
+ os_thread_pf(os_thread_get_curr_id()), IB__FILE__, (ulint)__LINE__);\
os_thread_sleep(1000000000);\
}\
} while (0)
@@ -44,21 +44,20 @@ extern const char* ut_dbg_msg_stop;
#define ut_error do {\
ut_print_timestamp(stderr);\
fprintf(stderr, ut_dbg_msg_assert_fail,\
- os_thread_pf(os_thread_get_curr_id()), __FILE__, (ulint)__LINE__);\
+ os_thread_pf(os_thread_get_curr_id()), IB__FILE__, (ulint)__LINE__);\
fprintf(stderr, ut_dbg_msg_trap);\
ut_dbg_stop_threads = TRUE;\
if (*(ut_dbg_null_ptr)) ut_dbg_null_ptr = NULL;\
} while (0)
#ifdef UNIV_DEBUG
-# define ut_ad(EXPR) ut_a(EXPR)
-# define ut_d(EXPR) do {EXPR;} while (0)
+#define ut_ad(EXPR) ut_a(EXPR)
+#define ut_d(EXPR) do {EXPR;} while (0)
#else
-# define ut_ad(EXPR)
-# define ut_d(EXPR)
+#define ut_ad(EXPR)
+#define ut_d(EXPR)
#endif
#define UT_NOT_USED(A) A = A
#endif
-
diff --git a/innobase/include/ut0mem.h b/innobase/include/ut0mem.h
index fea6fc243d8..ce8aabeca41 100644
--- a/innobase/include/ut0mem.h
+++ b/innobase/include/ut0mem.h
@@ -50,6 +50,16 @@ ut_malloc(
/* out, own: allocated memory */
ulint n); /* in: number of bytes to allocate */
/**************************************************************************
+Tests if malloc of n bytes would succeed. ut_malloc() asserts if memory runs
+out. It cannot be used if we want to return an error message. Prints a
+message to stderr if the allocation fails. */
+
+ibool
+ut_test_malloc(
+/*===========*/
+ /* out: TRUE if succeeded */
+ ulint n); /* in: try to allocate this many bytes */
+/**************************************************************************
Frees a memory bloock allocated with ut_malloc. */
void
diff --git a/innobase/include/ut0ut.h b/innobase/include/ut0ut.h
index 8ec23b23dcd..4517b8f8d40 100644
--- a/innobase/include/ut0ut.h
+++ b/innobase/include/ut0ut.h
@@ -19,14 +19,47 @@ typedef time_t ib_time_t;
/************************************************************
-Uses vsprintf to emulate sprintf so that the function always returns
-the printed length. Apparently in some old SCO Unixes sprintf did not
-return the printed length but a pointer to the end of the printed string. */
+On the 64-bit Windows we substitute the format string
+%l -> %I64
+because we define ulint as unsigned __int64 and lint as __int64 on Windows,
+and both the Microsoft and Intel C compilers require the format string
+%I64 in that case instead of %l. */
-ulint
+int
+ut_printf(
+/*======*/
+ /* out: the number of characters written, or
+ negative in case of an error */
+ const char* format, /* in: format of prints */
+ ...); /* in: arguments to be printed */
+/************************************************************
+On the 64-bit Windows we substitute the format string
+%l -> %I64
+because we define ulint as unsigned __int64 and lint as __int64 on Windows,
+and both the Microsoft and Intel C compilers require the format string
+%I64 in that case instead of %l. */
+
+int
ut_sprintf(
/*=======*/
- char* buf, /* in/out: buffer where to print */
+ /* out: the number of characters written, or
+ negative in case of an error */
+ char* buf, /* in: buffer where to print */
+ const char* format, /* in: format of prints */
+ ...); /* in: arguments to be printed */
+/************************************************************
+On the 64-bit Windows we substitute the format string
+%l -> %I64
+because we define ulint as unsigned __int64 and lint as __int64 on Windows,
+and both the Microsoft and Intel C compilers require the format string
+%I64 in that case instead of %l. */
+
+int
+ut_fprintf(
+/*=======*/
+ /* out: the number of characters written, or
+ negative in case of an error */
+ FILE* stream, /* in: stream where to print */
const char* format, /* in: format of prints */
...); /* in: arguments to be printed */
/************************************************************
@@ -139,7 +172,7 @@ void
ut_ulint_sort(ulint* arr, ulint* aux_arr, ulint low, ulint high);
/*============================================================*/
/************************************************************
-The following function returns a clock time in milliseconds. */
+The following function returns elapsed CPU time in milliseconds. */
ulint
ut_clock(void);
@@ -174,6 +207,14 @@ ut_sprintf_timestamp(
/*=================*/
char* buf); /* in: buffer where to sprintf */
/**************************************************************
+Sprintfs a timestamp to a buffer with no spaces and with ':' characters
+replaced by '_'. */
+
+void
+ut_sprintf_timestamp_without_extra_chars(
+/*=====================================*/
+ char* buf); /* in: buffer where to sprintf */
+/**************************************************************
Returns current year, month, day. */
void
diff --git a/innobase/include/ut0ut.ic b/innobase/include/ut0ut.ic
index 9d7dd283f29..9a0ef1c0d5b 100644
--- a/innobase/include/ut0ut.ic
+++ b/innobase/include/ut0ut.ic
@@ -110,7 +110,7 @@ ut_2pow_remainder(
ulint n, /* in: number to be divided */
ulint m) /* in: divisor; power of 2 */
{
- ut_ad(0x80000000 % m == 0);
+ ut_ad(0x80000000UL % m == 0);
return(n & (m - 1));
}
@@ -125,7 +125,7 @@ ut_2pow_round(
ulint n, /* in: number to be rounded */
ulint m) /* in: divisor; power of 2 */
{
- ut_ad(0x80000000 % m == 0);
+ ut_ad(0x80000000UL % m == 0);
return(n & ~(m - 1));
}
diff --git a/innobase/lock/lock0lock.c b/innobase/lock/lock0lock.c
index c706ebceaec..2430380d65c 100644
--- a/innobase/lock/lock0lock.c
+++ b/innobase/lock/lock0lock.c
@@ -375,10 +375,10 @@ lock_check_trx_id_sanity(
"InnoDB: is %lu %lu which is higher than the global trx id counter %lu %lu!\n"
"InnoDB: The table is corrupt. You have to do dump + drop + reimport.\n",
err_buf, index->table_name, index->name,
- ut_dulint_get_high(trx_id),
- ut_dulint_get_low(trx_id),
- ut_dulint_get_high(trx_sys->max_trx_id),
- ut_dulint_get_low(trx_sys->max_trx_id));
+ (ulong) ut_dulint_get_high(trx_id),
+ (ulong) ut_dulint_get_low(trx_id),
+ (ulong) ut_dulint_get_high(trx_sys->max_trx_id),
+ (ulong) ut_dulint_get_low(trx_sys->max_trx_id));
is_ok = FALSE;
}
@@ -1657,7 +1657,8 @@ index->table_name);
if (lock_print_waits) {
printf("Lock wait for trx %lu in index %s\n",
- ut_dulint_get_low(trx->id), index->name);
+ (ulong) ut_dulint_get_low(trx->id),
+ index->name);
}
return(DB_LOCK_WAIT);
@@ -1996,7 +1997,7 @@ lock_grant(
if (lock_print_waits) {
printf("Lock wait for trx %lu ends\n",
- ut_dulint_get_low(lock->trx->id));
+ (ulong) ut_dulint_get_low(lock->trx->id));
}
/* If we are resolving a deadlock by choosing another transaction
@@ -3564,7 +3565,8 @@ lock_release_off_kernel(
ut_ad(lock_get_type(lock) == LOCK_TABLE);
if (lock_get_mode(lock) != LOCK_IS
- && (trx->insert_undo || trx->update_undo)) {
+ && 0 != ut_dulint_cmp(trx->undo_no,
+ ut_dulint_zero)) {
/* The trx may have modified the table.
We block the use of the MySQL query cache
@@ -3713,7 +3715,7 @@ lock_table_print(
buf += sprintf(buf, "TABLE LOCK table %s trx id %lu %lu",
lock->un_member.tab_lock.table->name,
- (lock->trx)->id.high, (lock->trx)->id.low);
+ (ulong) (lock->trx)->id.high, (ulong) (lock->trx)->id.low);
if (lock_get_mode(lock) == LOCK_S) {
buf += sprintf(buf, " lock mode S");
@@ -3727,7 +3729,7 @@ lock_table_print(
buf += sprintf(buf, " lock_mode AUTO-INC");
} else {
buf += sprintf(buf,
- " unknown lock_mode %lu", lock_get_mode(lock));
+ " unknown lock_mode %lu", (ulong) lock_get_mode(lock));
}
if (lock_get_wait(lock)) {
@@ -3764,11 +3766,13 @@ lock_rec_print(
page_no = lock->un_member.rec_lock.page_no;
buf += sprintf(buf, "RECORD LOCKS space id %lu page no %lu n bits %lu",
- space, page_no, lock_rec_get_n_bits(lock));
+ (ulong) space, (ulong) page_no,
+ (ulong) lock_rec_get_n_bits(lock));
buf += sprintf(buf, " table %s index %s trx id %lu %lu",
- lock->index->table->name, lock->index->name,
- (lock->trx)->id.high, (lock->trx)->id.low);
+ lock->index->table->name, lock->index->name,
+ (ulong) (lock->trx)->id.high,
+ (ulong) (lock->trx)->id.low);
if (lock_get_mode(lock) == LOCK_S) {
buf += sprintf(buf, " lock mode S");
@@ -3838,7 +3842,8 @@ lock_rec_print(
if (lock_rec_get_nth_bit(lock, i)) {
- buf += sprintf(buf, "Record lock, heap no %lu ", i);
+ buf += sprintf(buf, "Record lock, heap no %lu ",
+ (ulong) i);
if (page) {
buf += rec_sprintf(buf, 120,
@@ -3943,19 +3948,19 @@ lock_print_info(
"------------\n");
buf += sprintf(buf, "Trx id counter %lu %lu\n",
- ut_dulint_get_high(trx_sys->max_trx_id),
- ut_dulint_get_low(trx_sys->max_trx_id));
+ (ulong) ut_dulint_get_high(trx_sys->max_trx_id),
+ (ulong) ut_dulint_get_low(trx_sys->max_trx_id));
buf += sprintf(buf,
"Purge done for trx's n:o < %lu %lu undo n:o < %lu %lu\n",
- ut_dulint_get_high(purge_sys->purge_trx_no),
- ut_dulint_get_low(purge_sys->purge_trx_no),
- ut_dulint_get_high(purge_sys->purge_undo_no),
- ut_dulint_get_low(purge_sys->purge_undo_no));
+ (ulong) ut_dulint_get_high(purge_sys->purge_trx_no),
+ (ulong) ut_dulint_get_low(purge_sys->purge_trx_no),
+ (ulong) ut_dulint_get_high(purge_sys->purge_undo_no),
+ (ulong) ut_dulint_get_low(purge_sys->purge_undo_no));
buf += sprintf(buf,
"Total number of lock structs in row lock hash table %lu\n",
- lock_get_n_rec_locks());
+ (ulong) lock_get_n_rec_locks());
buf += sprintf(buf, "LIST OF TRANSACTIONS FOR EACH SESSION:\n");
@@ -4027,16 +4032,16 @@ loop:
if (trx->read_view) {
buf += sprintf(buf,
"Trx read view will not see trx with id >= %lu %lu, sees < %lu %lu\n",
- ut_dulint_get_high(trx->read_view->low_limit_id),
- ut_dulint_get_low(trx->read_view->low_limit_id),
- ut_dulint_get_high(trx->read_view->up_limit_id),
- ut_dulint_get_low(trx->read_view->up_limit_id));
+ (ulong) ut_dulint_get_high(trx->read_view->low_limit_id),
+ (ulong) ut_dulint_get_low(trx->read_view->low_limit_id),
+ (ulong) ut_dulint_get_high(trx->read_view->up_limit_id),
+ (ulong) ut_dulint_get_low(trx->read_view->up_limit_id));
}
if (trx->que_state == TRX_QUE_LOCK_WAIT) {
buf += sprintf(buf,
"------- TRX HAS BEEN WAITING %lu SEC FOR THIS LOCK TO BE GRANTED:\n",
- (ulint)difftime(time(NULL), trx->wait_started));
+ (ulong)difftime(time(NULL), trx->wait_started));
if (lock_get_type(trx->wait_lock) == LOCK_REC) {
lock_rec_print(buf, trx->wait_lock);
@@ -4333,7 +4338,8 @@ loop:
index = lock->index;
rec = page_find_rec_with_heap_no(page, i);
- printf("Validating %lu %lu\n", space, page_no);
+ printf("Validating %lu %lu\n", (ulong) space,
+ (ulong) page_no);
lock_mutex_exit_kernel();
diff --git a/innobase/log/log0log.c b/innobase/log/log0log.c
index ec0db57564a..095d84f6527 100644
--- a/innobase/log/log0log.c
+++ b/innobase/log/log0log.c
@@ -24,7 +24,8 @@ Created 12/9/1995 Heikki Tuuri
#include "trx0sys.h"
#include "trx0trx.h"
-/* Current free limit; protected by the log sys mutex; 0 means uninitialized */
+/* Current free limit of space 0; protected by the log sys mutex; 0 means
+uninitialized */
ulint log_fsp_current_free_limit = 0;
/* Global log system variable */
@@ -94,14 +95,6 @@ static
void
log_io_complete_archive(void);
/*=========================*/
-/********************************************************************
-Tries to establish a big enough margin of free space in the log groups, such
-that a new log entry can be catenated without an immediate need for a
-archiving. */
-static
-void
-log_archive_margin(void);
-/*====================*/
/********************************************************************
Sets the global variable log_fsp_current_free_limit. Also makes a checkpoint,
@@ -197,11 +190,10 @@ loop:
if (log->archiving_state != LOG_ARCH_OFF) {
- archived_lsn_age = ut_dulint_minus(log->lsn, log->archived_lsn);
-
+ archived_lsn_age = ut_dulint_minus(log->lsn,
+ log->archived_lsn);
if (archived_lsn_age + len_upper_limit
> log->max_archived_lsn_age) {
-
/* Not enough free archived space in log groups: do a
synchronous archive write batch: */
@@ -356,7 +348,8 @@ log_close(void)
"InnoDB: If you are using big BLOB or TEXT rows, you must set the\n"
"InnoDB: combined size of log files at least 10 times bigger than the\n"
"InnoDB: largest such row.\n",
- checkpoint_age, log->log_group_capacity);
+ (ulong) checkpoint_age,
+ (ulong) log->log_group_capacity);
}
}
@@ -478,7 +471,8 @@ ulint
log_group_calc_lsn_offset(
/*======================*/
/* out: offset within the log group */
- dulint lsn, /* in: lsn, must be within 4 GB of group->lsn */
+ dulint lsn, /* in: lsn, must be within 4 GB of
+ group->lsn */
log_group_t* group) /* in: log group */
{
dulint gr_lsn;
@@ -771,7 +765,8 @@ log_init(void)
memset(log_sys->checkpoint_buf, '\0', OS_FILE_LOG_BLOCK_SIZE);
/*----------------------------*/
- log_sys->archiving_state = LOG_ARCH_ON;
+ /* Under MySQL, log archiving is always off */
+ log_sys->archiving_state = LOG_ARCH_OFF;
log_sys->archived_lsn = log_sys->lsn;
log_sys->next_archived_lsn = ut_dulint_zero;
@@ -780,13 +775,15 @@ log_init(void)
rw_lock_create(&(log_sys->archive_lock));
rw_lock_set_level(&(log_sys->archive_lock), SYNC_NO_ORDER_CHECK);
- log_sys->archive_buf = ut_align(
+ log_sys->archive_buf = NULL;
+
+ /* ut_align(
ut_malloc(LOG_ARCHIVE_BUF_SIZE
+ OS_FILE_LOG_BLOCK_SIZE),
- OS_FILE_LOG_BLOCK_SIZE);
- log_sys->archive_buf_size = LOG_ARCHIVE_BUF_SIZE;
+ OS_FILE_LOG_BLOCK_SIZE); */
+ log_sys->archive_buf_size = 0;
- memset(log_sys->archive_buf, '\0', LOG_ARCHIVE_BUF_SIZE);
+ /* memset(log_sys->archive_buf, '\0', LOG_ARCHIVE_BUF_SIZE); */
log_sys->archiving_on = os_event_create(NULL);
@@ -933,7 +930,8 @@ log_group_check_flush_completion(
if (!log_sys->one_flushed && group->n_pending_writes == 0) {
if (log_debug_writes) {
- printf("Log flushed first to group %lu\n", group->id);
+ printf("Log flushed first to group %lu\n",
+ (ulong) group->id);
}
log_sys->written_to_some_lsn = log_sys->write_lsn;
@@ -944,7 +942,7 @@ log_group_check_flush_completion(
if (log_debug_writes && (group->n_pending_writes == 0)) {
- printf("Log flushed to group %lu\n", group->id);
+ printf("Log flushed to group %lu\n", (ulong) group->id);
}
return(0);
@@ -1011,7 +1009,7 @@ log_io_complete(
return;
}
- if ((ulint)group & 0x1) {
+ if ((ulint)group & 0x1UL) {
/* It was a checkpoint write */
group = (log_group_t*)((ulint)group - 1);
@@ -1088,8 +1086,8 @@ log_group_file_header_flush(
if (log_debug_writes) {
printf(
- "Writing log file header to group %lu file %lu\n", group->id,
- nth_file);
+ "Writing log file header to group %lu file %lu\n",
+ (ulong) group->id, (ulong) nth_file);
}
if (log_do_write) {
@@ -1169,7 +1167,8 @@ loop:
if ((next_offset % group->file_size) + len > group->file_size) {
- write_len = group->file_size - (next_offset % group->file_size);
+ write_len = group->file_size
+ - (next_offset % group->file_size);
} else {
write_len = len;
}
@@ -1179,13 +1178,14 @@ loop:
printf(
"Writing log file segment to group %lu offset %lu len %lu\n"
"start lsn %lu %lu\n",
- group->id, next_offset, write_len,
- ut_dulint_get_high(start_lsn),
- ut_dulint_get_low(start_lsn));
+ (ulong) group->id, (ulong) next_offset,
+ (ulong) write_len,
+ (ulong) ut_dulint_get_high(start_lsn),
+ (ulong) ut_dulint_get_low(start_lsn));
printf(
"First block n:o %lu last block n:o %lu\n",
- log_block_get_hdr_no(buf),
- log_block_get_hdr_no(
+ (ulong) log_block_get_hdr_no(buf),
+ (ulong) log_block_get_hdr_no(
buf + write_len - OS_FILE_LOG_BLOCK_SIZE));
ut_a(log_block_get_hdr_no(buf)
== log_block_convert_lsn_to_no(start_lsn));
@@ -1326,10 +1326,10 @@ loop:
if (log_debug_writes) {
printf("Writing log from %lu %lu up to lsn %lu %lu\n",
- ut_dulint_get_high(log_sys->written_to_all_lsn),
- ut_dulint_get_low(log_sys->written_to_all_lsn),
- ut_dulint_get_high(log_sys->lsn),
- ut_dulint_get_low(log_sys->lsn));
+ (ulong) ut_dulint_get_high(log_sys->written_to_all_lsn),
+ (ulong) ut_dulint_get_low(log_sys->written_to_all_lsn),
+ (ulong) ut_dulint_get_high(log_sys->lsn),
+ (ulong) ut_dulint_get_low(log_sys->lsn));
}
log_sys->n_pending_writes++;
@@ -1567,7 +1567,8 @@ log_io_complete_checkpoint(
log_sys->n_pending_checkpoint_writes--;
if (log_debug_writes) {
- printf("Checkpoint info written to group %lu\n", group->id);
+ printf("Checkpoint info written to group %lu\n",
+ (ulong) group->id);
}
if (log_sys->n_pending_checkpoint_writes == 0) {
@@ -1722,7 +1723,7 @@ log_group_checkpoint(
OS_FILE_LOG_BLOCK_SIZE,
buf, ((byte*)group + 1));
- ut_ad(((ulint)group & 0x1) == 0);
+ ut_ad(((ulint)group & 0x1UL) == 0);
}
}
@@ -1896,9 +1897,9 @@ log_checkpoint(
if (log_debug_writes) {
printf("Making checkpoint no %lu at lsn %lu %lu\n",
- ut_dulint_get_low(log_sys->next_checkpoint_no),
- ut_dulint_get_high(oldest_lsn),
- ut_dulint_get_low(oldest_lsn));
+ (ulong) ut_dulint_get_low(log_sys->next_checkpoint_no),
+ (ulong) ut_dulint_get_high(oldest_lsn),
+ (ulong) ut_dulint_get_low(oldest_lsn));
}
log_groups_write_checkpoint_info();
@@ -2125,9 +2126,11 @@ log_archived_file_name_gen(
ulint id, /* in: group id */
ulint file_no)/* in: file number */
{
+ ut_a(0);
+
UT_NOT_USED(id); /* Currently we only archive the first group */
- sprintf(buf, "%sib_arch_log_%010lu", srv_arch_dir, file_no);
+ sprintf(buf, "%sib_arch_log_%010lu", srv_arch_dir, (ulong) file_no);
}
/**********************************************************
@@ -2147,6 +2150,8 @@ log_group_archive_file_header_write(
ulint dest_offset;
#ifdef UNIV_SYNC_DEBUG
+ ut_a(0);
+
ut_ad(mutex_own(&(log_sys->mutex)));
#endif /* UNIV_SYNC_DEBUG */
@@ -2186,6 +2191,8 @@ log_group_archive_completed_header_write(
ulint dest_offset;
#ifdef UNIV_SYNC_DEBUG
+ ut_a(0);
+
ut_ad(mutex_own(&(log_sys->mutex)));
#endif /* UNIV_SYNC_DEBUG */
ut_a(nth_file < group->n_files);
@@ -2227,6 +2234,8 @@ log_group_archive(
ulint open_mode;
#ifdef UNIV_SYNC_DEBUG
+ ut_a(0);
+
ut_ad(mutex_own(&(log_sys->mutex)));
#endif /* UNIV_SYNC_DEBUG */
@@ -2258,7 +2267,6 @@ loop:
log_archived_file_name_gen(name, group->id,
group->archived_file_no + n_files);
- fil_reserve_right_to_open();
file_handle = os_file_create(name, open_mode, OS_FILE_AIO,
OS_DATA_FILE, &ret);
@@ -2269,10 +2277,10 @@ loop:
}
if (!ret) {
- fprintf(stderr,
+ fprintf(stderr,
"InnoDB: Cannot create or open archive log file %s.\n",
name);
- fprintf(stderr, "InnoDB: Cannot continue operation.\n"
+ fprintf(stderr, "InnoDB: Cannot continue operation.\n"
"InnoDB: Check that the log archive directory exists,\n"
"InnoDB: you have access rights to it, and\n"
"InnoDB: there is space available.\n");
@@ -2287,12 +2295,10 @@ loop:
ut_a(ret);
- fil_release_right_to_open();
-
/* Add the archive file as a node to the space */
fil_node_create(name, group->file_size / UNIV_PAGE_SIZE,
- group->archive_space_id);
+ group->archive_space_id, FALSE);
if (next_offset % group->file_size == 0) {
log_group_archive_file_header_write(group, n_files,
@@ -2313,9 +2319,9 @@ loop:
if (log_debug_writes) {
printf(
"Archiving starting at lsn %lu %lu, len %lu to group %lu\n",
- ut_dulint_get_high(start_lsn),
- ut_dulint_get_low(start_lsn),
- len, group->id);
+ (ulong) ut_dulint_get_high(start_lsn),
+ (ulong) ut_dulint_get_low(start_lsn),
+ (ulong) len, (ulong) group->id);
}
log_sys->n_pending_archive_ios++;
@@ -2357,6 +2363,8 @@ log_archive_groups(void)
log_group_t* group;
#ifdef UNIV_SYNC_DEBUG
+ ut_a(0);
+
ut_ad(mutex_own(&(log_sys->mutex)));
#endif /* UNIV_SYNC_DEBUG */
@@ -2382,6 +2390,8 @@ log_archive_write_complete_groups(void)
ulint i;
#ifdef UNIV_SYNC_DEBUG
+ ut_a(0);
+
ut_ad(mutex_own(&(log_sys->mutex)));
#endif /* UNIV_SYNC_DEBUG */
@@ -2409,7 +2419,7 @@ log_archive_write_complete_groups(void)
if (log_debug_writes && trunc_files) {
printf("Complete file(s) archived to group %lu\n",
- group->id);
+ (ulong) group->id);
}
/* Calculate the archive file space start lsn */
@@ -2446,6 +2456,8 @@ log_archive_check_completion_low(void)
/*==================================*/
{
#ifdef UNIV_SYNC_DEBUG
+ ut_a(0);
+
ut_ad(mutex_own(&(log_sys->mutex)));
#endif /* UNIV_SYNC_DEBUG */
@@ -2483,6 +2495,8 @@ log_io_complete_archive(void)
{
log_group_t* group;
+ ut_a(0);
+
mutex_enter(&(log_sys->mutex));
group = UT_LIST_GET_FIRST(log_sys->log_groups);
@@ -2518,6 +2532,8 @@ log_archive_do(
dulint start_lsn;
dulint limit_lsn;
+ ut_a(0);
+
calc_new_limit = TRUE;
loop:
mutex_enter(&(log_sys->mutex));
@@ -2544,7 +2560,7 @@ loop:
start_lsn = log_sys->archived_lsn;
if (calc_new_limit) {
- ut_a(log_sys->archive_buf_size % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_a(log_sys->archive_buf_size % OS_FILE_LOG_BLOCK_SIZE == 0);
limit_lsn = ut_dulint_add(start_lsn,
log_sys->archive_buf_size);
@@ -2600,10 +2616,10 @@ loop:
if (log_debug_writes) {
printf("Archiving from lsn %lu %lu to lsn %lu %lu\n",
- ut_dulint_get_high(log_sys->archived_lsn),
- ut_dulint_get_low(log_sys->archived_lsn),
- ut_dulint_get_high(limit_lsn),
- ut_dulint_get_low(limit_lsn));
+ (ulong) ut_dulint_get_high(log_sys->archived_lsn),
+ (ulong) ut_dulint_get_low(log_sys->archived_lsn),
+ (ulong) ut_dulint_get_high(limit_lsn),
+ (ulong) ut_dulint_get_low(limit_lsn));
}
/* Read the log segment to the archive buffer */
@@ -2643,6 +2659,8 @@ log_archive_all(void)
return;
}
+ ut_a(0);
+
present_lsn = log_sys->lsn;
mutex_exit(&(log_sys->mutex));
@@ -2682,11 +2700,17 @@ log_archive_close_groups(
ut_ad(mutex_own(&(log_sys->mutex)));
#endif /* UNIV_SYNC_DEBUG */
+ if (log_sys->archiving_state == LOG_ARCH_OFF) {
+
+ return;
+ }
+
+ ut_a(0);
+
group = UT_LIST_GET_FIRST(log_sys->log_groups);
trunc_len = UNIV_PAGE_SIZE
* fil_space_get_size(group->archive_space_id);
-
if (trunc_len > 0) {
ut_a(trunc_len == group->file_size);
@@ -2706,7 +2730,8 @@ log_archive_close_groups(
if (log_debug_writes) {
printf(
"Incrementing arch file no to %lu in log group %lu\n",
- group->archived_file_no + 2, group->id);
+ (ulong) group->archived_file_no + 2,
+ (ulong) group->id);
}
}
}
@@ -2714,17 +2739,18 @@ log_archive_close_groups(
/********************************************************************
Writes the log contents to the archive up to the lsn when this function was
called, and stops the archiving. When archiving is started again, the archived
-log file numbers start from 2 higher, so that the archiving will
-not write again to the archived log files which exist when this function
-returns. */
+log file numbers start from 2 higher, so that the archiving will not write
+again to the archived log files which exist when this function returns. */
ulint
log_archive_stop(void)
/*==================*/
- /* out: DB_SUCCESS or DB_ERROR */
+ /* out: DB_SUCCESS or DB_ERROR */
{
ibool success;
+ ut_a(0);
+
mutex_enter(&(log_sys->mutex));
if (log_sys->archiving_state != LOG_ARCH_ON) {
@@ -2737,7 +2763,7 @@ log_archive_stop(void)
log_sys->archiving_state = LOG_ARCH_STOPPING;
mutex_exit(&(log_sys->mutex));
-
+
log_archive_all();
mutex_enter(&(log_sys->mutex));
@@ -2758,7 +2784,7 @@ log_archive_stop(void)
if appropriate */
log_archive_close_groups(TRUE);
-
+
mutex_exit(&(log_sys->mutex));
/* Make a checkpoint, so that if recovery is needed, the file numbers
@@ -2787,6 +2813,8 @@ log_archive_start(void)
/*===================*/
/* out: DB_SUCCESS or DB_ERROR */
{
+ ut_a(0);
+
mutex_enter(&(log_sys->mutex));
if (log_sys->archiving_state != LOG_ARCH_STOPPED) {
@@ -2813,6 +2841,7 @@ log_archive_noarchivelog(void)
/*==========================*/
/* out: DB_SUCCESS or DB_ERROR */
{
+ ut_a(0);
loop:
mutex_enter(&(log_sys->mutex));
@@ -2845,6 +2874,7 @@ log_archive_archivelog(void)
/*========================*/
/* out: DB_SUCCESS or DB_ERROR */
{
+ ut_a(0);
mutex_enter(&(log_sys->mutex));
if (log_sys->archiving_state == LOG_ARCH_OFF) {
@@ -2852,7 +2882,7 @@ log_archive_archivelog(void)
log_sys->archiving_state = LOG_ARCH_ON;
log_sys->archived_lsn = ut_dulint_align_down(log_sys->lsn,
- OS_FILE_LOG_BLOCK_SIZE);
+ OS_FILE_LOG_BLOCK_SIZE);
mutex_exit(&(log_sys->mutex));
return(DB_SUCCESS);
@@ -2863,6 +2893,7 @@ log_archive_archivelog(void)
return(DB_ERROR);
}
+#ifdef notdefined
/********************************************************************
Tries to establish a big enough margin of free space in the log groups, such
that a new log entry can be catenated without an immediate need for
@@ -2916,6 +2947,7 @@ loop:
goto loop;
}
}
+#endif
/************************************************************************
Checks that there is enough free space in the log to start a new query step.
@@ -2932,7 +2964,7 @@ loop:
log_checkpoint_margin();
- log_archive_margin();
+ /* log_archive_margin(); */
mutex_enter(&(log_sys->mutex));
@@ -3070,7 +3102,7 @@ loop:
goto loop;
}
- log_archive_all();
+ /* log_archive_all(); */
log_make_checkpoint_at(ut_dulint_max, TRUE);
mutex_enter(&(log_sys->mutex));
@@ -3088,15 +3120,16 @@ loop:
goto loop;
}
- arch_log_no =
+ arch_log_no = 0;
+/*
UT_LIST_GET_FIRST(log_sys->log_groups)->archived_file_no;
if (0 == UT_LIST_GET_FIRST(log_sys->log_groups)->archived_offset) {
arch_log_no--;
}
-
- log_archive_close_groups(TRUE);
+*/
+ /* log_archive_close_groups(TRUE); */
mutex_exit(&(log_sys->mutex));
@@ -3145,10 +3178,24 @@ loop:
ut_a(buf_all_freed());
ut_a(0 == ut_dulint_cmp(lsn, log_sys->lsn));
+ if (ut_dulint_cmp(lsn, srv_start_lsn) < 0) {
+ fprintf(stderr,
+"InnoDB: Error: log sequence number at shutdown %lu %lu\n"
+"InnoDB: is lower than at startup %lu %lu!\n",
+ (ulong) ut_dulint_get_high(lsn),
+ (ulong) ut_dulint_get_low(lsn),
+ (ulong) ut_dulint_get_high(srv_start_lsn),
+ (ulong) ut_dulint_get_low(srv_start_lsn));
+ }
+
+ srv_shutdown_lsn = lsn;
+
fil_write_flushed_lsn_to_data_files(lsn, arch_log_no);
fil_flush_file_spaces(FIL_TABLESPACE);
+ fil_close_all_files();
+
/* Make some checks that the server really is quiet */
ut_a(srv_n_threads_active[SRV_MASTER] == 0);
ut_a(buf_all_freed());
@@ -3192,8 +3239,8 @@ log_check_log_recs(
ut_memcpy(scan_buf, start, end - start);
recv_scan_log_recs(TRUE,
- buf_pool_get_curr_size() -
- RECV_POOL_N_FREE_BLOCKS * UNIV_PAGE_SIZE,
+ (buf_pool->n_frames -
+ recv_n_pool_free_frames) * UNIV_PAGE_SIZE,
FALSE, scan_buf, end - start,
ut_dulint_align_down(buf_start_lsn,
OS_FILE_LOG_BLOCK_SIZE),
@@ -3252,12 +3299,12 @@ log_print(
buf += sprintf(buf, "Log sequence number %lu %lu\n"
"Log flushed up to %lu %lu\n"
"Last checkpoint at %lu %lu\n",
- ut_dulint_get_high(log_sys->lsn),
- ut_dulint_get_low(log_sys->lsn),
- ut_dulint_get_high(log_sys->flushed_to_disk_lsn),
- ut_dulint_get_low(log_sys->flushed_to_disk_lsn),
- ut_dulint_get_high(log_sys->last_checkpoint_lsn),
- ut_dulint_get_low(log_sys->last_checkpoint_lsn));
+ (ulong) ut_dulint_get_high(log_sys->lsn),
+ (ulong) ut_dulint_get_low(log_sys->lsn),
+ (ulong) ut_dulint_get_high(log_sys->flushed_to_disk_lsn),
+ (ulong) ut_dulint_get_low(log_sys->flushed_to_disk_lsn),
+ (ulong) ut_dulint_get_high(log_sys->last_checkpoint_lsn),
+ (ulong) ut_dulint_get_low(log_sys->last_checkpoint_lsn));
current_time = time(NULL);
@@ -3266,10 +3313,10 @@ log_print(
buf += sprintf(buf,
"%lu pending log writes, %lu pending chkp writes\n"
"%lu log i/o's done, %.2f log i/o's/second\n",
- log_sys->n_pending_writes,
- log_sys->n_pending_checkpoint_writes,
- log_sys->n_log_ios,
- (log_sys->n_log_ios - log_sys->n_log_ios_old) / time_elapsed);
+ (ulong) log_sys->n_pending_writes,
+ (ulong) log_sys->n_pending_checkpoint_writes,
+ (ulong) log_sys->n_log_ios,
+ ((log_sys->n_log_ios - log_sys->n_log_ios_old) / time_elapsed));
log_sys->n_log_ios_old = log_sys->n_log_ios;
log_sys->last_printout_time = current_time;
diff --git a/innobase/log/log0recv.c b/innobase/log/log0recv.c
index 323d6c63f71..f80181cc207 100644
--- a/innobase/log/log0recv.c
+++ b/innobase/log/log0recv.c
@@ -17,6 +17,7 @@ Created 9/20/1997 Heikki Tuuri
#include "buf0flu.h"
#include "buf0rea.h"
#include "srv0srv.h"
+#include "srv0start.h"
#include "mtr0mtr.h"
#include "mtr0log.h"
#include "page0page.h"
@@ -33,6 +34,11 @@ Created 9/20/1997 Heikki Tuuri
#include "dict0boot.h"
#include "fil0fil.h"
+/* This is set to FALSE if the backup was originally taken with the
+ibbackup --include regexp option: then we do not want to create tables in
+directories which were not included */
+ibool recv_replay_file_ops = TRUE;
+
/* Log records are stored in the hash table in chunks at most of this size;
this must be less than UNIV_PAGE_SIZE as it is stored in the buffer pool */
#define RECV_DATA_BLOCK_SIZE (MEM_MAX_ALLOC_IN_BUF - sizeof(recv_data_t))
@@ -73,6 +79,13 @@ ulint recv_previous_parsed_rec_is_multi = 0;
ulint recv_max_parsed_page_no = 0;
+/* This many frames must be left free in the buffer pool when we scan
+the log and store the scanned log records in the buffer pool: we will
+use these free frames to read in pages when we start applying the
+log records to the database. */
+
+ulint recv_n_pool_free_frames = 256;
+
/* The maximum lsn we see for a page during the recovery process. If this
is bigger than the lsn we are able to scan up to, that is an indication that
the recovery failed and the database may be corrupt. */
@@ -159,7 +172,8 @@ recv_sys_empty_hash(void)
fprintf(stderr,
"InnoDB: Error: %lu pages with log records were left unprocessed!\n"
"InnoDB: Maximum page number with log records on it %lu\n",
- recv_sys->n_addrs, recv_max_parsed_page_no);
+ (ulong) recv_sys->n_addrs,
+ (ulong) recv_max_parsed_page_no);
ut_error;
}
@@ -300,7 +314,8 @@ recv_copy_group(
/*============*/
log_group_t* up_to_date_group, /* in: the most up-to-date log
group */
- log_group_t* group, /* in: copy to this log group */
+ log_group_t* group, /* in: copy to this log
+ group */
dulint recovered_lsn) /* in: recovery succeeded up
to this lsn */
{
@@ -366,7 +381,8 @@ recv_synchronize_groups(
/* Read the last recovered log block to the recovery system buffer:
the block is always incomplete */
- start_lsn = ut_dulint_align_down(recovered_lsn, OS_FILE_LOG_BLOCK_SIZE);
+ start_lsn = ut_dulint_align_down(recovered_lsn,
+ OS_FILE_LOG_BLOCK_SIZE);
end_lsn = ut_dulint_align_up(recovered_lsn, OS_FILE_LOG_BLOCK_SIZE);
ut_a(ut_dulint_cmp(start_lsn, end_lsn) != 0);
@@ -422,7 +438,7 @@ recv_check_cp_is_consistent(
fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1);
- if ((fold & 0xFFFFFFFF) != mach_read_from_4(buf
+ if ((fold & 0xFFFFFFFFUL) != mach_read_from_4(buf
+ LOG_CHECKPOINT_CHECKSUM_1)) {
return(FALSE);
}
@@ -430,7 +446,7 @@ recv_check_cp_is_consistent(
fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN,
LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN);
- if ((fold & 0xFFFFFFFF) != mach_read_from_4(buf
+ if ((fold & 0xFFFFFFFFUL) != mach_read_from_4(buf
+ LOG_CHECKPOINT_CHECKSUM_2)) {
return(FALSE);
}
@@ -474,8 +490,9 @@ recv_find_max_checkpoint(
if (log_debug_writes) {
fprintf(stderr,
"InnoDB: Checkpoint in group %lu at %lu invalid, %lu\n",
- group->id, field,
- mach_read_from_4(buf
+ (ulong) group->id,
+ (ulong) field,
+ (ulong) mach_read_from_4(buf
+ LOG_CHECKPOINT_CHECKSUM_1));
}
@@ -495,7 +512,8 @@ recv_find_max_checkpoint(
if (log_debug_writes) {
fprintf(stderr,
"InnoDB: Checkpoint number %lu found in group %lu\n",
- ut_dulint_get_low(checkpoint_no), group->id);
+ (ulong) ut_dulint_get_low(checkpoint_no),
+ (ulong) group->id);
}
if (ut_dulint_cmp(checkpoint_no, max_no) >= 0) {
@@ -537,8 +555,8 @@ recv_read_cp_info_for_backup(
byte* hdr, /* in: buffer containing the log group header */
dulint* lsn, /* out: checkpoint lsn */
ulint* offset, /* out: checkpoint offset in the log group */
- ulint* fsp_limit,/* out: fsp limit, 1000000000 if the database
- is running with < version 3.23.50 of InnoDB */
+ ulint* fsp_limit,/* out: fsp limit of space 0, 1000000000 if the
+ database is running with < version 3.23.50 of InnoDB */
dulint* cp_no, /* out: checkpoint number */
dulint* first_header_lsn)
/* out: lsn of the start of the first log file */
@@ -683,7 +701,7 @@ recv_scan_log_seg_for_backup(
< *scanned_checkpoint_no
&& *scanned_checkpoint_no
- log_block_get_checkpoint_no(log_block)
- > 0x80000000) {
+ > 0x80000000UL) {
/* Garbage from a log buffer flush which was made
before the most recent database recovery */
@@ -715,7 +733,7 @@ recv_scan_log_seg_for_backup(
/***********************************************************************
Tries to parse a single log record body and also applies it to a page if
-specified. */
+specified. File ops are parsed, but not applied in this function. */
static
byte*
recv_parse_or_apply_log_rec_body(
@@ -792,8 +810,14 @@ recv_parse_or_apply_log_rec_body(
} else if (type == MLOG_INIT_FILE_PAGE) {
new_ptr = fsp_parse_init_file_page(ptr, end_ptr, page);
- } else if (type <= MLOG_WRITE_STRING) {
+ } else if (type == MLOG_WRITE_STRING) {
new_ptr = mlog_parse_string(ptr, end_ptr, page);
+
+ } else if (type == MLOG_FILE_CREATE
+ || type == MLOG_FILE_RENAME
+ || type == MLOG_FILE_DELETE) {
+ new_ptr = fil_op_log_parse_or_replay(ptr, end_ptr, type, FALSE,
+ ULINT_UNDEFINED);
} else {
new_ptr = NULL;
@@ -880,9 +904,14 @@ recv_add_to_hash_table(
recv_data_t* recv_data;
recv_data_t** prev_field;
recv_addr_t* recv_addr;
-
- ut_a(space == 0); /* For debugging; TODO: remove this */
+ if (fil_tablespace_deleted_or_being_deleted_in_mem(space, -1)) {
+ /* The tablespace does not exist any more: do not store the
+ log record */
+
+ return;
+ }
+
len = rec_end - body;
recv = mem_heap_alloc(recv_sys->heap, sizeof(recv_t));
@@ -905,6 +934,9 @@ recv_add_to_hash_table(
HASH_INSERT(recv_addr_t, addr_hash, recv_sys->addr_hash,
recv_fold(space, page_no), recv_addr);
recv_sys->n_addrs++;
+
+ /* printf("Inserting log rec for space %lu, page %lu\n",
+ space, page_no); */
}
UT_LIST_ADD_LAST(rec_list, recv_addr->rec_list, recv);
@@ -1021,6 +1053,8 @@ recv_recover_page(
return;
}
+ /* printf("Recovering space %lu, page %lu\n", space, page_no); */
+
recv_addr->state = RECV_BEING_PROCESSED;
mutex_exit(&(recv_sys->mutex));
@@ -1116,8 +1150,9 @@ recv_recover_page(
if (log_debug_writes) {
fprintf(stderr,
"InnoDB: Applying log rec type %lu len %lu to space %lu page no %lu\n",
- (ulint)recv->type, recv->len, recv_addr->space,
- recv_addr->page_no);
+ (ulong) recv->type, (ulong) recv->len,
+ (ulong) recv_addr->space,
+ (ulong) recv_addr->page_no);
}
recv_parse_or_apply_log_rec_body(recv->type, buf,
@@ -1308,8 +1343,7 @@ loop:
/ hash_get_n_cells(recv_sys->addr_hash)) {
fprintf(stderr, "%lu ",
- (i * 100) / hash_get_n_cells(recv_sys->addr_hash));
-
+ (ulong) ((i * 100) / hash_get_n_cells(recv_sys->addr_hash)));
}
}
@@ -1363,130 +1397,132 @@ loop:
}
#ifdef UNIV_HOTBACKUP
+/* This page is allocated from the buffer pool and used in the function
+below */
+page_t* recv_backup_application_page = NULL;
+
/***********************************************************************
Applies log records in the hash table to a backup. */
void
-recv_apply_log_recs_for_backup(
-/*===========================*/
- ulint n_data_files, /* in: number of data files */
- char** data_files, /* in: array containing the paths to the
- data files */
- ulint* file_sizes) /* in: sizes of the data files in database
- pages */
+recv_apply_log_recs_for_backup(void)
+/*================================*/
{
recv_addr_t* recv_addr;
- os_file_t data_file;
- ulint n_pages_total = 0;
- ulint nth_file = 0;
- ulint nth_page_in_file= 0;
+ ulint n_hash_cells;
byte* page;
+ ulint actual_size;
ibool success;
+ ulint error;
ulint i;
recv_sys->apply_log_recs = TRUE;
recv_sys->apply_batch_on = TRUE;
- page = buf_pool->frame_zero;
-
- for (i = 0; i < n_data_files; i++) {
- n_pages_total += file_sizes[i];
+ if (recv_backup_application_page == NULL) {
+ recv_backup_application_page = buf_frame_alloc();
}
- if (recv_max_parsed_page_no >= n_pages_total) {
- printf(
-"InnoDB: Error: tablespace size %lu pages, but a log record on page %lu!\n"
-"InnoDB: Are you sure you have specified all the ibdata files right in\n"
-"InnoDB: the my.cnf file you gave as the argument to ibbackup --restore?\n",
- n_pages_total, recv_max_parsed_page_no);
- }
+ page = recv_backup_application_page;
printf(
"InnoDB: Starting an apply batch of log records to the database...\n"
"InnoDB: Progress in percents: ");
- for (i = 0; i < n_pages_total; i++) {
+ n_hash_cells = hash_get_n_cells(recv_sys->addr_hash);
- if (i == 0 || nth_page_in_file == file_sizes[nth_file]) {
- if (i != 0) {
- nth_file++;
- nth_page_in_file = 0;
- os_file_flush(data_file);
- os_file_close(data_file);
- }
+ for (i = 0; i < n_hash_cells; i++) {
+ /* The address hash table is externally chained */
+ recv_addr = hash_get_nth_cell(recv_sys->addr_hash, i)->node;
- data_file = os_file_create_simple(data_files[nth_file],
- OS_FILE_OPEN,
- OS_FILE_READ_WRITE,
- &success);
- if (!success) {
+ while (recv_addr != NULL) {
+
+ if (!fil_tablespace_exists_in_mem(recv_addr->space)) {
+/*
printf(
-"InnoDB: Error: cannot open %lu'th data file\n", nth_file);
+"InnoDB: Warning: cannot apply log record to tablespace %lu page %lu,\n"
+"InnoDB: because tablespace with that id does not exist.\n",
+ recv_addr->space, recv_addr->page_no);
+*/
+ recv_addr->state = RECV_PROCESSED;
- exit(1);
+ ut_a(recv_sys->n_addrs);
+ recv_sys->n_addrs--;
+
+ goto skip_this_recv_addr;
}
- }
-
- recv_addr = recv_get_fil_addr_struct(0, i);
-
- if (recv_addr != NULL) {
- success = os_file_read(data_file, page,
- (nth_page_in_file << UNIV_PAGE_SIZE_SHIFT)
- & 0xFFFFFFFF,
- nth_page_in_file >> (32 - UNIV_PAGE_SIZE_SHIFT),
- UNIV_PAGE_SIZE);
+
+ /* We simulate a page read made by the buffer pool, to
+ make sure the recovery apparatus works ok, for
+ example, the buf_frame_align() function. We must init
+ the block corresponding to buf_pool->frame_zero
+ (== page). */
+
+ buf_page_init_for_backup_restore(recv_addr->space,
+ recv_addr->page_no,
+ buf_block_align(page));
+
+ /* Extend the tablespace's last file if the page_no
+ does not fall inside its bounds; we assume the last
+ file is auto-extending, and ibbackup copied the file
+ when it still was smaller */
+
+ success = fil_extend_space_to_desired_size(
+ &actual_size,
+ recv_addr->space,
+ recv_addr->page_no + 1);
if (!success) {
printf(
-"InnoDB: Error: cannot read page no %lu from %lu'th data file\n",
- nth_page_in_file, nth_file);
+"InnoDB: Fatal error: cannot extend tablespace %lu to hold %lu pages\n",
+ recv_addr->space, recv_addr->page_no);
+
+ exit(1);
+ }
+ /* Read the page from the tablespace file using the
+ fil0fil.c routines */
+
+ error = fil_io(OS_FILE_READ, TRUE, recv_addr->space,
+ recv_addr->page_no, 0, UNIV_PAGE_SIZE,
+ page, NULL);
+ if (error != DB_SUCCESS) {
+ printf(
+"InnoDB: Fatal error: cannot read from tablespace %lu page number %lu\n",
+ recv_addr->space, recv_addr->page_no);
+
exit(1);
}
-
- /* We simulate a page read made by the buffer pool,
- to make sure recovery works ok. We must init the
- block corresponding to buf_pool->frame_zero
- (== page) */
- buf_page_init_for_backup_restore(0, i,
- buf_block_align(page));
+ /* Apply the log records to this page */
+ recv_recover_page(TRUE, FALSE, page, recv_addr->space,
+ recv_addr->page_no);
- recv_recover_page(TRUE, FALSE, page, 0, i);
+ /* Write the page back to the tablespace file using the
+ fil0fil.c routines */
buf_flush_init_for_writing(page,
mach_read_from_8(page + FIL_PAGE_LSN),
- 0, i);
-
- success = os_file_write(data_files[nth_file],
- data_file, page,
- (nth_page_in_file << UNIV_PAGE_SIZE_SHIFT)
- & 0xFFFFFFFF,
- nth_page_in_file >> (32 - UNIV_PAGE_SIZE_SHIFT),
- UNIV_PAGE_SIZE);
- if (!success) {
- printf(
-"InnoDB: Error: cannot write page no %lu to %lu'th data file\n",
- nth_page_in_file, nth_file);
+ recv_addr->space, recv_addr->page_no);
- exit(1);
- }
+ error = fil_io(OS_FILE_WRITE, TRUE, recv_addr->space,
+ recv_addr->page_no, 0, UNIV_PAGE_SIZE,
+ page, NULL);
+skip_this_recv_addr:
+ recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
}
- if ((100 * i) / n_pages_total
- != (100 * (i + 1)) / n_pages_total) {
- printf("%lu ", (100 * i) / n_pages_total);
+ if ((100 * i) / n_hash_cells
+ != (100 * (i + 1)) / n_hash_cells) {
+ printf("%lu ", (100 * i) / n_hash_cells);
fflush(stdout);
}
-
- nth_page_in_file++;
}
-
- os_file_flush(data_file);
- os_file_close(data_file);
recv_sys_empty_hash();
}
+#endif
+#ifdef notdefined
/***********************************************************************
In the debug version, updates the replica of a file page, based on a log
record. */
@@ -1720,7 +1756,7 @@ recv_parse_log_rec(
if (*ptr == MLOG_DUMMY_RECORD) {
*type = *ptr;
- *space = 1000; /* For debugging */
+ *space = ULINT_UNDEFINED - 1; /* For debugging */
return(1);
}
@@ -1732,9 +1768,9 @@ recv_parse_log_rec(
return(0);
}
- /* Check that space id and page_no are sensible */
+ /* Check that page_no is sensible */
- if (*space != 0 || *page_no > 0x8FFFFFFF) {
+ if (*page_no > 0x8FFFFFFFUL) {
recv_sys->found_corrupt_log = TRUE;
@@ -1823,19 +1859,19 @@ recv_report_corrupt_log(
"InnoDB: ############### CORRUPT LOG RECORD FOUND\n"
"InnoDB: Log record type %lu, space id %lu, page number %lu\n"
"InnoDB: Log parsing proceeded successfully up to %lu %lu\n",
- (ulint)type, space, page_no,
- ut_dulint_get_high(recv_sys->recovered_lsn),
- ut_dulint_get_low(recv_sys->recovered_lsn));
+ (ulong) type, (ulong) space, (ulong) page_no,
+ (ulong) ut_dulint_get_high(recv_sys->recovered_lsn),
+ (ulong) ut_dulint_get_low(recv_sys->recovered_lsn));
err_buf = ut_malloc(1000000);
fprintf(stderr,
"InnoDB: Previous log record type %lu, is multi %lu\n"
"InnoDB: Recv offset %lu, prev %lu\n",
- recv_previous_parsed_rec_type,
- recv_previous_parsed_rec_is_multi,
- (ulint)(ptr - recv_sys->buf),
- recv_previous_parsed_rec_offset);
+ (ulong) recv_previous_parsed_rec_type,
+ (ulong) recv_previous_parsed_rec_is_multi,
+ (ulong) (ptr - recv_sys->buf),
+ (ulong) recv_previous_parsed_rec_offset);
if ((ulint)(ptr - recv_sys->buf + 100)
> recv_previous_parsed_rec_offset
@@ -1910,12 +1946,16 @@ loop:
single_rec = (ulint)*ptr & MLOG_SINGLE_REC_FLAG;
if (single_rec || *ptr == MLOG_DUMMY_RECORD) {
- /* The mtr only modified a single page */
+ /* The mtr only modified a single page, or this is a file op */
old_lsn = recv_sys->recovered_lsn;
+ /* Try to parse a log record, fetching its type, space id,
+ page no, and a pointer to the body of the log record */
+
len = recv_parse_log_rec(ptr, end_ptr, &type, &space,
&page_no, &body);
+
if (len == 0 || recv_sys->found_corrupt_log) {
if (recv_sys->found_corrupt_log) {
@@ -1947,12 +1987,36 @@ loop:
if (log_debug_writes) {
fprintf(stderr,
"InnoDB: Parsed a single log rec type %lu len %lu space %lu page no %lu\n",
- (ulint)type, len, space, page_no);
+ (ulong) type, (ulong) len, (ulong) space,
+ (ulong) page_no);
}
if (type == MLOG_DUMMY_RECORD) {
/* Do nothing */
+ } else if (store_to_hash && (type == MLOG_FILE_CREATE
+ || type == MLOG_FILE_RENAME
+ || type == MLOG_FILE_DELETE)) {
+#ifdef UNIV_HOTBACKUP
+ if (recv_replay_file_ops) {
+
+ /* In ibbackup --apply-log, replay an .ibd file
+ operation, if possible; note that
+ fil_path_to_mysql_datadir is set in ibbackup to
+ point to the datadir we should use there */
+
+ if (NULL == fil_op_log_parse_or_replay(body,
+ end_ptr, type, TRUE, space)) {
+ fprintf(stderr,
+"InnoDB: Error: file op log record of type %lu space %lu not complete in\n"
+"InnoDB: the replay phase. Path %s\n", (ulint)type, space, (char*)(body + 2));
+
+ ut_a(0);
+ }
+ }
+#endif
+ /* In normal mysqld crash recovery we do not try to
+ replay file operations */
} else if (store_to_hash) {
recv_add_to_hash_table(type, space, page_no, body,
ptr + len, old_lsn,
@@ -2010,7 +2074,8 @@ loop:
if (log_debug_writes) {
fprintf(stderr,
"InnoDB: Parsed a multi log rec type %lu len %lu space %lu page no %lu\n",
- (ulint)type, len, space, page_no);
+ (ulong) type, (ulong) len, (ulong) space,
+ (ulong) page_no);
}
total_len += len;
@@ -2240,10 +2305,11 @@ recv_scan_log_recs(
fprintf(stderr,
"InnoDB: Log block no %lu at lsn %lu %lu has\n"
"InnoDB: ok header, but checksum field contains %lu, should be %lu\n",
- no, ut_dulint_get_high(scanned_lsn),
- ut_dulint_get_low(scanned_lsn),
- log_block_get_checksum(log_block),
- log_block_calc_checksum(log_block));
+ (ulong) no,
+ (ulong) ut_dulint_get_high(scanned_lsn),
+ (ulong) ut_dulint_get_low(scanned_lsn),
+ (ulong) log_block_get_checksum(log_block),
+ (ulong) log_block_calc_checksum(log_block));
}
/* Garbage or an incompletely written log block */
@@ -2276,7 +2342,7 @@ recv_scan_log_recs(
< recv_sys->scanned_checkpoint_no)
&& (recv_sys->scanned_checkpoint_no
- log_block_get_checkpoint_no(log_block)
- > 0x80000000)) {
+ > 0x80000000UL)) {
/* Garbage from a log buffer flush which was made
before the most recent database recovery */
@@ -2309,7 +2375,8 @@ recv_scan_log_recs(
if (ut_dulint_cmp(scanned_lsn, recv_sys->scanned_lsn) > 0) {
/* We were able to find more log data: add it to the
- parsing buffer if parse_start_lsn is already non-zero */
+ parsing buffer if parse_start_lsn is already
+ non-zero */
if (recv_sys->len + 4 * OS_FILE_LOG_BLOCK_SIZE
>= RECV_PARSING_BUF_SIZE) {
@@ -2347,8 +2414,8 @@ recv_scan_log_recs(
fprintf(stderr,
"InnoDB: Doing recovery: scanned up to log sequence number %lu %lu\n",
- ut_dulint_get_high(*group_scanned_lsn),
- ut_dulint_get_low(*group_scanned_lsn));
+ (ulong) ut_dulint_get_high(*group_scanned_lsn),
+ (ulong) ut_dulint_get_low(*group_scanned_lsn));
}
}
@@ -2407,8 +2474,8 @@ recv_group_scan_log_recs(
group, start_lsn, end_lsn);
finished = recv_scan_log_recs(TRUE,
- buf_pool_get_curr_size()
- - RECV_POOL_N_FREE_BLOCKS * UNIV_PAGE_SIZE,
+ (buf_pool->n_frames
+ - recv_n_pool_free_frames) * UNIV_PAGE_SIZE,
TRUE, log_sys->buf,
RECV_SCAN_SIZE, start_lsn,
contiguous_lsn, group_scanned_lsn);
@@ -2418,9 +2485,9 @@ recv_group_scan_log_recs(
if (log_debug_writes) {
fprintf(stderr,
"InnoDB: Scanned group %lu up to log sequence number %lu %lu\n",
- group->id,
- ut_dulint_get_high(*group_scanned_lsn),
- ut_dulint_get_low(*group_scanned_lsn));
+ (ulong) group->id,
+ (ulong) ut_dulint_get_high(*group_scanned_lsn),
+ (ulong) ut_dulint_get_low(*group_scanned_lsn));
}
}
@@ -2458,7 +2525,6 @@ recv_recovery_from_checkpoint_start(
|| (ut_dulint_cmp(limit_lsn, ut_dulint_max) == 0));
if (type == LOG_CHECKPOINT) {
-
recv_sys_create();
recv_sys_init(FALSE, buf_pool_get_curr_size());
}
@@ -2472,8 +2538,6 @@ recv_recovery_from_checkpoint_start(
return(DB_SUCCESS);
}
- sync_order_checks_on = TRUE;
-
recv_recovery_on = TRUE;
recv_sys->limit_lsn = limit_lsn;
@@ -2546,25 +2610,72 @@ recv_recovery_from_checkpoint_start(
recv_sys->scanned_checkpoint_no = 0;
recv_sys->recovered_lsn = checkpoint_lsn;
- /* NOTE: we always do recovery at startup, but only if
+ srv_start_lsn = checkpoint_lsn;
+
+ /* NOTE: we always do a 'recovery' at startup, but only if
there is something wrong we will print a message to the
user about recovery: */
if (ut_dulint_cmp(checkpoint_lsn, max_flushed_lsn) != 0
|| ut_dulint_cmp(checkpoint_lsn, min_flushed_lsn) != 0) {
+ if (ut_dulint_cmp(checkpoint_lsn, max_flushed_lsn)
+ < 0) {
+ fprintf(stderr,
+"InnoDB: ##########################################################\n"
+"InnoDB: WARNING!\n"
+"InnoDB: The log sequence number in ibdata files is higher\n"
+"InnoDB: than the log sequence number in the ib_logfiles! Are you sure\n"
+"InnoDB: you are using the right ib_logfiles to start up the database?\n"
+"InnoDB: Log sequence number in ib_logfiles is %lu %lu, log\n"
+"InnoDB: sequence numbers stamped to ibdata file headers are between\n"
+"InnoDB: %lu %lu and %lu %lu.\n"
+"InnoDB: ##########################################################\n",
+ (ulong) ut_dulint_get_high(checkpoint_lsn),
+ (ulong) ut_dulint_get_low(checkpoint_lsn),
+ (ulong) ut_dulint_get_high(min_flushed_lsn),
+ (ulong) ut_dulint_get_low(min_flushed_lsn),
+ (ulong) ut_dulint_get_high(max_flushed_lsn),
+ (ulong) ut_dulint_get_low(max_flushed_lsn));
+ }
+
recv_needed_recovery = TRUE;
ut_print_timestamp(stderr);
fprintf(stderr,
- " InnoDB: Database was not shut down normally.\n"
- "InnoDB: Starting recovery from log files...\n");
+" InnoDB: Database was not shut down normally!\n"
+"InnoDB: Starting crash recovery.\n");
+
+ fprintf(stderr,
+"InnoDB: Reading tablespace information from the .ibd files...\n");
+
+ fil_load_single_table_tablespaces();
+
+ /* If we are using the doublewrite method, we will
+ check if there are half-written pages in data files,
+ and restore them from the doublewrite buffer if
+ possible */
+
+ if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) {
+
+ fprintf(stderr,
+"InnoDB: Restoring possible half-written data pages from the doublewrite\n"
+"InnoDB: buffer...\n");
+ trx_sys_doublewrite_init_or_restore_pages(
+ TRUE);
+ }
+
+ ut_print_timestamp(stderr);
+
fprintf(stderr,
- "InnoDB: Starting log scan based on checkpoint at\n"
- "InnoDB: log sequence number %lu %lu\n",
- ut_dulint_get_high(checkpoint_lsn),
- ut_dulint_get_low(checkpoint_lsn));
+" InnoDB: Starting log scan based on checkpoint at\n"
+"InnoDB: log sequence number %lu %lu.\n",
+ (ulong) ut_dulint_get_high(checkpoint_lsn),
+ (ulong) ut_dulint_get_low(checkpoint_lsn));
+ } else {
+ /* Init the doublewrite buffer memory structure */
+ trx_sys_doublewrite_init_or_restore_pages(FALSE);
}
}
@@ -2645,10 +2756,10 @@ recv_recovery_from_checkpoint_start(
" InnoDB: ERROR: We were only able to scan the log up to\n"
"InnoDB: %lu %lu, but a checkpoint was at %lu %lu.\n"
"InnoDB: It is possible that the database is now corrupt!\n",
- ut_dulint_get_high(group_scanned_lsn),
- ut_dulint_get_low(group_scanned_lsn),
- ut_dulint_get_high(checkpoint_lsn),
- ut_dulint_get_low(checkpoint_lsn));
+ (ulong) ut_dulint_get_high(group_scanned_lsn),
+ (ulong) ut_dulint_get_low(group_scanned_lsn),
+ (ulong) ut_dulint_get_high(checkpoint_lsn),
+ (ulong) ut_dulint_get_low(checkpoint_lsn));
}
if (ut_dulint_cmp(group_scanned_lsn, recv_max_page_lsn) < 0) {
@@ -2657,10 +2768,10 @@ recv_recovery_from_checkpoint_start(
" InnoDB: ERROR: We were only able to scan the log up to %lu %lu\n"
"InnoDB: but a database page a had an lsn %lu %lu. It is possible that the\n"
"InnoDB: database is now corrupt!\n",
- ut_dulint_get_high(group_scanned_lsn),
- ut_dulint_get_low(group_scanned_lsn),
- ut_dulint_get_high(recv_max_page_lsn),
- ut_dulint_get_low(recv_max_page_lsn));
+ (ulong) ut_dulint_get_high(group_scanned_lsn),
+ (ulong) ut_dulint_get_low(group_scanned_lsn),
+ (ulong) ut_dulint_get_high(recv_max_page_lsn),
+ (ulong) ut_dulint_get_low(recv_max_page_lsn));
}
if (ut_dulint_cmp(recv_sys->recovered_lsn, checkpoint_lsn) < 0) {
@@ -2686,6 +2797,21 @@ recv_recovery_from_checkpoint_start(
log_sys->archived_lsn = archived_lsn;
recv_synchronize_groups(up_to_date_group);
+
+ if (!recv_needed_recovery) {
+ if (ut_dulint_cmp(checkpoint_lsn, recv_sys->recovered_lsn)
+ != 0) {
+ fprintf(stderr,
+"InnoDB: Warning: we did not need to do crash recovery, but log scan\n"
+"InnoDB: progressed past the checkpoint lsn %lu %lu up to lsn %lu %lu\n",
+ (ulong) ut_dulint_get_high(checkpoint_lsn),
+ (ulong) ut_dulint_get_low(checkpoint_lsn),
+ (ulong) ut_dulint_get_high(recv_sys->recovered_lsn),
+ (ulong) ut_dulint_get_low(recv_sys->recovered_lsn));
+ }
+ } else {
+ srv_start_lsn = recv_sys->recovered_lsn;
+ }
log_sys->lsn = recv_sys->recovered_lsn;
@@ -2714,8 +2840,6 @@ recv_recovery_from_checkpoint_start(
mutex_exit(&(log_sys->mutex));
- sync_order_checks_on = FALSE;
-
recv_lsn_checks_on = TRUE;
/* The database is now ready to start almost normal processing of user
@@ -2857,9 +2981,11 @@ recv_reset_log_files_for_backup(
buf = ut_malloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+ /* Zero-fill the whole buffer; memset() takes (dest, fill byte, length) */
+ memset(buf, '\0', LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+
for (i = 0; i < n_log_files; i++) {
- sprintf(name, "%sib_logfile%lu", log_dir, i);
+ sprintf(name, "%sib_logfile%lu", log_dir, (ulong) i);
log_file = os_file_create_simple(name, OS_FILE_CREATE,
OS_FILE_READ_WRITE, &success);
@@ -2871,17 +2997,18 @@ recv_reset_log_files_for_backup(
}
printf(
-"Setting log file size to %lu %lu\n", ut_get_high32(log_file_size),
- log_file_size & 0xFFFFFFFF);
+"Setting log file size to %lu %lu\n", (ulong) ut_get_high32(log_file_size),
+ (ulong) (log_file_size & 0xFFFFFFFFUL));
success = os_file_set_size(name, log_file,
- log_file_size & 0xFFFFFFFF,
+ log_file_size & 0xFFFFFFFFUL,
ut_get_high32(log_file_size));
if (!success) {
printf(
-"InnoDB: Cannot set %s size to %lu %lu\n", name, ut_get_high32(log_file_size),
- log_file_size & 0xFFFFFFFF);
+"InnoDB: Cannot set %s size to %lu %lu\n", name,
+ (ulong) ut_get_high32(log_file_size),
+ (ulong) (log_file_size & 0xFFFFFFFFUL));
exit(1);
}
@@ -2896,7 +3023,7 @@ recv_reset_log_files_for_backup(
log_block_init_in_old_format(buf + LOG_FILE_HDR_SIZE, lsn);
log_block_set_first_rec_group(buf + LOG_FILE_HDR_SIZE,
LOG_BLOCK_HDR_SIZE);
- sprintf(name, "%sib_logfile%lu", log_dir, 0);
+ sprintf(name, "%sib_logfile%lu", log_dir, (ulong) 0);
log_file = os_file_create_simple(name, OS_FILE_OPEN,
OS_FILE_READ_WRITE, &success);
@@ -2938,6 +3065,8 @@ log_group_recover_from_archive_file(
int input_char;
char name[10000];
+ ut_a(0);
+
try_open_again:
buf = log_sys->buf;
@@ -2945,13 +3074,10 @@ try_open_again:
log_archived_file_name_gen(name, group->id, group->archived_file_no);
- fil_reserve_right_to_open();
-
file_handle = os_file_create(name, OS_FILE_OPEN,
OS_FILE_LOG, OS_FILE_AIO, &ret);
if (ret == FALSE) {
- fil_release_right_to_open();
ask_again:
fprintf(stderr,
"InnoDB: Do you want to copy additional archived log files\n"
@@ -2992,12 +3118,10 @@ ask_again:
ut_a(ret);
- fil_release_right_to_open();
-
/* Add the archive file as a node to the space */
fil_node_create(name, 1 + file_size / UNIV_PAGE_SIZE,
- group->archive_space_id);
+ group->archive_space_id, FALSE);
ut_a(RECV_SCAN_SIZE >= LOG_FILE_HDR_SIZE);
/* Read the archive file header */
@@ -3063,9 +3187,9 @@ ask_again:
if (log_debug_writes) {
fprintf(stderr,
"InnoDB: Archive read starting at lsn %lu %lu, len %lu from file %s\n",
- ut_dulint_get_high(start_lsn),
- ut_dulint_get_low(start_lsn),
- len, name);
+ (ulong) ut_dulint_get_high(start_lsn),
+ (ulong) ut_dulint_get_low(start_lsn),
+ (ulong) len, name);
}
fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE,
@@ -3073,8 +3197,8 @@ ask_again:
read_offset % UNIV_PAGE_SIZE, len, buf, NULL);
ret = recv_scan_log_recs(TRUE,
- buf_pool_get_curr_size() -
- RECV_POOL_N_FREE_BLOCKS * UNIV_PAGE_SIZE,
+ (buf_pool->n_frames -
+ recv_n_pool_free_frames) * UNIV_PAGE_SIZE,
TRUE, buf, len, start_lsn,
&dummy_lsn, &scanned_lsn);
@@ -3120,10 +3244,10 @@ recv_recovery_from_archive_start(
ibool ret;
ulint err;
+ ut_a(0);
+
recv_sys_create();
recv_sys_init(FALSE, buf_pool_get_curr_size());
-
- sync_order_checks_on = TRUE;
recv_recovery_on = TRUE;
recv_recovery_from_backup_on = TRUE;
@@ -3146,7 +3270,7 @@ recv_recovery_from_archive_start(
if (!group) {
fprintf(stderr,
"InnoDB: There is no log group defined with id %lu!\n",
- group_id);
+ (ulong) group_id);
return(DB_ERROR);
}
@@ -3210,8 +3334,6 @@ recv_recovery_from_archive_start(
mutex_exit(&(log_sys->mutex));
- sync_order_checks_on = FALSE;
-
return(DB_SUCCESS);
}
@@ -3222,6 +3344,8 @@ void
recv_recovery_from_archive_finish(void)
/*===================================*/
{
+ ut_a(0);
+
recv_recovery_from_checkpoint_finish();
recv_recovery_from_backup_on = FALSE;
diff --git a/innobase/mach/mach0data.c b/innobase/mach/mach0data.c
index 336ce106a75..ff7265b34f4 100644
--- a/innobase/mach/mach0data.c
+++ b/innobase/mach/mach0data.c
@@ -36,37 +36,37 @@ mach_parse_compressed(
flag = mach_read_from_1(ptr);
- if (flag < 0x80) {
+ if (flag < 0x80UL) {
*val = flag;
return(ptr + 1);
- } else if (flag < 0xC0) {
+ } else if (flag < 0xC0UL) {
if (end_ptr < ptr + 2) {
return(NULL);
}
- *val = mach_read_from_2(ptr) & 0x7FFF;
+ *val = mach_read_from_2(ptr) & 0x7FFFUL;
return(ptr + 2);
- } else if (flag < 0xE0) {
+ } else if (flag < 0xE0UL) {
if (end_ptr < ptr + 3) {
return(NULL);
}
- *val = mach_read_from_3(ptr) & 0x3FFFFF;
+ *val = mach_read_from_3(ptr) & 0x3FFFFFUL;
return(ptr + 3);
- } else if (flag < 0xF0) {
+ } else if (flag < 0xF0UL) {
if (end_ptr < ptr + 4) {
return(NULL);
}
- *val = mach_read_from_4(ptr) & 0x1FFFFFFF;
+ *val = mach_read_from_4(ptr) & 0x1FFFFFFFUL;
return(ptr + 4);
} else {
- ut_ad(flag == 0xF0);
+ ut_ad(flag == 0xF0UL);
if (end_ptr < ptr + 5) {
return(NULL);
diff --git a/innobase/mem/mem0dbg.c b/innobase/mem/mem0dbg.c
index 07f348ab82f..1007f8413b4 100644
--- a/innobase/mem/mem0dbg.c
+++ b/innobase/mem/mem0dbg.c
@@ -338,7 +338,7 @@ mem_hash_remove(
if (node == NULL) {
printf(
"Memory heap or buffer freed in %s line %lu did not exist.\n",
- file_name, line);
+ file_name, (ulong) line);
ut_error;
}
@@ -351,21 +351,23 @@ mem_hash_remove(
mem_heap_validate_or_print(node->heap, NULL, FALSE, &error, &size,
NULL, NULL);
if (error) {
- printf("Inconsistency in memory heap or buffer n:o %lu created\n",
- node->nth_heap);
- printf("in %s line %lu and tried to free in %s line %lu.\n",
- node->file_name, node->line, file_name, line);
+ printf(
+"Inconsistency in memory heap or buffer n:o %lu created\n",
+ (ulong) node->nth_heap);
+ printf("in %s line %lu and tried to free in %s line %lu.\n",
+ node->file_name, (ulong) node->line,
+ file_name, (ulong) line);
- printf(
- "Hex dump of 400 bytes around memory heap first block start:\n");
+ printf(
+"Hex dump of 400 bytes around memory heap first block start:\n");
- ut_print_buf((byte*)(node->heap) - 200, 400);
+ ut_print_buf((byte*)(node->heap) - 200, 400);
- printf("\nDump of the mem heap:\n");
+ printf("\nDump of the mem heap:\n");
- mem_heap_validate_or_print(node->heap, NULL, TRUE, &error, &size,
- NULL, NULL);
- ut_error;
+ mem_heap_validate_or_print(node->heap, NULL, TRUE, &error,
+ &size, NULL, NULL);
+ ut_error;
}
/* Free the memory occupied by the node struct */
@@ -447,6 +449,9 @@ mem_heap_validate_or_print(
if ((block->type == MEM_HEAP_BUFFER)
&& (mem_block_get_len(block) > UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+"InnoDB: Error: mem block %lx length %lu > UNIV_PAGE_SIZE\n", (ulong) block,
+ (ulong) mem_block_get_len(block));
/* error */
return;
@@ -486,6 +491,12 @@ mem_heap_validate_or_print(
mem_field_trailer_get_check(user_field)) {
/* error */
+ /* Cast every argument to ulong so that it matches %lx / %lu */
+ fprintf(stderr,
+"InnoDB: Error: block %lx mem field %lx len %lu\n"
+"InnoDB: header check field is %lx but trailer %lx\n", (ulong) block,
+ (ulong) field, (ulong) len, (ulong) check_field,
+ (ulong) mem_field_trailer_get_check(user_field));
+
return;
}
@@ -505,6 +516,11 @@ mem_heap_validate_or_print(
if (field != (byte*)block + mem_block_get_free(block)) {
/* error */
+ /* Cast every argument to ulong so that it matches %lx */
+ fprintf(stderr,
+"InnoDB: Error: block %lx end of mem fields %lx\n"
+"InnoDB: but block free at %lx\n", (ulong) block, (ulong) field,
+ (ulong) ((byte*)block + mem_block_get_free(block)));
+
return;
}
@@ -547,7 +563,8 @@ mem_heap_print(
&us_size, &phys_size, &n_blocks);
printf(
"\nheap type: %lu; size: user size %lu; physical size %lu; blocks %lu.\n",
- heap->type, us_size, phys_size, n_blocks);
+ (ulong) heap->type, (ulong) us_size,
+ (ulong) phys_size, (ulong) n_blocks);
ut_a(!error);
}
@@ -583,6 +600,10 @@ mem_heap_validate(
mem_heap_validate_or_print(heap, NULL, FALSE, &error, &us_size,
&phys_size, &n_blocks);
+ if (error) {
+ mem_heap_print(heap);
+ }
+
ut_a(!error);
return(TRUE);
@@ -738,8 +759,8 @@ mem_analyze_corruption(
if (*((ulint*)p) == MEM_BLOCK_MAGIC_N) {
fprintf(stderr,
"Mem block at - %lu, file %s, line %lu\n",
- dist, p + sizeof(ulint),
- *(ulint*)(p + 8 + sizeof(ulint)));
+ (ulong) dist, (p + sizeof(ulint)),
+ (ulong) (*(ulint*)(p + 8 + sizeof(ulint))));
break;
}
@@ -747,8 +768,8 @@ mem_analyze_corruption(
if (*((ulint*)p) == MEM_FREED_BLOCK_MAGIC_N) {
fprintf(stderr,
"Freed mem block at - %lu, file %s, line %lu\n",
- dist, p + sizeof(ulint),
- *(ulint*)(p + 8 + sizeof(ulint)));
+ (ulong) dist, (p + sizeof(ulint)),
+ (ulong) (*(ulint*)(p + 8 + sizeof(ulint))));
break;
}
@@ -775,8 +796,8 @@ mem_analyze_corruption(
if (*((ulint*)p) == MEM_BLOCK_MAGIC_N) {
fprintf(stderr,
"Mem block at + %lu, file %s, line %lu\n",
- dist, p + sizeof(ulint),
- *(ulint*)(p + 8 + sizeof(ulint)));
+ (ulong) dist, (p + sizeof(ulint)),
+ (ulong) (*(ulint*)(p + 8 + sizeof(ulint))));
break;
}
@@ -784,8 +805,8 @@ mem_analyze_corruption(
if (*((ulint*)p) == MEM_FREED_BLOCK_MAGIC_N) {
fprintf(stderr,
"Freed mem block at + %lu, file %s, line %lu\n",
- dist, p + sizeof(ulint),
- *(ulint*)(p + 8 + sizeof(ulint)));
+ (ulong) dist, (p + sizeof(ulint)),
+ (ulong) (*(ulint*)(p + 8 + sizeof(ulint))));
break;
}
diff --git a/innobase/mem/mem0pool.c b/innobase/mem/mem0pool.c
index 9a5d16cd4a2..4f1ac2bcd7c 100644
--- a/innobase/mem/mem0pool.c
+++ b/innobase/mem/mem0pool.c
@@ -281,7 +281,8 @@ mem_pool_fill_free_list(
fprintf(stderr,
" InnoDB: Error: mem pool free list %lu length is %lu\n"
"InnoDB: though the list is empty!\n",
- i + 1, UT_LIST_GET_LEN(pool->free_list[i + 1]));
+ (ulong) i + 1,
+ (ulong) UT_LIST_GET_LEN(pool->free_list[i + 1]));
}
ret = mem_pool_fill_free_list(i + 1, pool);
@@ -362,7 +363,7 @@ mem_area_alloc(
fprintf(stderr,
"InnoDB: Error: Removing element from mem pool free list %lu though the\n"
"InnoDB: element is not marked free!\n",
- n);
+ (ulong) n);
mem_analyze_corruption((byte*)area);
@@ -382,7 +383,7 @@ mem_area_alloc(
fprintf(stderr,
"InnoDB: Error: Removing element from mem pool free list %lu\n"
"InnoDB: though the list length is 0!\n",
- n);
+ (ulong) n);
mem_analyze_corruption((byte*)area);
ut_error;
@@ -506,7 +507,7 @@ mem_area_free(
fprintf(stderr,
"InnoDB: Error: Memory area size %lu, next area size %lu not a power of 2!\n"
"InnoDB: Possibly a memory overrun of the buffer being freed here.\n",
- size, next_size);
+ (ulong) size, (ulong) next_size);
mem_analyze_corruption((byte*)area);
ut_error;
@@ -605,8 +606,8 @@ mem_pool_validate(
}
}
- ut_a(free + pool->reserved == pool->size
- - (pool->size % MEM_AREA_MIN_SIZE));
+ ut_a(free + pool->reserved == pool->size);
+
mutex_exit(&(pool->mutex));
return(TRUE);
@@ -634,13 +635,13 @@ mem_pool_print_info(
fprintf(outfile,
"Free list length %lu for blocks of size %lu\n",
- UT_LIST_GET_LEN(pool->free_list[i]),
- ut_2_exp(i));
+ (ulong) UT_LIST_GET_LEN(pool->free_list[i]),
+ (ulong) ut_2_exp(i));
}
}
- fprintf(outfile, "Pool size %lu, reserved %lu.\n", pool->size,
- pool->reserved);
+ fprintf(outfile, "Pool size %lu, reserved %lu.\n", (ulong) pool->size,
+ (ulong) pool->reserved);
mutex_exit(&(pool->mutex));
}
diff --git a/innobase/mtr/mtr0log.c b/innobase/mtr/mtr0log.c
index 91ff588713d..5a4aaa2377d 100644
--- a/innobase/mtr/mtr0log.c
+++ b/innobase/mtr/mtr0log.c
@@ -58,7 +58,7 @@ mlog_write_initial_log_record(
if (ptr < buf_pool->frame_zero || ptr >= buf_pool->high_end) {
fprintf(stderr,
"InnoDB: Error: trying to write to a stray memory location %lx\n",
- (ulint)ptr);
+ (ulong) ptr);
ut_error;
}
@@ -171,13 +171,13 @@ mlog_parse_nbytes(
}
if (type == MLOG_1BYTE) {
- if (val > 0xFF) {
+ if (val > 0xFFUL) {
recv_sys->found_corrupt_log = TRUE;
return(NULL);
}
} else if (type == MLOG_2BYTES) {
- if (val > 0xFFFF) {
+ if (val > 0xFFFFUL) {
recv_sys->found_corrupt_log = TRUE;
return(NULL);
@@ -221,7 +221,7 @@ mlog_write_ulint(
if (ptr < buf_pool->frame_zero || ptr >= buf_pool->high_end) {
fprintf(stderr,
"InnoDB: Error: trying to write to a stray memory location %lx\n",
- (ulint)ptr);
+ (ulong) ptr);
ut_error;
}
@@ -268,7 +268,7 @@ mlog_write_dulint(
if (ptr < buf_pool->frame_zero || ptr >= buf_pool->high_end) {
fprintf(stderr,
"InnoDB: Error: trying to write to a stray memory location %lx\n",
- (ulint)ptr);
+ (ulong) ptr);
ut_error;
}
@@ -312,7 +312,7 @@ mlog_write_string(
if (ptr < buf_pool->frame_zero || ptr >= buf_pool->high_end) {
fprintf(stderr,
"InnoDB: Error: trying to write to a stray memory location %lx\n",
- (ulint)ptr);
+ (ulong) ptr);
ut_error;
}
ut_ad(ptr && mtr);
diff --git a/innobase/mtr/mtr0mtr.c b/innobase/mtr/mtr0mtr.c
index b2d8d022f8c..ac1a638063d 100644
--- a/innobase/mtr/mtr0mtr.c
+++ b/innobase/mtr/mtr0mtr.c
@@ -263,11 +263,11 @@ mtr_first_to_modify_page_after_backup(
backup_lsn) <= 0) {
printf("Page %lu newest %lu backup %lu\n",
- block->offset,
- ut_dulint_get_low(
+ (ulong) block->offset,
+ (ulong) ut_dulint_get_low(
buf_frame_get_newest_modification(
block->frame)),
- ut_dulint_get_low(backup_lsn));
+ (ulong) ut_dulint_get_low(backup_lsn));
ret = TRUE;
}
@@ -517,6 +517,6 @@ mtr_print(
{
printf(
"Mini-transaction handle: memo size %lu bytes log size %lu bytes\n",
- dyn_array_get_data_size(&(mtr->memo)),
- dyn_array_get_data_size(&(mtr->log)));
+ (ulong) dyn_array_get_data_size(&(mtr->memo)),
+ (ulong) dyn_array_get_data_size(&(mtr->log)));
}
diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c
index abcb2259e84..7c9272fa13f 100644
--- a/innobase/os/os0file.c
+++ b/innobase/os/os0file.c
@@ -11,6 +11,7 @@ Created 10/21/1995 Heikki Tuuri
#include "os0thread.h"
#include "ut0mem.h"
#include "srv0srv.h"
+#include "srv0start.h"
#include "fil0fil.h"
#include "buf0buf.h"
@@ -33,7 +34,7 @@ ulint os_innodb_umask = 0;
#endif
/* If the following is set to TRUE, we do not call os_file_flush in every
-os_file_write. We can set this TRUE if the doublewrite buffer is used. */
+os_file_write. We can set this TRUE when the doublewrite buffer is used. */
ibool os_do_not_call_flush_at_each_write = FALSE;
/* We use these mutexes to protect lseek + file i/o operation, if the
@@ -154,7 +155,6 @@ os_mutex_t os_file_count_mutex;
ulint os_file_n_pending_preads = 0;
ulint os_file_n_pending_pwrites = 0;
-
/***************************************************************************
Gets the operating system version. Currently works only on Windows. */
@@ -198,9 +198,12 @@ overwrite the error number). If the number is not known to this program,
the OS error number + 100 is returned. */
ulint
-os_file_get_last_error(void)
-/*========================*/
- /* out: error number, or OS error number + 100 */
+os_file_get_last_error(
+/*===================*/
+ /* out: error number, or OS error
+ number + 100 */
+ ibool report_all_errors) /* in: TRUE if we want an error message
+ printed of all errors */
{
ulint err;
@@ -208,25 +211,29 @@ os_file_get_last_error(void)
err = (ulint) GetLastError();
- if (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS) {
+ if (report_all_errors
+ || (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) {
+
ut_print_timestamp(stderr);
fprintf(stderr,
- " InnoDB: Operating system error number %lu in a file operation.\n"
- "InnoDB: See http://www.innodb.com/ibman.html for installation help.\n",
- err);
+ " InnoDB: Operating system error number %lu in a file operation.\n", (ulong) err);
if (err == ERROR_PATH_NOT_FOUND) {
- fprintf(stderr,
- "InnoDB: The error means the system cannot find the path specified.\n"
- "InnoDB: In installation you must create directories yourself, InnoDB\n"
- "InnoDB: does not create them.\n");
+ fprintf(stderr,
+ "InnoDB: The error means the system cannot find the path specified.\n");
+
+ if (srv_is_being_started) {
+ fprintf(stderr,
+ "InnoDB: If you are installing InnoDB, remember that you must create\n"
+ "InnoDB: directories yourself, InnoDB does not create them.\n");
+ }
} else if (err == ERROR_ACCESS_DENIED) {
- fprintf(stderr,
+ fprintf(stderr,
"InnoDB: The error means mysqld does not have the access rights to\n"
"InnoDB: the directory. It may also be you have created a subdirectory\n"
"InnoDB: of the same name as a data file.\n");
} else {
- fprintf(stderr,
+ fprintf(stderr,
"InnoDB: See section 13.2 at http://www.innodb.com/ibman.html\n"
"InnoDB: about operating system error numbers.\n");
}
@@ -246,30 +253,33 @@ os_file_get_last_error(void)
#else
err = (ulint) errno;
- if (err != ENOSPC && err != EEXIST) {
- ut_print_timestamp(stderr);
+ if (report_all_errors
+ || (err != ENOSPC && err != EEXIST)) {
+ ut_print_timestamp(stderr);
fprintf(stderr,
- " InnoDB: Operating system error number %lu in a file operation.\n"
- "InnoDB: See http://www.innodb.com/ibman.html for installation help.\n",
- err);
+ " InnoDB: Operating system error number %lu in a file operation.\n", (ulong) err);
if (err == ENOENT) {
- fprintf(stderr,
- "InnoDB: The error means the system cannot find the path specified.\n"
- "InnoDB: In installation you must create directories yourself, InnoDB\n"
- "InnoDB: does not create them.\n");
+ fprintf(stderr,
+ "InnoDB: The error means the system cannot find the path specified.\n");
+
+ if (srv_is_being_started) {
+ fprintf(stderr,
+ "InnoDB: If you are installing InnoDB, remember that you must create\n"
+ "InnoDB: directories yourself, InnoDB does not create them.\n");
+ }
} else if (err == EACCES) {
- fprintf(stderr,
+ fprintf(stderr,
"InnoDB: The error means mysqld does not have the access rights to\n"
"InnoDB: the directory.\n");
} else {
- if (strerror((int)err) != NULL) {
+ if (strerror((int)err) != NULL) {
fprintf(stderr,
"InnoDB: Error number %lu means '%s'.\n", err, strerror((int)err));
- }
+ }
- fprintf(stderr,
+ fprintf(stderr,
"InnoDB: See also section 13.2 at http://www.innodb.com/ibman.html\n"
"InnoDB: about operating system error numbers.\n");
}
@@ -309,7 +319,7 @@ os_file_handle_error(
UT_NOT_USED(file);
- err = os_file_get_last_error();
+ err = os_file_get_last_error(FALSE);
if (err == OS_FILE_DISK_FULL) {
/* We only print a warning about disk full once */
@@ -336,6 +346,7 @@ os_file_handle_error(
return(FALSE);
} else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
+
return(TRUE);
} else if (err == OS_FILE_ALREADY_EXISTS) {
@@ -359,6 +370,68 @@ os_file_handle_error(
}
/********************************************************************
+Does error handling when a file operation fails. This function only prints
+diagnostics and tells the caller whether to retry; it does not itself stop
+the process. */
+static
+ibool
+os_file_handle_error_no_exit(
+/*=========================*/
+					/* out: TRUE if we should retry the
+					operation */
+	os_file_t	file,		/* in: file pointer; not used */
+	char*		name,		/* in: name of a file or NULL */
+	const char*	operation)	/* in: operation */
+{
+	ulint	last_err;
+
+	UT_NOT_USED(file);
+
+	last_err = os_file_get_last_error(FALSE);
+
+	if (last_err == OS_FILE_DISK_FULL) {
+
+		/* The disk full warning is printed only once */
+
+		if (!os_has_said_disk_full) {
+
+			if (name) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+	" InnoDB: Encountered a problem with file %s\n", name);
+			}
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+	" InnoDB: Disk is full. Try to clean the disk to free space.\n");
+
+			os_has_said_disk_full = TRUE;
+
+			fflush(stderr);
+		}
+
+		return(FALSE);
+	}
+
+	if (last_err == OS_FILE_AIO_RESOURCES_RESERVED) {
+
+		/* The only case where the caller is asked to retry */
+
+		return(TRUE);
+	}
+
+	if (last_err != OS_FILE_ALREADY_EXISTS) {
+
+		/* Any other error: report the file and the failed call */
+
+		if (name) {
+			fprintf(stderr, "InnoDB: File name %s\n", name);
+		}
+
+		fprintf(stderr, "InnoDB: File operation call: '%s'.\n",
+								operation);
+	}
+
+	return(FALSE);
+}
+
+/********************************************************************
Creates the seek mutexes used in positioned reads and writes. */
void
@@ -374,6 +447,262 @@ os_io_init_simple(void)
}
}
+/***************************************************************************
+Opens a directory stream for the directory named by dirname. In both Unix
+and Windows the '.' and '..' items at the start of the directory listing are
+skipped automatically when the stream is read. */
+
+os_file_dir_t
+os_file_opendir(
+/*============*/
+					/* out: directory stream, NULL if
+					error */
+	char*	dirname,		/* in: directory name; it must not
+					contain a trailing '\' or '/' */
+	ibool	error_is_fatal)		/* in: TRUE if we should treat an
+					error as a fatal error; if we try to
+					open symlinks then we do not wish a
+					fatal error if it happens not to be
+					a directory */
+{
+	os_file_dir_t		dir;
+#ifdef __WIN__
+	LPWIN32_FIND_DATA	first_entry;
+	char			pattern[OS_FILE_MAX_PATH + 3];
+
+	ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
+
+	/* Build the search pattern "<dirname>\*" */
+
+	strcpy(pattern, dirname);
+	strcat(pattern, "\\*");
+
+	/* Note that in Windows opening the 'directory stream' also retrieves
+	the first entry in the directory. Since it is '.', that is no problem,
+	as we will skip over the '.' and '..' entries anyway. The retrieved
+	entry is therefore thrown away at once. */
+
+	first_entry = ut_malloc(sizeof(WIN32_FIND_DATA));
+
+	dir = FindFirstFile(pattern, first_entry);
+
+	ut_free(first_entry);
+
+	if (dir != INVALID_HANDLE_VALUE) {
+
+		return(dir);
+	}
+
+	if (error_is_fatal) {
+		os_file_handle_error(NULL, dirname, "opendir");
+	}
+
+	return(NULL);
+#else
+	dir = opendir(dirname);
+
+	if (error_is_fatal && dir == NULL) {
+		os_file_handle_error(0, dirname, "opendir");
+	}
+
+	return(dir);
+#endif
+}
+
+/***************************************************************************
+Closes a directory stream. A failure is reported through
+os_file_handle_error_no_exit(). */
+
+int
+os_file_closedir(
+/*=============*/
+					/* out: 0 if success, -1 if failure */
+	os_file_dir_t	dir)		/* in: directory stream */
+{
+#ifdef __WIN__
+	if (FindClose(dir)) {
+
+		return(0);
+	}
+
+	os_file_handle_error_no_exit(NULL, NULL, "closedir");
+
+	return(-1);
+#else
+	int	rc;
+
+	rc = closedir(dir);
+
+	if (rc != 0) {
+		os_file_handle_error_no_exit(0, NULL, "closedir");
+	}
+
+	/* The return value of closedir() is passed on as such */
+
+	return(rc);
+#endif
+}
+
+/***************************************************************************
+This function returns information of the next file in the directory. We jump
+over the '.' and '..' entries in the directory. The name, size and type of
+the entry are stored to info->name, info->size and info->type. */
+
+int
+os_file_readdir_next_file(
+/*======================*/
+				/* out: 0 if ok, -1 if error, 1 if at the end
+				of the directory */
+	char*		dirname,/* in: directory name or path */
+	os_file_dir_t	dir,	/* in: directory stream */
+	os_file_stat_t*	info)	/* in/out: buffer where the info is returned */
+{
+#ifdef __WIN__
+	LPWIN32_FIND_DATA	lpFindFileData;
+	BOOL			ret;
+
+	lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
+next_file:
+	ret = FindNextFile(dir, lpFindFileData);
+
+	if (ret) {
+		ut_a(strlen(lpFindFileData->cFileName) < OS_FILE_MAX_PATH);
+
+		if (strcmp(lpFindFileData->cFileName, ".") == 0
+		    || strcmp(lpFindFileData->cFileName, "..") == 0) {
+
+			goto next_file;
+		}
+
+		strcpy(info->name, lpFindFileData->cFileName);
+
+		/* Combine the two 32-bit halves of the file size */
+
+		info->size = (ib_longlong)(lpFindFileData->nFileSizeLow)
+		     + (((ib_longlong)(lpFindFileData->nFileSizeHigh)) << 32);
+
+		if (lpFindFileData->dwFileAttributes
+					& FILE_ATTRIBUTE_REPARSE_POINT) {
+/* TODO: test Windows symlinks */
+/* TODO: MySQL has apparently its own symlink implementation in Windows,
+dbname.sym can redirect a database directory:
+http://www.mysql.com/doc/en/Windows_symbolic_links.html */
+			info->type = OS_FILE_TYPE_LINK;
+		} else if (lpFindFileData->dwFileAttributes
+					& FILE_ATTRIBUTE_DIRECTORY) {
+			info->type = OS_FILE_TYPE_DIR;
+		} else {
+			/* FILE_ATTRIBUTE_NORMAL is set only for a file that
+			has no other attribute, so testing that bit would
+			misclassify e.g. files with the archive attribute.
+			We treat everything that is neither a reparse point
+			nor a directory as a regular file. */
+
+			info->type = OS_FILE_TYPE_FILE;
+		}
+	}
+
+	ut_free(lpFindFileData);
+
+	if (ret) {
+		return(0);
+	} else if (GetLastError() == ERROR_NO_MORE_FILES) {
+
+		return(1);
+	} else {
+		os_file_handle_error_no_exit(NULL, dirname,
+						"readdir_next_file");
+		return(-1);
+	}
+#else
+	struct dirent*	ent;
+	char*		full_path;
+	int		ret;
+	struct stat	statinfo;
+next_file:
+	ent = readdir(dir);
+
+	if (ent == NULL) {
+		/* A NULL from readdir() is always reported as the end of
+		the directory */
+
+		return(1);
+	}
+
+	ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
+
+	if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
+
+		goto next_file;
+	}
+
+	strcpy(info->name, ent->d_name);
+
+	/* Build "<dirname>/<entry>" for the stat() call; the extra bytes
+	cover the separator and the terminating null character */
+
+	full_path = ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10);
+
+	sprintf(full_path, "%s/%s", dirname, ent->d_name);
+
+	/* NOTE(review): stat() follows symbolic links, so the S_ISLNK
+	branch below is presumably never taken; lstat() would be needed to
+	detect links - confirm the intended behavior */
+
+	ret = stat(full_path, &statinfo);
+
+	if (ret) {
+		os_file_handle_error_no_exit(0, full_path, "stat");
+
+		ut_free(full_path);
+
+		return(-1);
+	}
+
+	info->size = (ib_longlong)statinfo.st_size;
+
+	if (S_ISDIR(statinfo.st_mode)) {
+		info->type = OS_FILE_TYPE_DIR;
+	} else if (S_ISLNK(statinfo.st_mode)) {
+		info->type = OS_FILE_TYPE_LINK;
+	} else if (S_ISREG(statinfo.st_mode)) {
+		info->type = OS_FILE_TYPE_FILE;
+	} else {
+		info->type = OS_FILE_TYPE_UNKNOWN;
+	}
+
+	ut_free(full_path);
+
+	return(0);
+#endif
+}
+
+/*********************************************************************
+This function attempts to create a directory named pathname. The new directory
+gets default permissions. On Unix the permissions are (0770 & ~umask). If the
+directory exists already, nothing is done and the call succeeds, unless the
+fail_if_exists argument is true. */
+
+ibool
+os_file_create_directory(
+/*=====================*/
+					/* out: TRUE if call succeeds, FALSE on
+					error */
+	char*	pathname,		/* in: directory name as
+					null-terminated string */
+	ibool	fail_if_exists)		/* in: if TRUE, pre-existing directory
+					is treated as an error. */
+{
+#ifdef __WIN__
+	BOOL	rcode;
+
+	rcode = CreateDirectory(pathname, NULL);
+
+	/* CreateDirectory() reports a pre-existing directory with
+	ERROR_ALREADY_EXISTS, not with ERROR_FILE_EXISTS */
+
+	if (!(rcode != 0 ||
+	     (GetLastError() == ERROR_ALREADY_EXISTS && !fail_if_exists))) {
+		/* failure */
+		os_file_handle_error(NULL, pathname, "CreateDirectory");
+
+		return(FALSE);
+	}
+
+	return (TRUE);
+#else
+	int	rcode;
+
+	rcode = mkdir(pathname, 0770);
+
+	/* A pre-existing directory (EEXIST) counts as a success unless the
+	caller asked for it to be an error */
+
+	if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
+		/* failure */
+		os_file_handle_error(0, pathname, "mkdir");
+
+		return(FALSE);
+	}
+
+	return (TRUE);
+#endif
+}
+
/********************************************************************
A simple function to open or create a file. */
@@ -381,7 +710,8 @@ os_file_t
os_file_create_simple(
/*==================*/
/* out, own: handle to the file, not defined if error,
- error number can be retrieved with os_get_last_error */
+ error number can be retrieved with
+ os_file_get_last_error */
char* name, /* in: name of the file or path as a null-terminated
string */
ulint create_mode,/* in: OS_FILE_OPEN if an existing file is opened
@@ -493,13 +823,16 @@ os_file_t
os_file_create_simple_no_error_handling(
/*====================================*/
/* out, own: handle to the file, not defined if error,
- error number can be retrieved with os_get_last_error */
+ error number can be retrieved with
+ os_file_get_last_error */
char* name, /* in: name of the file or path as a null-terminated
string */
ulint create_mode,/* in: OS_FILE_OPEN if an existing file is opened
(if does not exist, error), or OS_FILE_CREATE if a new
file is created (if exists, error) */
- ulint access_type,/* in: OS_FILE_READ_ONLY or OS_FILE_READ_WRITE */
+ ulint access_type,/* in: OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option is used by
+ a backup program reading the file */
ibool* success)/* out: TRUE if succeed, FALSE if error */
{
#ifdef __WIN__
@@ -507,6 +840,7 @@ os_file_create_simple_no_error_handling(
DWORD create_flag;
DWORD access;
DWORD attributes = 0;
+ DWORD share_mode = FILE_SHARE_READ;
ut_a(name);
@@ -523,6 +857,13 @@ os_file_create_simple_no_error_handling(
access = GENERIC_READ;
} else if (access_type == OS_FILE_READ_WRITE) {
access = GENERIC_READ | GENERIC_WRITE;
+ } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
+ access = GENERIC_READ;
+ share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ
+ | FILE_SHARE_WRITE; /* A backup program has to give
+ mysqld the maximum freedom to
+ do what it likes with the
+ file */
} else {
access = 0;
ut_error;
@@ -530,8 +871,7 @@ os_file_create_simple_no_error_handling(
file = CreateFile(name,
access,
- FILE_SHARE_READ,/* file can be read also by other
- processes */
+ share_mode,
NULL, /* default security attributes */
create_flag,
attributes,
@@ -587,13 +927,16 @@ os_file_t
os_file_create(
/*===========*/
/* out, own: handle to the file, not defined if error,
- error number can be retrieved with os_get_last_error */
+ error number can be retrieved with
+ os_file_get_last_error */
char* name, /* in: name of the file or path as a null-terminated
string */
ulint create_mode, /* in: OS_FILE_OPEN if an existing file is opened
(if does not exist, error), or OS_FILE_CREATE if a new
file is created (if exists, error), OS_FILE_OVERWRITE
- if a new is created or an old overwritten */
+ if a new is created or an old overwritten,
+ OS_FILE_OPEN_RAW, if a raw device or disk partition
+ should be opened */
ulint purpose,/* in: OS_FILE_AIO, if asynchronous, non-buffered i/o
is desired, OS_FILE_NORMAL, if any normal file;
NOTE that it also depends on type, os_aio_.. and srv_..
@@ -605,14 +948,17 @@ os_file_create(
{
#ifdef __WIN__
os_file_t file;
+ DWORD share_mode = FILE_SHARE_READ;
DWORD create_flag;
DWORD attributes;
ibool retry;
-
try_again:
ut_a(name);
- if (create_mode == OS_FILE_OPEN) {
+ if (create_mode == OS_FILE_OPEN_RAW) {
+ create_flag = OPEN_EXISTING;
+ share_mode = FILE_SHARE_WRITE;
+ } else if (create_mode == OS_FILE_OPEN) {
create_flag = OPEN_EXISTING;
} else if (create_mode == OS_FILE_CREATE) {
create_flag = CREATE_NEW;
@@ -662,14 +1008,17 @@ try_again:
file = CreateFile(name,
GENERIC_READ | GENERIC_WRITE, /* read and write
access */
- FILE_SHARE_READ,/* File can be read also by other
+ share_mode, /* File can be read also by other
processes; we must give the read
permission because of ibbackup. We do
not give the write permission to
others because if one would succeed to
start 2 instances of mysqld on the
SAME files, that could cause severe
- database corruption! */
+ database corruption! When opening
+ raw disk partitions, Microsoft manuals
+ say that we must give also the write
+ permission. */
NULL, /* default security attributes */
create_flag,
attributes,
@@ -679,8 +1028,8 @@ try_again:
*success = FALSE;
retry = os_file_handle_error(file, name,
- create_mode == OS_FILE_OPEN ?
- "open" : "create");
+ create_mode == OS_FILE_CREATE ?
+ "create" : "open");
if (retry) {
goto try_again;
}
@@ -700,17 +1049,14 @@ try_again:
try_again:
ut_a(name);
- if (create_mode == OS_FILE_OPEN) {
+ if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW) {
mode_str = "OPEN";
-
create_flag = O_RDWR;
} else if (create_mode == OS_FILE_CREATE) {
mode_str = "CREATE";
-
create_flag = O_RDWR | O_CREAT | O_EXCL;
} else if (create_mode == OS_FILE_OVERWRITE) {
mode_str = "OVERWRITE";
-
create_flag = O_RDWR | O_CREAT | O_TRUNC;
} else {
create_flag = 0;
@@ -767,8 +1113,8 @@ try_again:
*success = FALSE;
retry = os_file_handle_error(file, name,
- create_mode == OS_FILE_OPEN ?
- "open" : "create");
+ create_mode == OS_FILE_CREATE ?
+ "create" : "open");
if (retry) {
goto try_again;
}
@@ -781,6 +1127,168 @@ try_again:
}
/***************************************************************************
+Deletes a file if it exists. The file has to be closed before calling this.
+A file that does not exist is not treated as an error. */
+
+ibool
+os_file_delete_if_exists(
+/*=====================*/
+				/* out: TRUE if success */
+	char*	name)		/* in: file path as a null-terminated string */
+{
+#ifdef __WIN__
+	BOOL	ret;
+	DWORD	lasterr;
+	ulint	count	= 0;
+loop:
+	/* In Windows, deleting an .ibd file may fail if ibbackup is copying
+	it */
+
+	ret = DeleteFile((LPCTSTR)name);
+
+	if (ret) {
+		return(TRUE);
+	}
+
+	lasterr = GetLastError();
+
+	if (lasterr == ERROR_FILE_NOT_FOUND
+	    || lasterr == ERROR_PATH_NOT_FOUND) {
+		/* DeleteFile() reports a missing file with
+		ERROR_FILE_NOT_FOUND and a missing directory component with
+		ERROR_PATH_NOT_FOUND: in both cases the file does not exist,
+		and this is not an error */
+
+		return(TRUE);
+	}
+
+	count++;
+
+	/* After 100 failed attempts, warn on every 10th attempt */
+
+	if (count > 100 && 0 == (count % 10)) {
+		fprintf(stderr,
+"InnoDB: Warning: cannot delete file %s\n"
+"InnoDB: Are you running ibbackup to back up the file?\n", name);
+
+		os_file_get_last_error(TRUE); /* print error information */
+	}
+
+	os_thread_sleep(1000000);	/* sleep for a second */
+
+	if (count > 2000) {
+
+		return(FALSE);
+	}
+
+	goto loop;
+#else
+	int	ret;
+
+	ret = unlink((const char*)name);
+
+	/* ENOENT means that there was nothing to delete: not an error */
+
+	if (ret != 0 && errno != ENOENT) {
+		os_file_handle_error(0, name, "delete");
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+#endif
+}
+
+/***************************************************************************
+Deletes a file. The file has to be closed before calling this. Unlike
+os_file_delete_if_exists(), a missing file makes this function return
+FALSE. */
+
+ibool
+os_file_delete(
+/*===========*/
+				/* out: TRUE if success */
+	char*	name)		/* in: file path as a null-terminated string */
+{
+#ifdef __WIN__
+	BOOL	ret;
+	DWORD	lasterr;
+	ulint	count	= 0;
+loop:
+	/* In Windows, deleting an .ibd file may fail if ibbackup is copying
+	it */
+
+	ret = DeleteFile((LPCTSTR)name);
+
+	if (ret) {
+		return(TRUE);
+	}
+
+	lasterr = GetLastError();
+
+	if (lasterr == ERROR_FILE_NOT_FOUND
+	    || lasterr == ERROR_PATH_NOT_FOUND) {
+		/* DeleteFile() reports a missing file with
+		ERROR_FILE_NOT_FOUND and a missing directory component with
+		ERROR_PATH_NOT_FOUND. If the file does not exist, we classify
+		this as a 'mild' error and return at once instead of
+		retrying */
+
+		return(FALSE);
+	}
+
+	count++;
+
+	/* After 100 failed attempts, warn on every 10th attempt */
+
+	if (count > 100 && 0 == (count % 10)) {
+		fprintf(stderr,
+"InnoDB: Warning: cannot delete file %s\n"
+"InnoDB: Are you running ibbackup to back up the file?\n", name);
+
+		os_file_get_last_error(TRUE); /* print error information */
+	}
+
+	os_thread_sleep(1000000);	/* sleep for a second */
+
+	if (count > 2000) {
+
+		return(FALSE);
+	}
+
+	goto loop;
+#else
+	int	ret;
+
+	ret = unlink((const char*)name);
+
+	if (ret != 0) {
+		os_file_handle_error(0, name, "delete");
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+#endif
+}
+
+/***************************************************************************
+Renames a file (can also move it to another directory). It is safest that the
+file is closed before calling this function. */
+
+ibool
+os_file_rename(
+/*===========*/
+				/* out: TRUE if success */
+	char*	oldpath,	/* in: old file path as a null-terminated
+				string */
+	char*	newpath)	/* in: new file path */
+{
+#ifdef __WIN__
+	BOOL	ret;
+
+	ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath);
+
+	if (ret) {
+		return(TRUE);
+	}
+
+	/* Report the failed operation as a rename, like the Unix branch
+	does, so that the error handler names the right operation */
+
+	os_file_handle_error(NULL, oldpath, "rename");
+
+	return(FALSE);
+#else
+	int	ret;
+
+	ret = rename((const char*)oldpath, (const char*)newpath);
+
+	if (ret != 0) {
+		os_file_handle_error(0, oldpath, "rename");
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+#endif
+}
+
+/***************************************************************************
Closes a file handle. In case of error, error number can be retrieved with
os_file_get_last_error. */
@@ -802,6 +1310,7 @@ os_file_close(
}
os_file_handle_error(file, NULL, "close");
+
return(FALSE);
#else
int ret;
@@ -810,6 +1319,7 @@ os_file_close(
if (ret == -1) {
os_file_handle_error(file, NULL, "close");
+
return(FALSE);
}
@@ -889,7 +1399,7 @@ os_file_get_size(
}
if (sizeof(off_t) > 4) {
- *size = (ulint)(offs & 0xFFFFFFFF);
+ *size = (ulint)(offs & 0xFFFFFFFFUL);
*size_high = (ulint)(offs >> 32);
} else {
*size = (ulint) offs;
@@ -901,6 +1411,29 @@ os_file_get_size(
}
/***************************************************************************
+Returns the size of a file as a single 64-bit ib_longlong value, combining
+the two 32-bit halves reported by os_file_get_size(). */
+
+ib_longlong
+os_file_get_size_as_iblonglong(
+/*===========================*/
+				/* out: size in bytes, -1 if error */
+	os_file_t	file)	/* in: handle to a file */
+{
+	ulint	low_word;
+	ulint	high_word;
+
+	if (!os_file_get_size(file, &low_word, &high_word)) {
+
+		return(-1);
+	}
+
+	/* The high word supplies bits 32..63 of the result */
+
+	return((ib_longlong)low_word + (((ib_longlong)high_word) << 32));
+}
+
+/***************************************************************************
Sets a file size. This function can be used to extend or truncate a file. */
ibool
@@ -966,7 +1499,7 @@ os_file_set_size(
!= offset / (ib_longlong)(100 * 1024 * 1024)) {
fprintf(stderr, " %lu00",
- (ulint)((offset + n_bytes)
+ (ulong) ((offset + n_bytes)
/ (ib_longlong)(100 * 1024 * 1024)));
}
@@ -1012,6 +1545,15 @@ os_file_flush(
return(TRUE);
}
+ /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
+ actually a raw device, we choose to ignore that error if we are using
+ raw disks */
+
+ if (srv_start_raw_disk_in_use && GetLastError()
+ == ERROR_INVALID_FUNCTION) {
+ return(TRUE);
+ }
+
os_file_handle_error(file, NULL, "flush");
/* It is a fatal error if a file flush does not succeed, because then
@@ -1035,9 +1577,10 @@ os_file_flush(
}
/* Since Linux returns EINVAL if the 'file' is actually a raw device,
- we choose to ignore that error */
+ we choose to ignore that error if we are using raw disks */
+
+ if (srv_start_raw_disk_in_use && errno == EINVAL) {
- if (errno == EINVAL) {
return(TRUE);
}
@@ -1075,7 +1618,7 @@ os_file_pread(
off_t offs;
ssize_t n_bytes;
- ut_a((offset & 0xFFFFFFFF) == offset);
+ ut_a((offset & 0xFFFFFFFFUL) == offset);
/* If off_t is > 4 bytes in size, then we assume we can pass a
64-bit address */
@@ -1151,7 +1694,7 @@ os_file_pwrite(
ssize_t ret;
off_t offs;
- ut_a((offset & 0xFFFFFFFF) == offset);
+ ut_a((offset & 0xFFFFFFFFUL) == offset);
/* If off_t is > 4 bytes in size, then we assume we can pass a
64-bit address */
@@ -1255,7 +1798,7 @@ os_file_read(
ibool retry;
ulint i;
- ut_a((offset & 0xFFFFFFFF) == offset);
+ ut_a((offset & 0xFFFFFFFFUL) == offset);
os_n_file_reads++;
os_bytes_read_since_printout += n;
@@ -1315,9 +1858,9 @@ error_handling:
fprintf(stderr,
"InnoDB: Fatal error: cannot read from file. OS error number %lu.\n",
#ifdef __WIN__
- (ulint)GetLastError()
+ (ulong) GetLastError()
#else
- (ulint)errno
+ (ulong) errno
#endif
);
fflush(stderr);
@@ -1328,6 +1871,92 @@ error_handling:
}
/***********************************************************************
+Requests a synchronous positioned read operation. A read counts as successful
+only if exactly n bytes were read. On failure the error is passed to
+os_file_handle_error_no_exit(); if that call asks for a retry, the read is
+attempted again, otherwise the function returns FALSE. */
+
+ibool
+os_file_read_no_error_handling(
+/*===========================*/
+ /* out: TRUE if request was
+ successful, FALSE if fail */
+ os_file_t file, /* in: handle to a file */
+ void* buf, /* in: buffer where to read */
+ ulint offset, /* in: least significant 32 bits of file
+ offset where to read */
+ ulint offset_high, /* in: most significant 32 bits of
+ offset */
+ ulint n) /* in: number of bytes to read */
+{
+#ifdef __WIN__
+ BOOL ret;
+ DWORD len;
+ DWORD ret2;
+ DWORD low;
+ DWORD high;
+ ibool retry;
+ ulint i;
+
+ /* The low part of the offset must fit in 32 bits */
+ ut_a((offset & 0xFFFFFFFFUL) == offset);
+
+ /* The statistics are updated before the try_again label, so a retry
+ does not count the same read twice */
+ os_n_file_reads++;
+ os_bytes_read_since_printout += n;
+
+try_again:
+ ut_ad(file);
+ ut_ad(buf);
+ ut_ad(n > 0);
+
+ low = offset;
+ high = offset_high;
+
+ /* Protect the seek / read operation with a mutex */
+ /* The mutex is chosen by the handle value modulo the number of seek
+ mutexes */
+ i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+ os_mutex_enter(os_file_seek_mutexes[i]);
+
+ ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
+
+ /* 0xFFFFFFFF is treated as a seek failure only if GetLastError() also
+ reports an error */
+ if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
+
+ os_mutex_exit(os_file_seek_mutexes[i]);
+
+ goto error_handling;
+ }
+
+ ret = ReadFile(file, buf, n, &len, NULL);
+
+ os_mutex_exit(os_file_seek_mutexes[i]);
+
+ /* A short read falls through to the error handling below */
+ if (ret && len == n) {
+ return(TRUE);
+ }
+#else
+ ibool retry;
+ ssize_t ret;
+
+ os_bytes_read_since_printout += n;
+
+try_again:
+ ret = os_file_pread(file, buf, n, offset, offset_high);
+
+ /* A short read or an error return falls through to the error handling
+ below */
+ if ((ulint)ret == n) {
+
+ return(TRUE);
+ }
+#endif
+#ifdef __WIN__
+error_handling:
+#endif
+ /* Common failure path of both platforms */
+ retry = os_file_handle_error_no_exit(file, NULL, "read");
+
+ if (retry) {
+ goto try_again;
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************************
Requests a synchronous write operation. */
ibool
@@ -1384,8 +2013,8 @@ retry:
"InnoDB: offset %lu %lu. Operating system error number %lu.\n"
"InnoDB: Look from section 13.2 at http://www.innodb.com/ibman.html\n"
"InnoDB: what the error number means.\n",
- name, offset_high, offset,
- (ulint)GetLastError());
+ name, (ulong) offset_high, (ulong) offset,
+ (ulong) GetLastError());
return(FALSE);
}
@@ -1431,12 +2060,12 @@ retry:
"InnoDB: Operating system error number %lu.\n"
"InnoDB: Check that your OS and file system support files of this size.\n"
"InnoDB: Check also that the disk is not full or a disk quota exceeded.\n",
- name, offset_high, offset, n, (ulint)len,
- err);
+ name, (ulong) offset_high, (ulong) offset,
+ (ulong) n, (ulong) len, (ulong) err);
if (strerror((int)err) != NULL) {
fprintf(stderr,
-"InnoDB: Error number %lu means '%s'.\n", err, strerror((int)err));
+"InnoDB: Error number %lu means '%s'.\n", (ulong) err, strerror((int)err));
}
fprintf(stderr,
@@ -2482,7 +3111,7 @@ os_aio_simulated_handle(
ulint biggest_age;
ulint age;
byte* combined_buf;
- byte* combined_buf2= 0; /* Remove warning */
+ byte* combined_buf2;
ibool ret;
ulint n;
ulint i;
@@ -2522,7 +3151,7 @@ restart:
if (os_aio_print_debug) {
fprintf(stderr,
-"InnoDB: i/o for slot %lu already done, returning\n", i);
+"InnoDB: i/o for slot %lu already done, returning\n", (ulong) i);
}
ret = TRUE;
@@ -2669,8 +3298,8 @@ consecutive_loop:
if (os_aio_print_debug) {
fprintf(stderr,
"InnoDB: doing i/o of type %lu at offset %lu %lu, length %lu\n",
- slot->type, slot->offset_high, slot->offset,
- total_len);
+ (ulong) slot->type, (ulong) slot->offset_high,
+ (ulong) slot->offset, (ulong) total_len);
}
/* Do the i/o with ordinary, synchronous i/o functions: */
@@ -2680,8 +3309,9 @@ consecutive_loop:
|| (slot->offset % UNIV_PAGE_SIZE != 0)) {
fprintf(stderr,
"InnoDB: Error: trying a displaced write to %s %lu %lu, len %lu\n",
- slot->name, slot->offset_high,
- slot->offset, total_len);
+ slot->name, (ulong) slot->offset_high,
+ (ulong) slot->offset,
+ (ulong) total_len);
ut_error;
}
@@ -2780,7 +3410,7 @@ recommended_sleep:
if (os_aio_print_debug) {
fprintf(stderr,
"InnoDB: i/o handler thread for i/o segment %lu wakes up\n",
- global_segment);
+ (ulong) global_segment);
}
goto restart;
@@ -2862,7 +3492,8 @@ os_aio_print(
}
for (i = 0; i < srv_n_file_io_threads; i++) {
- buf += sprintf(buf, "I/O thread %lu state: %s (%s)\n", i,
+ buf += sprintf(buf, "I/O thread %lu state: %s (%s)\n",
+ (ulong) i,
srv_io_thread_op_info[i],
srv_io_thread_function[i]);
}
@@ -2894,7 +3525,7 @@ loop:
ut_a(array->n_reserved == n_reserved);
- buf += sprintf(buf, " %lu", n_reserved);
+ buf += sprintf(buf, " %lu", (ulong) n_reserved);
os_mutex_exit(array->mutex);
@@ -2934,15 +3565,18 @@ loop:
buf += sprintf(buf,
"Pending flushes (fsync) log: %lu; buffer pool: %lu\n",
- fil_n_pending_log_flushes, fil_n_pending_tablespace_flushes);
+ (ulong) fil_n_pending_log_flushes,
+ (ulong) fil_n_pending_tablespace_flushes);
buf += sprintf(buf,
"%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
- os_n_file_reads, os_n_file_writes, os_n_fsyncs);
+ (ulong) os_n_file_reads, (ulong) os_n_file_writes,
+ (ulong) os_n_fsyncs);
if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
buf += sprintf(buf,
"%lu pending preads, %lu pending pwrites\n",
- os_file_n_pending_preads, os_file_n_pending_pwrites);
+ (ulong) os_file_n_pending_preads,
+ (ulong) os_file_n_pending_pwrites);
}
if (os_n_file_reads == os_n_file_reads_old) {
@@ -2956,7 +3590,7 @@ loop:
"%.2f reads/s, %lu avg bytes/read, %.2f writes/s, %.2f fsyncs/s\n",
(os_n_file_reads - os_n_file_reads_old)
/ time_elapsed,
- (ulint)avg_bytes_read,
+ (ulong)avg_bytes_read,
(os_n_file_writes - os_n_file_writes_old)
/ time_elapsed,
(os_n_fsyncs - os_n_fsyncs_old)
diff --git a/innobase/os/os0proc.c b/innobase/os/os0proc.c
index 2099d62e7fd..85791c55348 100644
--- a/innobase/os/os0proc.c
+++ b/innobase/os/os0proc.c
@@ -12,11 +12,469 @@ Created 9/30/1995 Heikki Tuuri
#include "os0proc.ic"
#endif
+#include "ut0mem.h"
+#include "ut0byte.h"
+
+
+/*
+How to get AWE to compile on Windows?
+-------------------------------------
+
+In the Visual C++ project settings of the innobase project,
+__WIN2000__ has to be defined.
+
+The Visual C++ has to be relatively recent and _WIN32_WINNT has to be
+defined to a value >= 0x0500 when windows.h is included.
+
+#define _WIN32_WINNT 0x0500
+
+Where does AWE work?
+-------------------
+
+See the error message in os_awe_allocate_physical_mem().
+
+How to assign privileges for mysqld to use AWE?
+-----------------------------------------------
+
+See the error message in os_awe_enable_lock_pages_in_mem().
+
+Use Windows AWE functions in this order
+---------------------------------------
+
+(1) os_awe_enable_lock_pages_in_mem();
+(2) os_awe_allocate_physical_mem();
+(3) os_awe_allocate_virtual_mem_window();
+(4) os_awe_map_physical_mem_to_window().
+
+To test 'AWE' in a computer which does not have the AWE API,
+you can compile with UNIV_SIMULATE_AWE defined in this file.
+*/
+
+#ifdef UNIV_SIMULATE_AWE
+/* If we simulate AWE, we allocate the 'physical memory' here */
+/* 4096-byte aligned buffer set up in os_awe_allocate_physical_mem() */
+byte* os_awe_simulate_mem;
+/* Size of the simulated 'physical memory' in bytes */
+ulint os_awe_simulate_mem_size;
+/* Page info array handed out by os_awe_allocate_physical_mem(): one os_awe_t
+slot per OS_AWE_X86_PAGE_SIZE page */
+os_awe_t* os_awe_simulate_page_info;
+/* 4096-byte aligned buffer set up in os_awe_allocate_virtual_mem_window() */
+byte* os_awe_simulate_window;
+/* Size of the simulated window in bytes */
+ulint os_awe_simulate_window_size;
+/* In simulated AWE the following contains a NULL pointer or a pointer
+to a mapped 'physical page' for each 4 kB page in the AWE window */
+byte** os_awe_simulate_map;
+#endif
+
+#ifdef __WIN2000__
+/* Page info array set by os_awe_allocate_physical_mem() */
+os_awe_t* os_awe_page_info;
+/* Number of physical pages requested in os_awe_allocate_physical_mem() */
+ulint os_awe_n_pages;
+/* Start of the window reserved by os_awe_allocate_virtual_mem_window() */
+byte* os_awe_window;
+/* Size of that window in bytes */
+ulint os_awe_window_size;
+#endif
+
+/********************************************************************
+Windows AWE support. Tries to enable the "lock pages in memory" privilege for
+the current process so that the current process can allocate memory-locked
+virtual address space to act as the window where AWE maps physical memory.
+The process token opened here is closed on every return path. */
+
+ibool
+os_awe_enable_lock_pages_in_mem(void)
+/*=================================*/
+				/* out: TRUE if success, FALSE if error;
+				prints error info to stderr if no success */
+{
+#ifdef UNIV_SIMULATE_AWE
+
+	return(TRUE);
+
+#elif defined(__WIN2000__)
+	struct {
+		DWORD			Count;
+		LUID_AND_ATTRIBUTES	Privilege[1];
+	} Info;
+	HANDLE	hProcess;
+	HANDLE	Token;
+	BOOL	Result;
+	DWORD	err;
+
+	hProcess = GetCurrentProcess();
+
+	/* Open the token of the current process */
+
+	Result = OpenProcessToken(hProcess,
+				TOKEN_ADJUST_PRIVILEGES,
+				&Token);
+	if (!Result) {
+		fprintf(stderr,
+	"InnoDB: AWE: Cannot open process token, error %lu\n",
+			(unsigned long) GetLastError());
+		return(FALSE);
+	}
+
+	Info.Count = 1;
+
+	Info.Privilege[0].Attributes = SE_PRIVILEGE_ENABLED;
+
+	/* Get the local unique identifier (LUID) of the SE_LOCK_MEMORY
+	privilege */
+
+	Result = LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME,
+				&(Info.Privilege[0].Luid));
+	if (!Result) {
+		/* Save the error code before CloseHandle() can touch it */
+		err = GetLastError();
+
+		CloseHandle(Token);
+
+		fprintf(stderr,
+	"InnoDB: AWE: Cannot get local privilege value for %s, error %lu.\n",
+			SE_LOCK_MEMORY_NAME, (unsigned long) err);
+
+		return(FALSE);
+	}
+
+	/* Try to adjust the privilege */
+
+	Result = AdjustTokenPrivileges(Token, FALSE,
+				(PTOKEN_PRIVILEGES)&Info,
+				0, NULL, NULL);
+
+	/* The last error code has to be inspected even when the call
+	itself succeeds: save it before closing the token, which is no
+	longer needed from this point on */
+
+	err = GetLastError();
+
+	CloseHandle(Token);
+
+	/* Check the result */
+
+	if (!Result) {
+		fprintf(stderr,
+	"InnoDB: AWE: Cannot adjust process token privileges, error %lu.\n",
+			(unsigned long) err);
+		return(FALSE);
+	} else if (err != ERROR_SUCCESS) {
+		fprintf(stderr,
+"InnoDB: AWE: Cannot enable SE_LOCK_MEMORY privilege, error %lu.\n"
+"InnoDB: In Windows XP Home you cannot use AWE. In Windows 2000 and XP\n"
+"InnoDB: Professional you must go to the Control Panel, to\n"
+"InnoDB: Security Settings, to Local Policies, and enable\n"
+"InnoDB: the 'lock pages in memory' privilege for the user who runs\n"
+"InnoDB: the MySQL server.\n", (unsigned long) err);
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+#else
#ifdef __WIN__
-#include <windows.h>
+	fprintf(stderr,
+"InnoDB: AWE: Error: to use AWE you must use a ...-nt MySQL executable.\n");
+#endif
+	return(FALSE);
#endif
+}
-#include "ut0mem.h"
+/********************************************************************
+Allocates physical RAM memory up to 64 GB in an Intel 32-bit x86
+processor. If the allocation fails after the page info array has been
+created, the array (and any physical pages already obtained) is released
+again and *page_info is set to NULL. */
+
+ibool
+os_awe_allocate_physical_mem(
+/*=========================*/
+				/* out: TRUE if success */
+	os_awe_t** page_info,	/* out, own: array of opaque data containing
+				the info for allocated physical memory pages;
+				each allocated 4 kB physical memory page has
+				one slot of type os_awe_t in the array */
+	ulint	  n_megabytes)	/* in: number of megabytes to allocate */
+{
+#ifdef UNIV_SIMULATE_AWE
+	os_awe_simulate_page_info = ut_malloc(sizeof(os_awe_t) *
+		n_megabytes * ((1024 * 1024) / OS_AWE_X86_PAGE_SIZE));
+
+	/* Allocate 4096 extra bytes so that the start can be aligned */
+
+	os_awe_simulate_mem = ut_align(ut_malloc(
+					4096 + 1024 * 1024 * n_megabytes),
+					4096);
+	os_awe_simulate_mem_size = n_megabytes * 1024 * 1024;
+
+	*page_info = os_awe_simulate_page_info;
+
+	return(TRUE);
+
+#elif defined(__WIN2000__)
+	BOOL		bResult;
+	os_awe_t	NumberOfPages;	/* Question: why does Windows
+					use the name ULONG_PTR for
+					a scalar integer type? Maybe
+					because we may also refer to
+					&NumberOfPages? */
+	os_awe_t	NumberOfPagesInitial;
+	SYSTEM_INFO	sSysInfo;
+	int		PFNArraySize;
+
+	if (n_megabytes > 64 * 1024) {
+
+		fprintf(stderr,
+"InnoDB: AWE: Error: tried to allocate %lu MB.\n"
+"InnoDB: AWE cannot allocate more than 64 GB in any computer.\n",
+			(unsigned long) n_megabytes);
+
+		return(FALSE);
+	}
+
+	GetSystemInfo(&sSysInfo);  /* fill the system information structure */
+
+	if ((ulint)OS_AWE_X86_PAGE_SIZE != (ulint)sSysInfo.dwPageSize) {
+		fprintf(stderr,
+"InnoDB: AWE: Error: this computer has a page size of %lu.\n"
+"InnoDB: Should be 4096 bytes for InnoDB AWE support to work.\n",
+			(unsigned long) sSysInfo.dwPageSize);
+
+		return(FALSE);
+	}
+
+	/* Calculate the number of pages of memory to request */
+
+	NumberOfPages = n_megabytes * ((1024 * 1024) / OS_AWE_X86_PAGE_SIZE);
+
+	/* Calculate the size of page_info for allocated physical pages */
+
+	PFNArraySize = NumberOfPages * sizeof(os_awe_t);
+
+	*page_info = (os_awe_t*)HeapAlloc(GetProcessHeap(), 0, PFNArraySize);
+
+	if (*page_info == NULL) {
+		fprintf(stderr,
+"InnoDB: AWE: Failed to allocate page info array from process heap, error %lu\n",
+			(unsigned long) GetLastError());
+
+		return(FALSE);
+	}
+
+	ut_total_allocated_memory += PFNArraySize;
+
+	/* Enable this process' privilege to lock pages to physical memory */
+
+	if (!os_awe_enable_lock_pages_in_mem()) {
+
+		goto free_page_info;
+	}
+
+	/* Allocate the physical memory */
+
+	NumberOfPagesInitial = NumberOfPages;
+
+	/* Compilation note: if the compiler complains the function is not
+	defined, see the note at the start of this file */
+
+	bResult = AllocateUserPhysicalPages(GetCurrentProcess(),
+						&NumberOfPages,
+						*page_info);
+	if (!bResult) {
+		fprintf(stderr,
+"InnoDB: AWE: Cannot allocate physical pages, error %lu.\n",
+			(unsigned long) GetLastError());
+
+		goto free_page_info;
+	}
+
+	if (NumberOfPagesInitial != NumberOfPages) {
+		fprintf(stderr,
+"InnoDB: AWE: Error: allocated only %lu pages of %lu requested.\n"
+"InnoDB: Check that you have enough free RAM.\n"
+"InnoDB: In Windows XP Professional and 2000 Professional\n"
+"InnoDB: Windows PAE size is max 4 GB. In 2000 and .NET\n"
+"InnoDB: Advanced Servers and 2000 Datacenter Server it is 32 GB,\n"
+"InnoDB: and in .NET Datacenter Server it is 64 GB.\n"
+"InnoDB: A Microsoft web page said that the processor must be an Intel\n"
+"InnoDB: processor.\n",
+			(unsigned long) NumberOfPages,
+			(unsigned long) NumberOfPagesInitial);
+
+		/* Give back the pages we did get: the caller cannot use a
+		partial allocation */
+
+		if (NumberOfPages > 0) {
+			FreeUserPhysicalPages(GetCurrentProcess(),
+						&NumberOfPages,
+						*page_info);
+		}
+
+		goto free_page_info;
+	}
+
+	/* Publish the array in the globals only after the allocation has
+	fully succeeded, so that they never point to freed memory */
+
+	os_awe_page_info = *page_info;
+	os_awe_n_pages = (ulint)NumberOfPages;
+
+	fprintf(stderr,
+"InnoDB: Using Address Windowing Extensions (AWE); allocated %lu MB\n",
+		(unsigned long) n_megabytes);
+
+	return(TRUE);
+
+free_page_info:
+	/* Failure after the page info array was created: release it and
+	undo the memory accounting */
+
+	HeapFree(GetProcessHeap(), 0, *page_info);
+	*page_info = NULL;
+
+	ut_total_allocated_memory -= PFNArraySize;
+
+	return(FALSE);
+#else
+	return(FALSE);
+#endif
+}
+
+/********************************************************************
+Reserves a window in the virtual address space; pages of physical memory
+can then be mapped into this window. */
+
+byte*
+os_awe_allocate_virtual_mem_window(
+/*===============================*/
+			/* out, own: allocated memory, or NULL if did not
+			succeed */
+	ulint	size)	/* in: virtual memory allocation size in bytes, must
+			be < 2 GB */
+{
+#ifdef UNIV_SIMULATE_AWE
+	ulint	n_slots	= size / 4096;
+	ulint	slot;
+
+	/* 4096 extra bytes leave room for aligning the window start */
+
+	os_awe_simulate_window = ut_align(ut_malloc(4096 + size), 4096);
+	os_awe_simulate_window_size = size;
+
+	/* One map entry per 4 kB page of the window; nothing is mapped
+	at the start */
+
+	os_awe_simulate_map = ut_malloc(sizeof(byte*) * n_slots);
+
+	for (slot = 0; slot < n_slots; slot++) {
+		os_awe_simulate_map[slot] = NULL;
+	}
+
+	return(os_awe_simulate_window);
+
+#elif defined(__WIN2000__)
+	byte*	window;
+
+	if (size > (ulint)0x7FFFFFFFUL) {
+		fprintf(stderr,
+"InnoDB: AWE: Cannot allocate %lu bytes of virtual memory\n", size);
+
+		return(NULL);
+	}
+
+	window = VirtualAlloc(NULL, (SIZE_T)size,
+				MEM_RESERVE | MEM_PHYSICAL,
+				PAGE_READWRITE);
+	if (window == NULL) {
+		fprintf(stderr,
+"InnoDB: AWE: Cannot allocate %lu bytes of virtual memory, error %lu\n",
+		size, (ulint)GetLastError());
+
+		return(NULL);
+	}
+
+	/* Remember the window for the bounds checks done when mapping */
+
+	os_awe_window = window;
+	os_awe_window_size = size;
+
+	ut_total_allocated_memory += size;
+
+	return(window);
+#else
+	return(NULL);
+#endif
+}
+
+/********************************************************************
+With this function you can map parts of physical memory allocated with
+the ..._allocate_physical_mem to the virtual address space allocated with
+the previous function. Intel implements this so that the process page
+tables are updated accordingly. A test on a 1.5 GHz AMD processor and XP
+showed that this takes < 1 microsecond, much better than the estimated 80 us
+for copying a 16 kB page memory to memory. But, the operation will at least
+partially invalidate the translation lookaside buffer (TLB) of all
+processors. Under a real-world load the performance hit may be bigger.
+In the simulated AWE build the 'mapping' is done by copying 4096-byte pages
+between the simulated physical memory and the simulated window. In a build
+with neither UNIV_SIMULATE_AWE nor __WIN2000__ defined the function only
+returns FALSE. */
+
+ibool
+os_awe_map_physical_mem_to_window(
+/*==============================*/
+ /* out: TRUE if success; the function
+ calls exit(1) in case of an error */
+ byte* ptr, /* in: a page-aligned pointer to
+ somewhere in the virtual address
+ space window; we map the physical mem
+ pages here */
+ ulint n_mem_pages, /* in: number of 4 kB mem pages to
+ map */
+ os_awe_t* page_info) /* in: array of page infos for those
+ pages; each page has one slot in the
+ array */
+{
+#ifdef UNIV_SIMULATE_AWE
+ ulint i;
+ byte** map;
+ byte* page;
+ byte* phys_page;
+
+ /* Only the start of the mapping is checked against the window and
+ the page info array; n_mem_pages is not taken into account here */
+ ut_a(ptr >= os_awe_simulate_window);
+ ut_a(ptr < os_awe_simulate_window + os_awe_simulate_window_size);
+ ut_a(page_info >= os_awe_simulate_page_info);
+ ut_a(page_info < os_awe_simulate_page_info +
+ (os_awe_simulate_mem_size / 4096));
+
+ /* First look if some other 'physical pages' are mapped at ptr,
+ and copy them back to where they were if yes */
+
+ map = os_awe_simulate_map
+ + ((ulint)(ptr - os_awe_simulate_window)) / 4096;
+ page = ptr;
+
+ for (i = 0; i < n_mem_pages; i++) {
+ if (*map != NULL) {
+ ut_memcpy(*map, page, 4096);
+ }
+ map++;
+ page += 4096;
+ }
+
+ /* Then copy to ptr the 'physical pages' determined by page_info; we
+ assume page_info is a segment of the array we created at the start */
+
+ phys_page = os_awe_simulate_mem
+ + (ulint)(page_info - os_awe_simulate_page_info)
+ * 4096;
+
+ ut_memcpy(ptr, phys_page, n_mem_pages * 4096);
+
+ /* Update the map */
+
+ map = os_awe_simulate_map
+ + ((ulint)(ptr - os_awe_simulate_window)) / 4096;
+
+ for (i = 0; i < n_mem_pages; i++) {
+ *map = phys_page;
+
+ map++;
+ phys_page += 4096;
+ }
+
+ return(TRUE);
+
+#elif defined(__WIN2000__)
+ BOOL bResult;
+ os_awe_t n_pages;
+
+ n_pages = (os_awe_t)n_mem_pages;
+
+ /* Each failed sanity check below prints a diagnostic and then fails
+ the ut_a(0) assertion */
+
+ if (!(ptr >= os_awe_window)) {
+ fprintf(stderr,
+"InnoDB: AWE: Error: trying to map to address %lx but AWE window start %lx\n",
+ (ulint)ptr, (ulint)os_awe_window);
+ ut_a(0);
+ }
+
+ /* NOTE(review): this upper bound leaves room for UNIV_PAGE_SIZE bytes
+ regardless of n_mem_pages; presumably callers always map exactly one
+ UNIV_PAGE_SIZE page - confirm */
+ if (!(ptr <= os_awe_window + os_awe_window_size - UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+"InnoDB: AWE: Error: trying to map to address %lx but AWE window end %lx\n",
+ (ulint)ptr, (ulint)os_awe_window + os_awe_window_size);
+ ut_a(0);
+ }
+
+ if (!(page_info >= os_awe_page_info)) {
+ fprintf(stderr,
+"InnoDB: AWE: Error: trying to map page info at %lx but array start %lx\n",
+ (ulint)page_info, (ulint)os_awe_page_info);
+ ut_a(0);
+ }
+
+ /* NOTE(review): the constant 4 is used here instead of n_mem_pages;
+ presumably it stands for the four 4 kB pages of one 16 kB page -
+ confirm */
+ if (!(page_info <= os_awe_page_info + (os_awe_n_pages - 4))) {
+ fprintf(stderr,
+"InnoDB: AWE: Error: trying to map page info at %lx but array end %lx\n",
+ (ulint)page_info, (ulint)(os_awe_page_info + os_awe_n_pages));
+ ut_a(0);
+ }
+
+ bResult = MapUserPhysicalPages((PVOID)ptr, n_pages, page_info);
+
+ /* A failed mapping is fatal: print the error and terminate the
+ process */
+ if (bResult != TRUE) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: AWE: Mapping of %lu physical pages to address %lx failed,\n"
+"InnoDB: error %lu.\n"
+"InnoDB: Cannot continue operation.\n",
+ n_mem_pages, (ulint)ptr, (ulint)GetLastError());
+ exit(1);
+ }
+
+ return(TRUE);
+#else
+ return(FALSE);
+#endif
+}
/********************************************************************
Converts the current process id to a number. It is not guaranteed that the
diff --git a/innobase/os/os0sync.c b/innobase/os/os0sync.c
index 827d68501db..7cbaf1f5123 100644
--- a/innobase/os/os0sync.c
+++ b/innobase/os/os0sync.c
@@ -125,7 +125,7 @@ os_event_create(
if (!event->handle) {
fprintf(stderr,
"InnoDB: Could not create a Windows event semaphore; Windows error %lu\n",
- (ulint)GetLastError());
+ (ulong) GetLastError());
}
#else /* Unix */
os_event_t event;
@@ -182,7 +182,7 @@ os_event_create_auto(
if (!event->handle) {
fprintf(stderr,
"InnoDB: Could not create a Windows auto event semaphore; Windows error %lu\n",
- (ulint)GetLastError());
+ (ulong) GetLastError());
}
/* Put to the list of events */
@@ -412,7 +412,7 @@ os_event_wait_multiple(
FALSE, /* Wait for any 1 event */
INFINITE); /* Infinite wait time
limit */
- ut_a(index >= WAIT_OBJECT_0);
+ ut_a(index >= WAIT_OBJECT_0); /* NOTE: Pointless comparision */
ut_a(index < WAIT_OBJECT_0 + n);
if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
diff --git a/innobase/page/page0cur.c b/innobase/page/page0cur.c
index b08efacf43a..eb2ba5b8bf8 100644
--- a/innobase/page/page0cur.c
+++ b/innobase/page/page0cur.c
@@ -629,7 +629,7 @@ page_cur_parse_insert_rec(
return(NULL);
}
- extra_info_yes = end_seg_len & 0x1;
+ extra_info_yes = end_seg_len & 0x1UL;
end_seg_len = end_seg_len / 2;
if (end_seg_len >= UNIV_PAGE_SIZE) {
@@ -702,11 +702,14 @@ page_cur_parse_insert_rec(
/* Build the inserted record to buf */
if (mismatch_index >= UNIV_PAGE_SIZE) {
- printf("Is short %lu, info_bits %lu, offset %lu, o_offset %lu\n"
+ printf(
+ "Is short %lu, info_bits %lu, offset %lu, o_offset %lu\n"
"mismatch index %lu, end_seg_len %lu\n"
"parsed len %lu\n",
- is_short, info_bits, offset, origin_offset,
- mismatch_index, end_seg_len, (ulint)(ptr - ptr2));
+ (ulong) is_short, (ulong) info_bits, (ulong) offset,
+ (ulong) origin_offset,
+ (ulong) mismatch_index, (ulong) end_seg_len,
+ (ulong) (ptr - ptr2));
printf("Dump of 300 bytes of log:\n");
ut_print_buf(ptr2, 300);
diff --git a/innobase/page/page0page.c b/innobase/page/page0page.c
index 21adcdea635..c64a7590b94 100644
--- a/innobase/page/page0page.c
+++ b/innobase/page/page0page.c
@@ -94,13 +94,13 @@ page_dir_find_owner_slot(
if (i == 0) {
fprintf(stderr,
"InnoDB: Probable data corruption on page %lu\n",
- buf_frame_get_page_no(page));
+ (ulong) buf_frame_get_page_no(page));
rec_sprintf(err_buf, 900, original_rec);
fprintf(stderr,
"InnoDB: Original record %s\n"
- "InnoDB: on that page. Steps %lu.\n", err_buf, steps);
+ "InnoDB: on that page. Steps %lu.\n", err_buf, (ulong) steps);
rec_sprintf(err_buf, 900, rec);
@@ -438,9 +438,9 @@ page_copy_rec_list_end_no_locks(
fprintf(stderr,
"InnoDB: rec offset %lu, cur1 offset %lu, cur2 offset %lu\n",
- (ulint)(rec - page),
- (ulint)(page_cur_get_rec(&cur1) - page),
- (ulint)(page_cur_get_rec(&cur2) - new_page));
+ (ulong)(rec - page),
+ (ulong)(page_cur_get_rec(&cur1) - page),
+ (ulong)(page_cur_get_rec(&cur2) - new_page));
ut_error;
}
@@ -554,7 +554,8 @@ byte*
page_parse_delete_rec_list(
/*=======================*/
/* out: end of log record or NULL */
- byte type, /* in: MLOG_LIST_END_DELETE or MLOG_LIST_START_DELETE */
+ byte type, /* in: MLOG_LIST_END_DELETE or
+ MLOG_LIST_START_DELETE */
byte* ptr, /* in: buffer */
byte* end_ptr,/* in: buffer end */
page_t* page, /* in: page or NULL */
@@ -1123,9 +1124,9 @@ page_rec_print(
rec_print(rec);
printf(
" n_owned: %lu; heap_no: %lu; next rec: %lu\n",
- rec_get_n_owned(rec),
- rec_get_heap_no(rec),
- rec_get_next_offs(rec));
+ (ulong) rec_get_n_owned(rec),
+ (ulong) rec_get_heap_no(rec),
+ (ulong) rec_get_next_offs(rec));
page_rec_check(rec);
rec_validate(rec);
@@ -1149,9 +1150,9 @@ page_dir_print(
printf("--------------------------------\n");
printf("PAGE DIRECTORY\n");
- printf("Page address %lx\n", (ulint)page);
+ printf("Page address %lx\n", (ulong)page);
printf("Directory stack top at offs: %lu; number of slots: %lu\n",
- (ulint)(page_dir_get_nth_slot(page, n - 1) - page), n);
+ (ulong)(page_dir_get_nth_slot(page, n - 1) - page), (ulong) n);
for (i = 0; i < n; i++) {
slot = page_dir_get_nth_slot(page, i);
if ((i == pr_n) && (i < n - pr_n)) {
@@ -1160,11 +1161,11 @@ page_dir_print(
if ((i < pr_n) || (i >= n - pr_n)) {
printf(
"Contents of slot: %lu: n_owned: %lu, rec offs: %lu\n",
- i, page_dir_slot_get_n_owned(slot),
- (ulint)(page_dir_slot_get_rec(slot) - page));
+ (ulong) i, (ulong) page_dir_slot_get_n_owned(slot),
+ (ulong)(page_dir_slot_get_rec(slot) - page));
}
}
- printf("Total of %lu records\n", 2 + page_get_n_recs(page));
+ printf("Total of %lu records\n", (ulong) (2 + page_get_n_recs(page)));
printf("--------------------------------\n");
}
@@ -1185,7 +1186,7 @@ page_print_list(
printf("--------------------------------\n");
printf("PAGE RECORD LIST\n");
- printf("Page address %lu\n", (ulint)page);
+ printf("Page address %lu\n", (ulong) page);
n_recs = page_get_n_recs(page);
@@ -1222,7 +1223,7 @@ page_print_list(
count++;
}
- printf("Total of %lu records \n", count + 1);
+ printf("Total of %lu records \n", (ulong) (count + 1));
printf("--------------------------------\n");
}
@@ -1236,22 +1237,22 @@ page_header_print(
{
printf("--------------------------------\n");
printf("PAGE HEADER INFO\n");
- printf("Page address %lx, n records %lu\n", (ulint)page,
- page_header_get_field(page, PAGE_N_RECS));
+ printf("Page address %lx, n records %lu\n", (ulong) page,
+ (ulong) page_header_get_field(page, PAGE_N_RECS));
printf("n dir slots %lu, heap top %lu\n",
- page_header_get_field(page, PAGE_N_DIR_SLOTS),
- page_header_get_field(page, PAGE_HEAP_TOP));
+ (ulong) page_header_get_field(page, PAGE_N_DIR_SLOTS),
+ (ulong) page_header_get_field(page, PAGE_HEAP_TOP));
printf("Page n heap %lu, free %lu, garbage %lu\n",
- page_header_get_field(page, PAGE_N_HEAP),
- page_header_get_field(page, PAGE_FREE),
- page_header_get_field(page, PAGE_GARBAGE));
+ (ulong) page_header_get_field(page, PAGE_N_HEAP),
+ (ulong) page_header_get_field(page, PAGE_FREE),
+ (ulong) page_header_get_field(page, PAGE_GARBAGE));
printf("Page last insert %lu, direction %lu, n direction %lu\n",
- page_header_get_field(page, PAGE_LAST_INSERT),
- page_header_get_field(page, PAGE_DIRECTION),
- page_header_get_field(page, PAGE_N_DIRECTION));
+ (ulong) page_header_get_field(page, PAGE_LAST_INSERT),
+ (ulong) page_header_get_field(page, PAGE_DIRECTION),
+ (ulong) page_header_get_field(page, PAGE_N_DIRECTION));
}
/*******************************************************************
@@ -1296,15 +1297,15 @@ page_rec_validate(
if (!(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED)) {
fprintf(stderr,
"InnoDB: Dir slot of rec %lu, n owned too big %lu\n",
- (ulint)(rec - page), n_owned);
+ (ulong)(rec - page), (ulong) n_owned);
return(FALSE);
}
if (!(heap_no < page_header_get_field(page, PAGE_N_HEAP))) {
fprintf(stderr,
"InnoDB: Heap no of rec %lu too big %lu %lu\n",
- (ulint)(rec - page), heap_no,
- page_header_get_field(page, PAGE_N_HEAP));
+ (ulong)(rec - page), (ulong) heap_no,
+ (ulong) page_header_get_field(page, PAGE_N_HEAP));
return(FALSE);
}
@@ -1370,7 +1371,7 @@ page_simple_validate(
if (n_slots > UNIV_PAGE_SIZE / 4) {
fprintf(stderr,
- "InnoDB: Nonsensical number %lu of page dir slots\n", n_slots);
+ "InnoDB: Nonsensical number %lu of page dir slots\n", (ulong) n_slots);
goto func_exit;
}
@@ -1381,8 +1382,8 @@ page_simple_validate(
fprintf(stderr,
"InnoDB: Record heap and dir overlap on a page, heap top %lu, dir %lu\n",
- (ulint)(page_header_get_ptr(page, PAGE_HEAP_TOP) - page),
- (ulint)(page_dir_get_nth_slot(page, n_slots - 1) - page));
+ (ulong)(page_header_get_ptr(page, PAGE_HEAP_TOP) - page),
+ (ulong)(page_dir_get_nth_slot(page, n_slots - 1) - page));
goto func_exit;
}
@@ -1403,7 +1404,7 @@ page_simple_validate(
if (rec > rec_heap_top) {
fprintf(stderr,
"InnoDB: Record %lu is above rec heap top %lu\n",
- (ulint)(rec - page), (ulint)(rec_heap_top - page));
+ (ulong)(rec - page), (ulong)(rec_heap_top - page));
goto func_exit;
}
@@ -1414,8 +1415,9 @@ page_simple_validate(
fprintf(stderr,
"InnoDB: Wrong owned count %lu, %lu, rec %lu\n",
- rec_get_n_owned(rec), own_count,
- (ulint)(rec - page));
+ (ulong) rec_get_n_owned(rec),
+ (ulong) own_count,
+ (ulong)(rec - page));
goto func_exit;
}
@@ -1423,7 +1425,7 @@ page_simple_validate(
if (page_dir_slot_get_rec(slot) != rec) {
fprintf(stderr,
"InnoDB: Dir slot does not point to right rec %lu\n",
- (ulint)(rec - page));
+ (ulong)(rec - page));
goto func_exit;
}
@@ -1445,8 +1447,8 @@ page_simple_validate(
|| rec_get_next_offs(rec) >= UNIV_PAGE_SIZE) {
fprintf(stderr,
"InnoDB: Next record offset nonsensical %lu for rec %lu\n",
- rec_get_next_offs(rec),
- (ulint)(rec - page));
+ (ulong) rec_get_next_offs(rec),
+ (ulong)(rec - page));
goto func_exit;
}
@@ -1456,7 +1458,7 @@ page_simple_validate(
if (count > UNIV_PAGE_SIZE) {
fprintf(stderr,
"InnoDB: Page record list appears to be circular %lu\n",
- count);
+ (ulong) count);
goto func_exit;
}
@@ -1472,13 +1474,14 @@ page_simple_validate(
if (slot_no != n_slots - 1) {
fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n",
- slot_no, n_slots - 1);
+ (ulong) slot_no, (ulong) (n_slots - 1));
goto func_exit;
}
if (page_header_get_field(page, PAGE_N_RECS) + 2 != count + 1) {
fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n",
- page_header_get_field(page, PAGE_N_RECS) + 2, count + 1);
+ (ulong) page_header_get_field(page, PAGE_N_RECS) + 2,
+ (ulong) (count + 1));
goto func_exit;
}
@@ -1491,7 +1494,7 @@ page_simple_validate(
|| rec >= page + UNIV_PAGE_SIZE) {
fprintf(stderr,
"InnoDB: Free list record has a nonsensical offset %lu\n",
- (ulint)(rec - page));
+ (ulong)(rec - page));
goto func_exit;
}
@@ -1499,7 +1502,7 @@ page_simple_validate(
if (rec > rec_heap_top) {
fprintf(stderr,
"InnoDB: Free list record %lu is above rec heap top %lu\n",
- (ulint)(rec - page), (ulint)(rec_heap_top - page));
+ (ulong)(rec - page), (ulong)(rec_heap_top - page));
goto func_exit;
}
@@ -1509,7 +1512,7 @@ page_simple_validate(
if (count > UNIV_PAGE_SIZE) {
fprintf(stderr,
"InnoDB: Page free list appears to be circular %lu\n",
- count);
+ (ulong) count);
goto func_exit;
}
@@ -1519,7 +1522,8 @@ page_simple_validate(
if (page_header_get_field(page, PAGE_N_HEAP) != count + 1) {
fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n",
- page_header_get_field(page, PAGE_N_HEAP), count + 1);
+ (ulong) page_header_get_field(page, PAGE_N_HEAP),
+ (ulong) (count + 1));
goto func_exit;
}
@@ -1560,7 +1564,7 @@ page_validate(
if (!page_simple_validate(page)) {
fprintf(stderr,
"InnoDB: Apparent corruption in page %lu in index %s in table %s\n",
- buf_frame_get_page_no(page), index->name,
+ (ulong) buf_frame_get_page_no(page), index->name,
index->table_name);
buf_page_print(page);
@@ -1587,8 +1591,8 @@ page_validate(
page_dir_get_nth_slot(page, n_slots - 1))) {
fprintf(stderr,
"InnoDB: Record heap and dir overlap on a page in index %s, %lu, %lu\n",
- index->name, (ulint)page_header_get_ptr(page, PAGE_HEAP_TOP),
- (ulint)page_dir_get_nth_slot(page, n_slots - 1));
+ index->name, (ulong)page_header_get_ptr(page, PAGE_HEAP_TOP),
+ (ulong)page_dir_get_nth_slot(page, n_slots - 1));
goto func_exit;
}
@@ -1615,7 +1619,7 @@ page_validate(
if (!(1 == cmp_rec_rec(rec, old_rec, index))) {
fprintf(stderr,
"InnoDB: Records in wrong order on page %lu index %s table %s\n",
- buf_frame_get_page_no(page),
+ (ulong) buf_frame_get_page_no(page),
index->name,
index->table_name);
@@ -1658,7 +1662,8 @@ page_validate(
if (rec_get_n_owned(rec) != own_count) {
fprintf(stderr,
"InnoDB: Wrong owned count %lu, %lu, in index %s\n",
- rec_get_n_owned(rec), own_count,
+ (ulong) rec_get_n_owned(rec),
+ (ulong) own_count,
index->name);
goto func_exit;
@@ -1689,7 +1694,7 @@ page_validate(
|| rec_get_next_offs(rec) >= UNIV_PAGE_SIZE) {
fprintf(stderr,
"InnoDB: Next record offset wrong %lu in index %s\n",
- rec_get_next_offs(rec), index->name);
+ (ulong) rec_get_next_offs(rec), index->name);
goto func_exit;
}
@@ -1709,13 +1714,14 @@ page_validate(
if (slot_no != n_slots - 1) {
fprintf(stderr, "InnoDB: n slots wrong %lu %lu in index %s\n",
- slot_no, n_slots - 1, index->name);
+ (ulong) slot_no, (ulong) (n_slots - 1), index->name);
goto func_exit;
}
if (page_header_get_field(page, PAGE_N_RECS) + 2 != count + 1) {
fprintf(stderr, "InnoDB: n recs wrong %lu %lu in index %s\n",
- page_header_get_field(page, PAGE_N_RECS) + 2, count + 1,
+ (ulong) page_header_get_field(page, PAGE_N_RECS) + 2,
+ (ulong) (count + 1),
index->name);
goto func_exit;
@@ -1724,7 +1730,7 @@ page_validate(
if (data_size != page_get_data_size(page)) {
fprintf(stderr,
"InnoDB: Summed data size %lu, returned by func %lu\n",
- data_size, page_get_data_size(page));
+ (ulong) data_size, (ulong) page_get_data_size(page));
goto func_exit;
}
@@ -1760,8 +1766,9 @@ page_validate(
fprintf(stderr,
"InnoDB: N heap is wrong %lu %lu in index %s\n",
- page_header_get_field(page, PAGE_N_HEAP), count + 1,
- index->name);
+ (ulong) page_header_get_field(page, PAGE_N_HEAP),
+ (ulong) count + 1,
+ index->name);
goto func_exit;
}
@@ -1773,7 +1780,7 @@ func_exit:
if (ret == FALSE) {
fprintf(stderr,
"InnoDB: Apparent corruption in page %lu in index %s in table %s\n",
- buf_frame_get_page_no(page), index->name,
+ (ulong) buf_frame_get_page_no(page), index->name,
index->table_name);
buf_page_print(page);
diff --git a/innobase/pars/lexyy.c b/innobase/pars/lexyy.c
index ab723cb635c..f014200b2a6 100644
--- a/innobase/pars/lexyy.c
+++ b/innobase/pars/lexyy.c
@@ -1,7 +1,7 @@
/* A lexical scanner generated by flex */
/* Scanner skeleton version:
- * $Header: /home/daffy/u0/vern/flex/RCS/flex.skl,v 2.91 96/09/10 16:58:48 vern Exp $
+ * $Header: /home/heikki/cvsroot/ib/pars/lexyy.c,v 1.2 2003/10/30 20:27:19 heikki Exp $
*/
#define FLEX_SCANNER
diff --git a/innobase/pars/pars0opt.c b/innobase/pars/pars0opt.c
index 4faf83b47a3..9b0495a01cd 100644
--- a/innobase/pars/pars0opt.c
+++ b/innobase/pars/pars0opt.c
@@ -1235,7 +1235,8 @@ opt_print_query_plan(
printf(
"Table %s index %s; exact m. %lu, match %lu, end conds %lu\n",
plan->table->name, plan->index->name,
- plan->n_exact_match, n_fields,
- UT_LIST_GET_LEN(plan->end_conds));
+ (unsigned long) plan->n_exact_match,
+ (unsigned long) n_fields,
+ (unsigned long) UT_LIST_GET_LEN(plan->end_conds));
}
}
diff --git a/innobase/que/que0que.c b/innobase/que/que0que.c
index 279f9fc21aa..127e7f84576 100644
--- a/innobase/que/que0que.c
+++ b/innobase/que/que0que.c
@@ -483,7 +483,7 @@ que_graph_free_recursive(
if (thr->magic_n != QUE_THR_MAGIC_N) {
fprintf(stderr,
"que_thr struct appears corrupt; magic n %lu\n",
- thr->magic_n);
+ (unsigned long) thr->magic_n);
mem_analyze_corruption((byte*)thr);
ut_error;
}
@@ -595,7 +595,7 @@ que_graph_free_recursive(
default:
fprintf(stderr,
"que_node struct appears corrupt; type %lu\n",
- que_node_get_type(node));
+ (unsigned long) que_node_get_type(node));
mem_analyze_corruption((byte*)node);
ut_error;
}
@@ -983,7 +983,8 @@ que_thr_move_to_run_state_for_mysql(
{
if (thr->magic_n != QUE_THR_MAGIC_N) {
fprintf(stderr,
- "que_thr struct appears corrupt; magic n %lu\n", thr->magic_n);
+ "que_thr struct appears corrupt; magic n %lu\n",
+ (unsigned long) thr->magic_n);
mem_analyze_corruption((byte*)thr);
@@ -1019,7 +1020,8 @@ que_thr_stop_for_mysql_no_error(
if (thr->magic_n != QUE_THR_MAGIC_N) {
fprintf(stderr,
- "que_thr struct appears corrupt; magic n %lu\n", thr->magic_n);
+ "que_thr struct appears corrupt; magic n %lu\n",
+ (unsigned long) thr->magic_n);
mem_analyze_corruption((byte*)thr);
@@ -1091,7 +1093,8 @@ que_node_print_info(
str = "UNKNOWN NODE TYPE";
}
- fprintf(stderr, "Node type %lu: %s, address %p\n", type, str, node);
+ fprintf(stderr, "Node type %lu: %s, address %lx\n", (unsigned long) type, str,
+ (unsigned long) addr);
}
/**************************************************************************
@@ -1250,10 +1253,6 @@ loop:
mutex_exit(&kernel_mutex);
}
*/
- /* TRUE below denotes that the thread is allowed to own the dictionary
- mutex, though */
- ut_ad(sync_thread_levels_empty_gen(TRUE));
-
loop_count++;
if (next_thr != thr) {
diff --git a/innobase/read/read0read.c b/innobase/read/read0read.c
index 64b6d87283d..889612deef4 100644
--- a/innobase/read/read0read.c
+++ b/innobase/read/read0read.c
@@ -236,16 +236,16 @@ read_view_print(
ulint i;
fprintf(stderr, "Read view low limit trx n:o %lu %lu\n",
- ut_dulint_get_high(view->low_limit_no),
- ut_dulint_get_low(view->low_limit_no));
+ (ulong) ut_dulint_get_high(view->low_limit_no),
+ (ulong) ut_dulint_get_low(view->low_limit_no));
fprintf(stderr, "Read view up limit trx id %lu %lu\n",
- ut_dulint_get_high(view->up_limit_id),
- ut_dulint_get_low(view->up_limit_id));
+ (ulong) ut_dulint_get_high(view->up_limit_id),
+ (ulong) ut_dulint_get_low(view->up_limit_id));
fprintf(stderr, "Read view low limit trx id %lu %lu\n",
- ut_dulint_get_high(view->low_limit_id),
- ut_dulint_get_low(view->low_limit_id));
+ (ulong) ut_dulint_get_high(view->low_limit_id),
+ (ulong) ut_dulint_get_low(view->low_limit_id));
fprintf(stderr, "Read view individually stored trx ids:\n");
@@ -253,7 +253,7 @@ read_view_print(
for (i = 0; i < n_ids; i++) {
fprintf(stderr, "Read view trx id %lu %lu\n",
- ut_dulint_get_high(read_view_get_nth_trx_id(view, i)),
- ut_dulint_get_low(read_view_get_nth_trx_id(view, i)));
+ (ulong) ut_dulint_get_high(read_view_get_nth_trx_id(view, i)),
+ (ulong) ut_dulint_get_low(read_view_get_nth_trx_id(view, i)));
}
}
diff --git a/innobase/rem/rem0cmp.c b/innobase/rem/rem0cmp.c
index dea2621faf3..254ebeec8c9 100644
--- a/innobase/rem/rem0cmp.c
+++ b/innobase/rem/rem0cmp.c
@@ -61,10 +61,11 @@ must be a copy of the the one in ha_innobase.cc! */
extern
int
innobase_mysql_cmp(
-/*===============*/
+/*===============*/
/* out: 1, 0, -1, if a is greater,
equal, less than b, respectively */
- int mysql_type, /* in: MySQL type */
+ int mysql_type, /* in: MySQL type */
+ uint charset_number, /* in: number of the charset */
unsigned char* a, /* in: data field */
unsigned int a_length, /* in: data field length,
not UNIV_SQL_NULL */
@@ -97,16 +98,28 @@ cmp_types_are_equal(
dtype_t* type1, /* in: type 1 */
dtype_t* type2) /* in: type 2 */
{
- if ((type1->mtype == DATA_VARCHAR && type2->mtype == DATA_CHAR)
- || (type1->mtype == DATA_CHAR && type2->mtype == DATA_VARCHAR)
- || (type1->mtype == DATA_FIXBINARY && type2->mtype == DATA_BINARY)
- || (type1->mtype == DATA_BINARY && type2->mtype == DATA_FIXBINARY)
- || (type1->mtype == DATA_MYSQL && type2->mtype == DATA_VARMYSQL)
- || (type1->mtype == DATA_VARMYSQL && type2->mtype == DATA_MYSQL)) {
-
- return(TRUE);
+ if (dtype_is_non_binary_string_type(type1->mtype, type1->prtype)
+ && dtype_is_non_binary_string_type(type2->mtype, type2->prtype)) {
+
+ /* Both are non-binary string types: they can be compared if
+ and only if the charset-collation is the same */
+
+ if (dtype_get_charset_coll(type1->prtype)
+ == dtype_get_charset_coll(type2->prtype)) {
+ return(TRUE);
+ }
+
+ return(FALSE);
}
+ if (dtype_is_binary_string_type(type1->mtype, type1->prtype)
+ && dtype_is_binary_string_type(type2->mtype, type2->prtype)) {
+
+ /* Both are binary string types: they can be compared */
+
+ return(TRUE);
+ }
+
if (type1->mtype != type2->mtype) {
return(FALSE);
@@ -128,11 +141,6 @@ cmp_types_are_equal(
return(FALSE);
}
- if (type1->mtype == DATA_BLOB && (type1->prtype & DATA_BINARY_TYPE)
- != (type2->prtype & DATA_BINARY_TYPE)) {
- return(FALSE);
- }
-
return(TRUE);
}
@@ -269,10 +277,12 @@ cmp_whole_field(
return(innobase_mysql_cmp(
(int)(type->prtype & DATA_MYSQL_TYPE_MASK),
+ (uint)dtype_get_charset_coll(type->prtype),
a, a_length, b, b_length));
default:
fprintf(stderr,
- "InnoDB: unknown type number %lu\n", data_type);
+ "InnoDB: unknown type number %lu\n",
+ (ulong) data_type);
ut_error;
}
@@ -321,7 +331,9 @@ cmp_data_data_slow(
if (cur_type->mtype >= DATA_FLOAT
|| (cur_type->mtype == DATA_BLOB
- && (cur_type->prtype & DATA_NONLATIN1))) {
+ && 0 == (cur_type->prtype & DATA_BINARY_TYPE)
+ && dtype_get_charset_coll(cur_type->prtype) !=
+ data_mysql_latin1_swedish_charset_coll)) {
return(cmp_whole_field(cur_type, data1, len1, data2, len2));
}
@@ -522,8 +534,10 @@ cmp_dtuple_rec_with_match(
}
if (cur_type->mtype >= DATA_FLOAT
- || (cur_type->mtype == DATA_BLOB
- && (cur_type->prtype & DATA_NONLATIN1))) {
+ || (cur_type->mtype == DATA_BLOB
+ && 0 == (cur_type->prtype & DATA_BINARY_TYPE)
+ && dtype_get_charset_coll(cur_type->prtype) !=
+ data_mysql_latin1_swedish_charset_coll)) {
ret = cmp_whole_field(cur_type,
dfield_get_data(dtuple_field), dtuple_f_len,
@@ -844,8 +858,10 @@ cmp_rec_rec_with_match(
}
if (cur_type->mtype >= DATA_FLOAT
- || (cur_type->mtype == DATA_BLOB
- && (cur_type->prtype & DATA_NONLATIN1))) {
+ || (cur_type->mtype == DATA_BLOB
+ && 0 == (cur_type->prtype & DATA_BINARY_TYPE)
+ && dtype_get_charset_coll(cur_type->prtype) !=
+ data_mysql_latin1_swedish_charset_coll)) {
ret = cmp_whole_field(cur_type,
rec1_b_ptr, rec1_f_len,
diff --git a/innobase/rem/rem0rec.c b/innobase/rem/rem0rec.c
index fddc8eab761..3d0b997db85 100644
--- a/innobase/rem/rem0rec.c
+++ b/innobase/rem/rem0rec.c
@@ -107,7 +107,7 @@ rec_get_nth_field(
if (n > 1024) {
fprintf(stderr, "Error: trying to access field %lu in rec\n",
- n);
+ (ulong) n);
ut_error;
}
@@ -474,7 +474,7 @@ rec_validate(
if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) {
fprintf(stderr, "InnoDB: Error: record has %lu fields\n",
- n_fields);
+ (ulong) n_fields);
return(FALSE);
}
@@ -483,8 +483,8 @@ rec_validate(
if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) {
fprintf(stderr,
- "InnoDB: Error: record field %lu len %lu\n", i,
- len);
+ "InnoDB: Error: record field %lu len %lu\n", (ulong) i,
+ (ulong) len);
return(FALSE);
}
@@ -502,7 +502,8 @@ rec_validate(
if (len_sum != (ulint)(rec_get_end(rec) - rec)) {
fprintf(stderr,
"InnoDB: Error: record len should be %lu, len %lu\n",
- len_sum, (ulint)(rec_get_end(rec) - rec));
+ (ulong) len_sum,
+ (ulong) (rec_get_end(rec) - rec));
return(FALSE);
}
@@ -537,13 +538,13 @@ rec_print(
printf(
"PHYSICAL RECORD: n_fields %lu; 1-byte offs %s; info bits %lu\n",
- n, offs, rec_get_info_bits(rec));
+ (ulong) n, offs, (ulong) rec_get_info_bits(rec));
for (i = 0; i < n; i++) {
data = rec_get_nth_field(rec, i, &len);
- printf(" %lu:", i);
+ printf(" %lu:", (ulong) i);
if (len != UNIV_SQL_NULL) {
if (len <= 30) {
@@ -556,7 +557,7 @@ rec_print(
}
} else {
printf(" SQL NULL, size %lu ",
- rec_get_nth_field_size(rec, i));
+ (ulong) rec_get_nth_field_size(rec, i));
}
printf(";");
@@ -594,7 +595,8 @@ rec_sprintf(
return(k);
}
- k += sprintf(buf + k, "RECORD: info bits %lu", rec_get_info_bits(rec));
+ k += sprintf(buf + k, "RECORD: info bits %lu",
+ (ulong) rec_get_info_bits(rec));
for (i = 0; i < n; i++) {
@@ -605,7 +607,7 @@ rec_sprintf(
data = rec_get_nth_field(rec, i, &len);
- k += sprintf(buf + k, " %lu:", i);
+ k += sprintf(buf + k, " %lu:", (ulong) i);
if (len != UNIV_SQL_NULL) {
if (k + 30 + 5 * len > buf_len) {
diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c
index 84968ea4e20..fc1f7a19d53 100644
--- a/innobase/row/row0ins.c
+++ b/innobase/row/row0ins.c
@@ -1683,6 +1683,7 @@ row_ins_index_entry_low(
ulint modify = 0; /* remove warning */
rec_t* insert_rec;
rec_t* rec;
+ rec_t* first_rec;
ulint err;
ulint n_unique;
big_rec_t* big_rec = NULL;
@@ -1715,6 +1716,14 @@ row_ins_index_entry_low(
goto function_exit;
}
+ first_rec = page_rec_get_next(page_get_infimum_rec(
+ buf_frame_align(btr_cur_get_rec(&cursor))));
+
+ if (!page_rec_is_supremum(first_rec)) {
+ ut_a((rec_get_n_fields(first_rec))
+ == dtuple_get_n_fields(entry));
+ }
+
n_unique = dict_index_get_n_unique(index);
if (index->type & DICT_UNIQUE && (cursor.up_match >= n_unique
diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c
index ab73dc2ad6d..efcca623a76 100644
--- a/innobase/row/row0mysql.c
+++ b/innobase/row/row0mysql.c
@@ -22,12 +22,15 @@ Created 9/17/2000 Heikki Tuuri
#include "dict0dict.h"
#include "dict0crea.h"
#include "dict0load.h"
+#include "dict0boot.h"
#include "trx0roll.h"
#include "trx0purge.h"
#include "lock0lock.h"
#include "rem0cmp.h"
#include "log0log.h"
#include "btr0sea.h"
+#include "fil0fil.h"
+#include "ibuf0ibuf.h"
/* A dummy variable used to fool the compiler */
ibool row_mysql_identically_false = FALSE;
@@ -59,6 +62,19 @@ row_mysql_read_var_ref_noninline(
}
/***********************************************************************
+Frees the blob heap in prebuilt when no longer needed and clears the pointer. */
+
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct of a
+ ha_innobase:: table handle; its blob_heap is freed and set to NULL */
+{
+ mem_heap_free(prebuilt->blob_heap); /* no NULL check: presumably callers pass a live heap (TODO confirm) */
+ prebuilt->blob_heap = NULL; /* do not keep a pointer to the freed heap */
+}
+
+/***********************************************************************
Stores a reference to a BLOB in the MySQL format. */
void
@@ -269,7 +285,8 @@ handle_new_error:
"InnoDB: http://www.innodb.com/ibman.html for help.\n");
} else {
- fprintf(stderr, "InnoDB: unknown error code %lu\n", err);
+ fprintf(stderr, "InnoDB: unknown error code %lu\n",
+ (ulong) err);
ut_error;
}
@@ -379,7 +396,9 @@ row_prebuilt_free(
fprintf(stderr,
"InnoDB: Error: trying to free a corrupt\n"
"InnoDB: table handle. Magic n %lu, magic n2 %lu, table name %s\n",
- prebuilt->magic_n, prebuilt->magic_n2, prebuilt->table->name);
+ (ulong) prebuilt->magic_n,
+ (ulong) prebuilt->magic_n2,
+ prebuilt->table->name);
mem_analyze_corruption((byte*)prebuilt);
@@ -459,7 +478,7 @@ row_update_prebuilt_trx(
fprintf(stderr,
"InnoDB: Error: trying to use a corrupt\n"
"InnoDB: trx handle. Magic n %lu\n",
- trx->magic_n);
+ (ulong) trx->magic_n);
mem_analyze_corruption((byte*)trx);
@@ -470,7 +489,7 @@ row_update_prebuilt_trx(
fprintf(stderr,
"InnoDB: Error: trying to use a corrupt\n"
"InnoDB: table handle. Magic n %lu, table name %s\n",
- prebuilt->magic_n, prebuilt->table->name);
+ (ulong) prebuilt->magic_n, prebuilt->table->name);
mem_analyze_corruption((byte*)prebuilt);
@@ -697,7 +716,7 @@ row_insert_for_mysql(
fprintf(stderr,
"InnoDB: Error: trying to free a corrupt\n"
"InnoDB: table handle. Magic n %lu, table name %s\n",
- prebuilt->magic_n, prebuilt->table->name);
+ (ulong) prebuilt->magic_n, prebuilt->table->name);
mem_analyze_corruption((byte*)prebuilt);
@@ -913,7 +932,7 @@ row_update_for_mysql(
fprintf(stderr,
"InnoDB: Error: trying to free a corrupt\n"
"InnoDB: table handle. Magic n %lu, table name %s\n",
- prebuilt->magic_n, prebuilt->table->name);
+ (ulong) prebuilt->magic_n, prebuilt->table->name);
mem_analyze_corruption((byte*)prebuilt);
@@ -1148,7 +1167,9 @@ row_mysql_recover_tmp_table(
trx_t* trx) /* in: transaction handle */
{
char* ptr;
- char old_name[1000];
+ char old_name[OS_FILE_MAX_PATH];
+
+ ut_a(ut_strlen(table->name) + 10 < OS_FILE_MAX_PATH);
ut_memcpy(old_name, table->name, ut_strlen(table->name) + 1);
@@ -1217,7 +1238,8 @@ row_mysql_lock_data_dictionary(
/*===========================*/
trx_t* trx) /* in: transaction */
{
- ut_a(trx->dict_operation_lock_mode == 0);
+ ut_a(trx->dict_operation_lock_mode == 0
+ || trx->dict_operation_lock_mode == RW_X_LATCH);
/* Serialize data dictionary operations with dictionary mutex:
no deadlocks or lock waits can occur then in these operations */
@@ -1421,9 +1443,8 @@ row_create_table_for_mysql(
"InnoDB: Warning: cannot create table %s because tablespace full\n",
table->name);
row_drop_table_for_mysql(table->name, trx);
- } else {
- ut_a(err == DB_DUPLICATE_KEY);
+ } else if (err == DB_DUPLICATE_KEY) {
ut_print_timestamp(stderr);
fprintf(stderr,
@@ -1439,9 +1460,12 @@ row_create_table_for_mysql(
"InnoDB: database and moving the .frm file to the current database.\n"
"InnoDB: Then MySQL thinks the table exists, and DROP TABLE will\n"
"InnoDB: succeed.\n"
- "InnoDB: You can look further help from section 15.1 of\n"
+ "InnoDB: You can look for further help from section 15.1 of\n"
"InnoDB: http://www.innodb.com/ibman.html\n");
}
+
+ /* We may also get err == DB_ERROR if the .ibd file for the
+ table already exists */
trx->error_state = DB_SUCCESS;
}
@@ -1482,7 +1506,7 @@ row_create_index_for_mysql(
trx->op_info = (char *) "creating index";
/* Check that the same column does not appear twice in the index.
- Starting from 4.0.14 InnoDB should be able to cope with that, but
+ Starting from 4.0.14, InnoDB should be able to cope with that, but
safer not to allow them. */
for (i = 0; i < dict_index_get_n_fields(index); i++) {
@@ -1524,6 +1548,9 @@ row_create_index_for_mysql(
trx->dict_operation = TRUE;
+ /* Note that the space id where we store the index is inherited from
+ the table in dict_build_index_def_step() in dict0crea.c. */
+
node = ind_create_graph_create(index, heap);
thr = pars_complete_graph_for_exec(node, trx, heap);
@@ -1536,7 +1563,6 @@ row_create_index_for_mysql(
que_graph_free((que_t*) que_node_get_parent(thr));
error_handling:
-
if (err != DB_SUCCESS) {
/* We have special error handling here */
@@ -1801,6 +1827,261 @@ row_add_table_to_background_drop_list(
}
/*************************************************************************
+Discards the tablespace of a table which is stored in an .ibd file. Discarding
+means that this function deletes the .ibd file and assigns a new table id for
+the table. Also table->tablespace_discarded and table->ibd_file_missing are set TRUE.
+
+How do we prevent crashes caused by ongoing operations on the table? Old
+operations could try to access non-existent pages.
+
+1) SQL queries, INSERT, SELECT, ...: we must get an exclusive MySQL table lock
+on the table before we can do DISCARD TABLESPACE. Then there are no running
+queries on the table.
+2) Purge and rollback: we assign a new table id for the table. Since purge and
+rollback look for the table based on the table id, they see the table as
+'dropped' and discard their operations.
+3) Insert buffer: we remove all entries for the tablespace in the insert
+buffer tree; as long as the tablespace mem object does not exist, ongoing
+insert buffer page merges are discarded in buf0rea.c. If we recreate the
+tablespace mem object with IMPORT TABLESPACE later, then the tablespace will
+have the same id, but the tablespace_version field in the mem object is
+different, and ongoing old insert buffer page merges get discarded.
+4) Linear readahead and random readahead: we use the same method as in 3) to
+discard ongoing operations. */
+
+int
+row_discard_tablespace_for_mysql(
+/*=============================*/
+ /* out: DB_SUCCESS, DB_TABLE_NOT_FOUND, DB_ERROR, or the error state of trx */
+ char* name, /* in: table name */
+ trx_t* trx) /* in: transaction handle */
+{
+ dulint new_id; /* new table id that replaces the old one */
+ dict_table_t* table;
+ que_thr_t* thr;
+ que_t* graph = NULL; /* parsed procedure; freed at funct_exit if non-NULL */
+ ibool success;
+ ulint err;
+ char buf[2 * OS_FILE_MAX_PATH]; /* text of the procedure built with sprintf below */
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ trx->op_info = (char *) "discarding tablespace";
+ trx_start_if_not_started(trx);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ table = dict_table_get_low(name);
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+
+ goto funct_exit;
+ }
+
+ if (table->space == 0) { /* space 0 is the system tablespace: refuse */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Error: table %s\n"
+"InnoDB: is in the system tablespace 0 which cannot be discarded\n", name);
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ new_id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID);
+
+ sprintf(buf, /* NOTE(review): unbounded write; the length is only asserted after it */
+ "PROCEDURE DISCARD_TABLESPACE_PROC () IS\n"
+ "old_id CHAR;\n"
+ "new_id CHAR;\n"
+ "new_id_low INT;\n"
+ "new_id_high INT;\n"
+ "table_name CHAR;\n"
+ "BEGIN\n"
+ "table_name :='%s';\n"
+ "new_id_high := %lu;\n"
+ "new_id_low := %lu;\n"
+ "new_id := CONCAT(TO_BINARY(new_id_high, 4), TO_BINARY(new_id_low, 4));\n"
+ "SELECT ID INTO old_id\n"
+ "FROM SYS_TABLES\n"
+ "WHERE NAME = table_name;\n"
+ "IF (SQL %% NOTFOUND) THEN\n"
+ " COMMIT WORK;\n"
+ " RETURN;\n"
+ "END IF;\n"
+ "UPDATE SYS_TABLES SET ID = new_id\n"
+ "WHERE ID = old_id;\n"
+ "UPDATE SYS_COLUMNS SET TABLE_ID = new_id\n"
+ "WHERE TABLE_ID = old_id;\n"
+ "UPDATE SYS_INDEXES SET TABLE_ID = new_id\n"
+ "WHERE TABLE_ID = old_id;\n"
+ "COMMIT WORK;\n"
+ "END;\n", name, (ulong) ut_dulint_get_high(new_id),
+ (ulong) ut_dulint_get_low(new_id));
+
+ ut_a(strlen(buf) < 2 * OS_FILE_MAX_PATH); /* checked only after buf was written */
+
+ graph = pars_sql(buf);
+
+ ut_a(graph);
+
+ graph->trx = trx;
+ trx->graph = NULL;
+
+ graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+ ut_a(thr = que_fork_start_command(graph, SESS_COMM_EXECUTE, 0)); /* assignment inside the assertion is intentional */
+
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) { /* the procedure failed: roll back, err keeps the error */
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ trx->error_state = DB_SUCCESS;
+ } else {
+ dict_table_change_id_in_cache(table, new_id); /* NOTE(review): not visibly undone if the discard below fails - confirm */
+
+ success = fil_discard_tablespace(table->space);
+
+ if (!success) { /* discard failed: roll back and report DB_ERROR */
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ trx->error_state = DB_SUCCESS;
+
+ err = DB_ERROR;
+ } else {
+ /* Set the flag which tells that now it is legal to
+ IMPORT a tablespace for this table */
+ table->tablespace_discarded = TRUE;
+ table->ibd_file_missing = TRUE;
+ }
+ }
+funct_exit: /* common exit for success and error paths; err holds the result */
+ row_mysql_unlock_data_dictionary(trx);
+
+ if (graph) { /* NULL when we jumped here before pars_sql() */
+ que_graph_free(graph);
+ }
+
+ trx_commit_for_mysql(trx);
+
+ trx->op_info = (char *) "";
+
+ return((int) err);
+}
+
+/*********************************************************************
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary. */
+
+int
+row_import_tablespace_for_mysql(
+/*============================*/
+ /* out: DB_SUCCESS, DB_TABLE_NOT_FOUND or DB_ERROR */
+ char* name, /* in: table name */
+ trx_t* trx) /* in: transaction handle */
+{
+ dict_table_t* table;
+ ibool success;
+ dulint current_lsn; /* system lsn read before the .ibd file is checked */
+ ulint err = DB_SUCCESS;
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ trx_start_if_not_started(trx);
+
+ trx->op_info = (char*) "importing tablespace";
+
+ current_lsn = log_get_lsn();
+
+ /* It is possible, though very improbable, that the lsn's in the
+ tablespace to be imported have risen above the current system lsn, if
+ a lengthy purge, ibuf merge, or rollback was performed on a backup
+ taken with ibbackup. If that is the case, reset page lsn's in the
+ file. We assume that mysqld was shut down after it performed these
+ cleanup operations on the .ibd file, so that it stamped the latest lsn
+ to the FIL_PAGE_FILE_FLUSH_LSN in the first page of the .ibd file.
+
+ TODO: reset also the trx id's in clustered index records and write
+ a new space id to each data page. That would allow us to import clean
+ .ibd files from another MySQL installation. */
+
+ success = fil_reset_too_high_lsns(name, current_lsn);
+
+ if (!success) {
+ err = DB_ERROR;
+
+ row_mysql_lock_data_dictionary(trx); /* funct_exit unlocks, so the lock must be held first */
+
+ goto funct_exit;
+ }
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ table = dict_table_get_low(name);
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+
+ goto funct_exit;
+ }
+
+ if (table->space == 0) { /* space 0 is the system tablespace: refuse */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Error: table %s\n"
+"InnoDB: is in the system tablespace 0 which cannot be imported\n", name);
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ if (!table->tablespace_discarded) { /* flag is set by row_discard_tablespace_for_mysql() */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Error: you are trying to IMPORT a tablespace\n"
+"InnoDB: %s, though you have not called DISCARD on it yet\n"
+"InnoDB: during the lifetime of the mysqld process!\n", name);
+
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ /* Play safe and remove all insert buffer entries, though we should
+ have removed them already when DISCARD TABLESPACE was called */
+
+ ibuf_delete_for_discarded_space(table->space);
+
+ success = fil_open_single_table_tablespace(table->space, table->name);
+
+ if (success) { /* the .ibd file is attached again: clear both discard flags */
+ table->ibd_file_missing = FALSE;
+ table->tablespace_discarded = FALSE;
+ } else {
+ err = DB_ERROR;
+ }
+
+funct_exit: /* common exit; the dictionary lock is held on every path reaching here */
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx_commit_for_mysql(trx);
+
+ trx->op_info = (char *) "";
+
+ return((int) err);
+}
+
+/*************************************************************************
Drops a table for MySQL. If the name of the dropped table ends to
characters INNODB_MONITOR, then this also stops printing of monitor
output by the master thread. */
@@ -1808,12 +2089,13 @@ output by the master thread. */
int
row_drop_table_for_mysql(
/*=====================*/
- /* out: error code or DB_SUCCESS */
- char* name, /* in: table name */
- trx_t* trx) /* in: transaction handle */
+ /* out: error code or DB_SUCCESS */
+ char* name, /* in: table name */
+ trx_t* trx) /* in: transaction handle */
{
dict_foreign_t* foreign;
dict_table_t* table;
+ ulint space_id;
que_thr_t* thr;
que_t* graph;
ulint err;
@@ -1822,8 +2104,9 @@ row_drop_table_for_mysql(
ulint len;
ulint namelen;
ulint keywordlen;
+ ibool success;
ibool locked_dictionary = FALSE;
- char buf[10000];
+ char buf[OS_FILE_MAX_PATH + 2000];
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
ut_a(name != NULL);
@@ -1964,6 +2247,8 @@ row_drop_table_for_mysql(
ut_memcpy(buf + len, str2, ut_strlen(str2) + 1);
+ ut_a(strlen(buf) < OS_FILE_MAX_PATH + 2000);
+
/* Serialize data dictionary operations with dictionary mutex:
no deadlocks can occur then in these operations */
@@ -1997,11 +2282,12 @@ row_drop_table_for_mysql(
ut_print_timestamp(stderr);
fprintf(stderr,
- " InnoDB: Error: table %s does not exist in the InnoDB internal\n"
+ " InnoDB: Error: table %s\n"
+ "InnoDB: does not exist in the InnoDB internal\n"
"InnoDB: data dictionary though MySQL is trying to drop it.\n"
"InnoDB: Have you copied the .frm file of the table to the\n"
"InnoDB: MySQL database directory from another database?\n"
- "InnoDB: You can look further help from section 15.1 of\n"
+ "InnoDB: You can look for further help from section 15.1 of\n"
"InnoDB: http://www.innodb.com/ibman.html\n",
name);
goto funct_exit;
@@ -2093,13 +2379,37 @@ row_drop_table_for_mysql(
ut_error;
} else {
+ space_id = table->space;
dict_table_remove_from_cache(table);
if (dict_load_table(name) != NULL) {
ut_print_timestamp(stderr);
fprintf(stderr,
-" InnoDB: Error: dropping of table %s failed!\n", name);
+" InnoDB: Error: not able to remove table %s from the dictionary cache!\n",
+ name);
+ err = DB_ERROR;
+ }
+
+ /* Do not drop possible .ibd tablespace if something went
+ wrong: we do not want to delete valuable data of the user */
+
+ if (err == DB_SUCCESS && space_id > 0) {
+ if (!fil_space_for_table_exists_in_mem(space_id, name,
+ FALSE, TRUE)) {
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ success = fil_delete_tablespace(space_id);
+ if (!success) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Error: not able to delete tablespace %lu of table %s!\n",
+ (ulong) space_id, name);
+ err = DB_ERROR;
+ }
}
}
funct_exit:
@@ -2176,7 +2486,7 @@ loop:
if (err != DB_SUCCESS) {
fprintf(stderr,
"InnoDB: DROP DATABASE %s failed with error %lu for table %s\n",
- name, (ulint)err, table_name);
+ name, (ulong) err, table_name);
break;
}
}
@@ -2233,13 +2543,14 @@ row_rename_table_for_mysql(
mem_heap_t* heap = NULL;
char** constraints_to_drop = NULL;
ulint n_constraints_to_drop = 0;
- ibool recovering_temp_table = FALSE;
- ulint namelen;
- ulint keywordlen;
+ ibool recovering_temp_table = FALSE;
+ ulint namelen;
+ ulint keywordlen;
ulint len;
ulint i;
char* db_name;
- char buf[10000];
+ ibool success;
+ char buf[2 * OS_FILE_MAX_PATH];
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
ut_a(old_name != NULL);
@@ -2273,21 +2584,21 @@ row_rename_table_for_mysql(
trx->op_info = (char *) "renaming table";
trx_start_if_not_started(trx);
- namelen = ut_strlen(new_name);
+ namelen = ut_strlen(new_name);
- keywordlen = ut_strlen("_recover_innodb_tmp_table");
+ keywordlen = ut_strlen("_recover_innodb_tmp_table");
- if (namelen >= keywordlen
- && 0 == ut_memcmp(new_name + namelen - keywordlen,
- (char*)"_recover_innodb_tmp_table", keywordlen)) {
+ if (namelen >= keywordlen
+ && 0 == ut_memcmp(new_name + namelen - keywordlen,
+ (char*)"_recover_innodb_tmp_table", keywordlen)) {
- recovering_temp_table = TRUE;
- }
+ recovering_temp_table = TRUE;
+ }
/* Serialize data dictionary operations with dictionary mutex:
no deadlocks can occur then in these operations */
- if (!recovering_temp_table) {
+ if (!recovering_temp_table) {
row_mysql_lock_data_dictionary(trx);
}
@@ -2295,7 +2606,30 @@ row_rename_table_for_mysql(
if (!table) {
err = DB_TABLE_NOT_FOUND;
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: table %s\n"
+ "InnoDB: does not exist in the InnoDB internal\n"
+ "InnoDB: data dictionary though MySQL is trying to rename the table.\n"
+ "InnoDB: Have you copied the .frm file of the table to the\n"
+ "InnoDB: MySQL database directory from another database?\n"
+ "InnoDB: You can look for further help from section 15.1 of\n"
+ "InnoDB: http://www.innodb.com/ibman.html\n",
+ old_name);
+ goto funct_exit;
+ }
+
+ if (table->ibd_file_missing) {
+ err = DB_TABLE_NOT_FOUND;
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: table %s\n"
+ "InnoDB: does not have an .ibd file in the database directory.\n"
+ "InnoDB: You can look for further help from section 15.1 of\n"
+ "InnoDB: http://www.innodb.com/ibman.html\n",
+ old_name);
goto funct_exit;
}
@@ -2449,6 +2783,8 @@ row_rename_table_for_mysql(
ut_memcpy(buf + len, str3, ut_strlen(str3) + 1);
+ ut_a(strlen(buf) < 2 * OS_FILE_MAX_PATH);
+
graph = pars_sql(buf);
ut_a(graph);
@@ -2467,20 +2803,17 @@ row_rename_table_for_mysql(
if (err != DB_SUCCESS) {
if (err == DB_DUPLICATE_KEY) {
ut_print_timestamp(stderr);
-
fprintf(stderr,
" InnoDB: Error: table %s exists in the InnoDB internal data\n"
"InnoDB: dictionary though MySQL is trying rename table %s to it.\n"
"InnoDB: Have you deleted the .frm file and not used DROP TABLE?\n"
- "InnoDB: You can look further help from section 15.1 of\n"
+ "InnoDB: You can look for further help from section 15.1 of\n"
"InnoDB: http://www.innodb.com/ibman.html\n",
new_name, old_name);
-
fprintf(stderr,
"InnoDB: If table %s is a temporary table #sql..., then it can be that\n"
"InnoDB: there are still queries running on the table, and it will be\n"
"InnoDB: dropped automatically when the queries end.\n", new_name);
-
fprintf(stderr,
"InnoDB: You can drop the orphaned table inside InnoDB by\n"
"InnoDB: creating an InnoDB table with the same name in another\n"
@@ -2488,13 +2821,27 @@ row_rename_table_for_mysql(
"InnoDB: Then MySQL thinks the table exists, and DROP TABLE will\n"
"InnoDB: succeed.\n");
}
-
trx->error_state = DB_SUCCESS;
trx_general_rollback_for_mysql(trx, FALSE, NULL);
trx->error_state = DB_SUCCESS;
} else {
- ut_a(dict_table_rename_in_cache(table, new_name,
- !row_is_mysql_tmp_table_name(new_name)));
+ /* The following call will also rename the .ibd data file if
+ the table is stored in a single-table tablespace */
+
+ success = dict_table_rename_in_cache(table, new_name,
+ !row_is_mysql_tmp_table_name(new_name));
+ if (!success) {
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ trx->error_state = DB_SUCCESS;
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Error in table rename, cannot rename %s to %s\n", old_name,
+ new_name);
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
if (row_is_mysql_tmp_table_name(old_name)) {
@@ -2508,18 +2855,14 @@ row_rename_table_for_mysql(
err = dict_load_foreigns(new_name);
if (err != DB_SUCCESS) {
-
ut_print_timestamp(stderr);
-
fprintf(stderr,
" InnoDB: Error: in ALTER TABLE table %s\n"
"InnoDB: has or is referenced in foreign key constraints\n"
"InnoDB: which are not compatible with the new table definition.\n",
new_name);
-
ut_a(dict_table_rename_in_cache(table,
old_name, FALSE));
-
trx->error_state = DB_SUCCESS;
trx_general_rollback_for_mysql(trx, FALSE,
NULL);
@@ -2527,8 +2870,8 @@ row_rename_table_for_mysql(
}
}
}
-funct_exit:
- if (!recovering_temp_table) {
+funct_exit:
+ if (!recovering_temp_table) {
row_mysql_unlock_data_dictionary(trx);
}
@@ -2687,7 +3030,7 @@ row_check_table_for_mysql(
ulint n_rows_in_table = ULINT_UNDEFINED;
ulint ret = DB_SUCCESS;
ulint old_isolation_level;
-
+
prebuilt->trx->op_info = (char *) "checking table";
old_isolation_level = prebuilt->trx->isolation_level;
@@ -2723,7 +3066,8 @@ row_check_table_for_mysql(
fprintf(stderr,
"Error: index %s contains %lu entries, should be %lu\n",
- index->name, n_rows, n_rows_in_table);
+ index->name, (ulong) n_rows,
+ (ulong) n_rows_in_table);
}
}
diff --git a/innobase/row/row0purge.c b/innobase/row/row0purge.c
index bd3742ad589..a409b64f8e4 100644
--- a/innobase/row/row0purge.c
+++ b/innobase/row/row0purge.c
@@ -531,6 +531,16 @@ row_purge_parse_undo_rec(
return(FALSE);
}
+ if (node->table->ibd_file_missing) {
+ /* We skip purge of missing .ibd files */
+
+ node->table = NULL;
+
+ row_mysql_unfreeze_data_dictionary(trx);
+
+ return(FALSE);
+ }
+
clust_index = dict_table_get_first_index(node->table);
if (clust_index == NULL) {
diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c
index 6ae4f791205..4f70cea2058 100644
--- a/innobase/row/row0sel.c
+++ b/innobase/row/row0sel.c
@@ -1756,7 +1756,7 @@ row_sel_step(
return(NULL);
} else {
/* SQL error detected */
- printf("SQL error %lu\n", err);
+ printf("SQL error %lu\n", (ulong) err);
que_thr_handle_error(thr, DB_ERROR, NULL, 0);
@@ -1806,7 +1806,7 @@ fetch_step(
if (sel_node->state == SEL_NODE_CLOSED) {
/* SQL error detected */
- printf("SQL error %lu\n", (ulint)DB_ERROR);
+ printf("SQL error %lu\n", (ulong) DB_ERROR);
que_thr_handle_error(thr, DB_ERROR, NULL, 0);
@@ -1903,6 +1903,7 @@ row_sel_convert_mysql_key_to_innobase(
ulint key_len) /* in: MySQL key value length */
{
byte* original_buf = buf;
+ byte* original_key_ptr = key_ptr;
dict_field_t* field;
dfield_t* dfield;
ulint data_offset;
@@ -2026,7 +2027,16 @@ row_sel_convert_mysql_key_to_innobase(
ut_print_timestamp(stderr);
fprintf(stderr,
- " InnoDB: Warning: using a partial-field key prefix in search\n");
+ " InnoDB: Warning: using a partial-field key prefix in search.\n"
+ "InnoDB: Table name %s, index name %s. Last data field length %lu bytes,\n"
+ "InnoDB: key ptr now exceeds key end by %lu bytes.\n"
+ "InnoDB: Key value in the MySQL format:\n", index->table_name, index->name,
+ (ulong) data_field_len,
+ (ulong) (key_ptr - key_end));
+ fflush(stderr);
+ ut_print_buf(original_key_ptr, key_len);
+ fflush(stdout);
+ fprintf(stderr, "\n");
if (!is_null) {
dfield->len -= (ulint)(key_ptr - key_end);
@@ -2069,8 +2079,8 @@ row_sel_store_row_id_to_prebuilt(
fprintf(stderr,
"InnoDB: Error: Row id field is wrong length %lu in table %s index %s\n"
"InnoDB: Field number %lu, record:\n%s\n",
- len, index->table_name, index->name,
- dict_index_get_sys_col_pos(index, DATA_ROW_ID),
+ (ulong) len, index->table_name, index->name,
+ (ulong) dict_index_get_sys_col_pos(index, DATA_ROW_ID),
err_buf);
ut_error;
}
@@ -2150,9 +2160,13 @@ Note that the template in prebuilt may advise us to copy only a few
columns to mysql_rec, other columns are left blank. All columns may not
be needed in the query. */
static
-void
+ibool
row_sel_store_mysql_rec(
/*====================*/
+ /* out: TRUE if success, FALSE if
+ could not allocate memory for a BLOB
+ (though we may also assert in that
+ case) */
byte* mysql_rec, /* out: row in the MySQL format */
row_prebuilt_t* prebuilt, /* in: prebuilt struct */
rec_t* rec) /* in: Innobase record in the index
@@ -2164,6 +2178,7 @@ row_sel_store_mysql_rec(
byte* data;
ulint len;
byte* blob_buf;
+ int pad_char;
ulint i;
ut_ad(prebuilt->mysql_template);
@@ -2173,9 +2188,10 @@ row_sel_store_mysql_rec(
prebuilt->blob_heap = NULL;
}
- /* Mark all columns as not SQL NULL */
+ /* MySQL assumes that all columns have the SQL NULL bit set unless it
+ is a nullable column with a non-NULL value */
- memset(mysql_rec, '\0', prebuilt->null_bitmap_len);
+ memset(mysql_rec, 0xFF, prebuilt->null_bitmap_len);
for (i = 0; i < prebuilt->n_template; i++) {
@@ -2192,6 +2208,10 @@ row_sel_store_mysql_rec(
extern_field_heap = mem_heap_create(UNIV_PAGE_SIZE);
+ /* NOTE: if we are retrieving a big BLOB, we may
+ already run out of memory in the next call, which
+ causes an assert */
+
data = btr_rec_copy_externally_stored_field(rec,
templ->rec_field_no, &len,
extern_field_heap);
@@ -2203,9 +2223,29 @@ row_sel_store_mysql_rec(
if (templ->type == DATA_BLOB) {
ut_a(prebuilt->templ_contains_blob);
-
- /* Copy the BLOB data to the BLOB
- heap of prebuilt */
+
+ /* A heuristic test that we can allocate the
+ memory for a big BLOB. We have a safety margin
+ of 1000000 bytes. Since the test takes some
+ CPU time, we do not use it for small BLOBs. */
+
+ if (len > 2000000
+ && !ut_test_malloc(len + 1000000)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Warning: could not allocate %lu + 1000000 bytes to retrieve\n"
+"InnoDB: a big column. Table name %s\n", (ulong) len, prebuilt->table->name);
+
+ if (extern_field_heap) {
+ mem_heap_free(
+ extern_field_heap);
+ }
+ return(FALSE);
+ }
+
+ /* Copy the BLOB data to the BLOB heap of
+ prebuilt */
if (prebuilt->blob_heap == NULL) {
prebuilt->blob_heap =
@@ -2223,31 +2263,46 @@ row_sel_store_mysql_rec(
mysql_rec + templ->mysql_col_offset,
templ->mysql_col_len, data, len,
templ->type, templ->is_unsigned);
-
+
+ /* Cleanup */
if (extern_field_heap) {
mem_heap_free(extern_field_heap);
extern_field_heap = NULL;
}
+
+ if (templ->mysql_null_bit_mask) {
+ /* It is a nullable column with a non-NULL
+ value */
+ mysql_rec[templ->mysql_null_byte_offset] &=
+ ~(byte) (templ->mysql_null_bit_mask);
+ }
} else {
/* MySQL seems to assume the field for an SQL NULL
- value is set to zero. Not taking this into account
- caused seg faults with NULL BLOB fields, and
+ value is set to zero or space. Not taking this into
+ account caused seg faults with NULL BLOB fields, and
bug number 154 in the MySQL bug database: GROUP BY
and DISTINCT could treat NULL values inequal. */
- memset(mysql_rec + templ->mysql_col_offset, '\0',
- templ->mysql_col_len);
-
- if (!templ->mysql_null_bit_mask) {
- fprintf(stderr,
-"InnoDB: Error: trying to return an SQL NULL field in a non-null\n"
-"innoDB: column! Table name %s\n", prebuilt->table->name);
+ if (templ->type == DATA_VARCHAR
+ || templ->type == DATA_CHAR
+ || templ->type == DATA_BINARY
+ || templ->type == DATA_FIXBINARY
+ || templ->type == DATA_MYSQL
+ || templ->type == DATA_VARMYSQL) {
+ /* MySQL pads all non-BLOB and non-TEXT
+ string types with space ' ' */
+
+ pad_char = ' ';
} else {
- mysql_rec[templ->mysql_null_byte_offset] |=
- (byte) (templ->mysql_null_bit_mask);
+ pad_char = '\0';
}
+
+ memset(mysql_rec + templ->mysql_col_offset, pad_char,
+ templ->mysql_col_len);
}
}
+
+ return(TRUE);
}
/*************************************************************************
@@ -2572,9 +2627,9 @@ row_sel_push_cache_row_for_mysql(
ut_ad(prebuilt->fetch_cache_first == 0);
- row_sel_store_mysql_rec(
+ ut_a(row_sel_store_mysql_rec(
prebuilt->fetch_cache[prebuilt->n_fetch_cached],
- prebuilt, rec);
+ prebuilt, rec));
prebuilt->n_fetch_cached++;
}
@@ -2656,7 +2711,8 @@ row_search_for_mysql(
/*=================*/
/* out: DB_SUCCESS,
DB_RECORD_NOT_FOUND,
- DB_END_OF_INDEX, or DB_DEADLOCK */
+ DB_END_OF_INDEX, DB_DEADLOCK,
+ or DB_TOO_BIG_RECORD */
byte* buf, /* in/out: buffer for the fetched
row in the MySQL format */
ulint mode, /* in: search mode PAGE_CUR_L, ... */
@@ -2712,13 +2768,25 @@ row_search_for_mysql(
fprintf(stderr,
"InnoDB: Error: trying to free a corrupt\n"
"InnoDB: table handle. Magic n %lu, table name %s\n",
- prebuilt->magic_n, prebuilt->table->name);
+ (ulong) prebuilt->magic_n, prebuilt->table->name);
mem_analyze_corruption((byte*)prebuilt);
ut_error;
}
+ if (trx->n_mysql_tables_in_use == 0) {
+ char err_buf[1000];
+
+ trx_print(err_buf, trx);
+
+ fprintf(stderr,
+"InnoDB: Error: MySQL is trying to perform a SELECT\n"
+"InnoDB: but it has not locked any tables in ::external_lock()!\n%s\n",
+ err_buf);
+ ut_a(0);
+ }
+
/* printf("Match mode %lu\n search tuple ", match_mode);
dtuple_print(search_tuple);
@@ -2836,10 +2904,10 @@ row_search_for_mysql(
retrieve also a second row if a primary key contains more than
1 column. Return immediately if this is not a HANDLER
command. */
-
+
if (direction != 0 && !prebuilt->used_in_HANDLER) {
-
- trx->op_info = (char*)"";
+
+ trx->op_info = (char *) "";
return(DB_RECORD_NOT_FOUND);
}
}
@@ -2895,7 +2963,14 @@ row_search_for_mysql(
#ifdef UNIV_SEARCH_DEBUG
ut_a(0 == cmp_dtuple_rec(search_tuple, rec));
#endif
- row_sel_store_mysql_rec(buf, prebuilt, rec);
+ if (!row_sel_store_mysql_rec(buf, prebuilt,
+ rec)) {
+ err = DB_TOO_BIG_RECORD;
+
+ /* We let the main loop do the
+ error handling */
+ goto shortcut_fails_too_big_rec;
+ }
mtr_commit(&mtr);
@@ -2942,7 +3017,7 @@ row_search_for_mysql(
return(DB_RECORD_NOT_FOUND);
}
-
+shortcut_fails_too_big_rec:
mtr_commit(&mtr);
mtr_start(&mtr);
}
@@ -3018,6 +3093,18 @@ row_search_for_mysql(
if (!prebuilt->sql_stat_start) {
/* No need to set an intention lock or assign a read view */
+ if (trx->read_view == NULL
+ && prebuilt->select_lock_type == LOCK_NONE) {
+ char err_buf[1000];
+
+ trx_print(err_buf, trx);
+
+ fprintf(stderr,
+"InnoDB: Error: MySQL is trying to perform a consistent read\n"
+"InnoDB: but the read view is not assigned!\n%s\n", err_buf);
+
+ ut_a(0);
+ }
} else if (prebuilt->select_lock_type == LOCK_NONE) {
/* This is a consistent read */
/* Assign a read view for the query */
@@ -3093,8 +3180,9 @@ rec_loop:
" InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n"
"InnoDB: index %s, table %s. Run CHECK TABLE to table. You may need to\n"
"InnoDB: restore from a backup, or dump + drop + reimport the table.\n",
- (ulint)(rec - buf_frame_align(rec)), next_offs,
- buf_frame_get_page_no(rec), index->name,
+ (ulong) (rec - buf_frame_align(rec)),
+ (ulong) next_offs,
+ (ulong) buf_frame_get_page_no(rec), index->name,
index->table_name);
err = DB_CORRUPTION;
@@ -3107,8 +3195,9 @@ rec_loop:
fprintf(stderr,
"InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n"
"InnoDB: index %s, table %s. We try to skip the rest of the page.\n",
- (ulint)(rec - buf_frame_align(rec)), next_offs,
- buf_frame_get_page_no(rec), index->name,
+ (ulong) (rec - buf_frame_align(rec)),
+ (ulong) next_offs,
+ (ulong) buf_frame_get_page_no(rec), index->name,
index->table_name);
btr_pcur_move_to_last_on_page(pcur, &mtr);
@@ -3123,8 +3212,9 @@ rec_loop:
fprintf(stderr,
"InnoDB: Index record corruption: rec offs %lu next offs %lu, page no %lu,\n"
"InnoDB: index %s, table %s. We try to skip the record.\n",
- (ulint)(rec - buf_frame_align(rec)), next_offs,
- buf_frame_get_page_no(rec), index->name,
+ (ulong) (rec - buf_frame_align(rec)),
+ (ulong) next_offs,
+ (ulong) buf_frame_get_page_no(rec), index->name,
index->table_name);
goto next_rec;
@@ -3351,7 +3441,11 @@ rec_loop:
rec_get_size(rec));
mach_write_to_4(buf, rec_get_extra_size(rec) + 4);
} else {
- row_sel_store_mysql_rec(buf, prebuilt, rec);
+ if (!row_sel_store_mysql_rec(buf, prebuilt, rec)) {
+ err = DB_TOO_BIG_RECORD;
+
+ goto lock_wait_or_error;
+ }
}
if (prebuilt->clust_index_was_generated) {
diff --git a/innobase/row/row0uins.c b/innobase/row/row0uins.c
index fff67dcd627..08f0e29c839 100644
--- a/innobase/row/row0uins.c
+++ b/innobase/row/row0uins.c
@@ -258,6 +258,13 @@ row_undo_ins_parse_undo_rec(
return;
}
+ if (node->table->ibd_file_missing) {
+ /* We skip undo operations to missing .ibd files */
+ node->table = NULL;
+
+ return;
+ }
+
clust_index = dict_table_get_first_index(node->table);
ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
diff --git a/innobase/row/row0umod.c b/innobase/row/row0umod.c
index 34c3aaf9142..1bfd71f8c64 100644
--- a/innobase/row/row0umod.c
+++ b/innobase/row/row0umod.c
@@ -708,6 +708,13 @@ row_undo_mod_parse_undo_rec(
return;
}
+ if (node->table->ibd_file_missing) {
+ /* We skip undo operations to missing .ibd files */
+ node->table = NULL;
+
+ return;
+ }
+
clust_index = dict_table_get_first_index(node->table);
ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
diff --git a/innobase/row/row0undo.c b/innobase/row/row0undo.c
index 78cfe70c260..613d0a3b890 100644
--- a/innobase/row/row0undo.c
+++ b/innobase/row/row0undo.c
@@ -323,7 +323,8 @@ row_undo_step(
if (err != DB_SUCCESS) {
/* SQL error detected */
- fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n", err);
+ fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n",
+ (ulong) err);
if (err == DB_OUT_OF_FILE_SPACE) {
fprintf(stderr,
diff --git a/innobase/row/row0upd.c b/innobase/row/row0upd.c
index ad6542845cb..f8739b65c2f 100644
--- a/innobase/row/row0upd.c
+++ b/innobase/row/row0upd.c
@@ -1605,7 +1605,8 @@ row_upd_clust_step(
then we have to free the file segments of the index tree associated
with the index */
- if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {
+ if (node->is_delete
+ && ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {
dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr);
diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c
index 0be0ab957af..838e63b3e25 100644
--- a/innobase/srv/srv0srv.c
+++ b/innobase/srv/srv0srv.c
@@ -72,6 +72,10 @@ names, where the file name itself may also contain a path */
char* srv_data_home = NULL;
char* srv_arch_dir = NULL;
+ibool srv_file_per_table = FALSE; /* store to its own file each table
+ created by a user; data dictionary
+ tables are in the system tablespace
+ 0 */
ulint srv_n_data_files = 0;
char** srv_data_file_names = NULL;
ulint* srv_data_file_sizes = NULL; /* size in database pages */
@@ -95,7 +99,7 @@ char** srv_log_group_home_dirs = NULL;
ulint srv_n_log_groups = ULINT_MAX;
ulint srv_n_log_files = ULINT_MAX;
ulint srv_log_file_size = ULINT_MAX; /* size in database pages */
-ibool srv_log_archive_on = TRUE;
+ibool srv_log_archive_on = FALSE;
ulint srv_log_buffer_size = ULINT_MAX; /* size in database pages */
ulint srv_flush_log_at_trx_commit = 1;
@@ -137,9 +141,14 @@ byte srv_latin1_ordering[256] /* The sort order table of the latin1
, 0xD8, 0x55, 0x55, 0x55, 0x59, 0x59, 0xDE, 0xFF
};
-ulint srv_pool_size = ULINT_MAX; /* size in database pages;
- MySQL originally sets this
- value in bytes */
+ulint srv_pool_size = ULINT_MAX; /* size in pages; MySQL inits
+ this to size in kilobytes but
+ we normalize this to pages in
+ srv_boot() */
+ulint srv_awe_window_size = 0; /* size in pages; MySQL inits
+ this to bytes, but we
+ normalize it to pages in
+ srv_boot() */
ulint srv_mem_pool_size = ULINT_MAX; /* size in bytes */
ulint srv_lock_table_size = ULINT_MAX;
@@ -154,6 +163,8 @@ char* srv_file_flush_method_str = NULL;
ulint srv_unix_file_flush_method = SRV_UNIX_FDATASYNC;
ulint srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+ulint srv_max_n_open_files = 300;
+
/* The InnoDB main thread tries to keep the ratio of modified pages
in the buffer pool to all database pages in the buffer pool smaller than
the following number. But it is not guaranteed that the value stays below
@@ -211,7 +222,7 @@ struct srv_conc_slot_struct{
UT_LIST_BASE_NODE_T(srv_conc_slot_t) srv_conc_queue; /* queue of threads
waiting to get in */
-srv_conc_slot_t* srv_conc_slots; /* array of wait
+srv_conc_slot_t* srv_conc_slots; /* array of wait
slots */
/* Number of times a thread is allowed to enter InnoDB within the same
@@ -228,6 +239,13 @@ ibool srv_use_doublewrite_buf = TRUE;
ibool srv_set_thread_priorities = TRUE;
int srv_query_thread_priority = 0;
+
+/* TRUE if the Address Windowing Extensions of Windows are used; then we must
+disable adaptive hash indexes */
+ibool srv_use_awe = FALSE;
+ibool srv_use_adaptive_hash_indexes = TRUE;
+
+
/*-------------------------------------------*/
ulint srv_n_spin_wait_rounds = 20;
ulint srv_spin_wait_delay = 5;
@@ -615,7 +633,8 @@ srv_suspend_thread(void)
if (srv_print_thread_releases) {
printf("Suspending thread %lu to slot %lu meter %lu\n",
- os_thread_get_curr_id(), slot_no, srv_meter[SRV_RECOVERY]);
+ (ulong) os_thread_get_curr_id(), (ulong) slot_no,
+ (ulong) srv_meter[SRV_RECOVERY]);
}
slot = srv_table_get_nth_slot(slot_no);
@@ -677,7 +696,8 @@ srv_release_threads(
if (srv_print_thread_releases) {
printf(
"Releasing thread %lu type %lu from slot %lu meter %lu\n",
- slot->id, type, i, srv_meter[SRV_RECOVERY]);
+ (ulong) slot->id, (ulong) type, (ulong) i,
+ (ulong) srv_meter[SRV_RECOVERY]);
}
count++;
@@ -784,7 +804,7 @@ srv_init(void)
UT_LIST_INIT(srv_conc_queue);
srv_conc_slots = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_conc_slot_t));
-
+
for (i = 0; i < OS_THREAD_MAX_N; i++) {
conc_slot = srv_conc_slots + i;
conc_slot->reserved = FALSE;
@@ -908,6 +928,7 @@ retry:
slot = srv_conc_slots + i;
if (!slot->reserved) {
+
break;
}
}
@@ -1102,9 +1123,19 @@ srv_normalize_init_values(void)
srv_log_buffer_size = srv_log_buffer_size / UNIV_PAGE_SIZE;
- srv_pool_size = srv_pool_size / UNIV_PAGE_SIZE;
+ srv_pool_size = srv_pool_size / (UNIV_PAGE_SIZE / 1024);
+
+ srv_awe_window_size = srv_awe_window_size / UNIV_PAGE_SIZE;
- srv_lock_table_size = 20 * srv_pool_size;
+ if (srv_use_awe) {
+ /* If we are using AWE we must save memory in the 32-bit
+ address space of the process, and cannot bind the lock
+ table size to the real buffer pool size. */
+
+ srv_lock_table_size = 20 * srv_awe_window_size;
+ } else {
+ srv_lock_table_size = 5 * srv_pool_size;
+ }
return(DB_SUCCESS);
}
@@ -1170,7 +1201,7 @@ srv_table_reserve_slot_for_mysql(void)
" InnoDB: There appear to be %lu MySQL threads currently waiting\n"
"InnoDB: inside InnoDB, which is the upper limit. Cannot continue operation.\n"
"InnoDB: We intentionally generate a seg fault to print a stack trace\n"
-"InnoDB: on Linux. But first we print a list of waiting threads.\n", i);
+"InnoDB: on Linux. But first we print a list of waiting threads.\n", (ulong) i);
for (i = 0; i < OS_THREAD_MAX_N; i++) {
@@ -1178,10 +1209,10 @@ srv_table_reserve_slot_for_mysql(void)
fprintf(stderr,
"Slot %lu: thread id %lu, type %lu, in use %lu, susp %lu, time %lu\n",
- i, os_thread_pf(slot->id),
- slot->type, slot->in_use,
- slot->suspended,
- (ulint)difftime(ut_time(), slot->suspend_time));
+ (ulong) i, (ulong) os_thread_pf(slot->id),
+ (ulong) slot->type, (ulong) slot->in_use,
+ (ulong) slot->suspended,
+ (ulong) difftime(ut_time(), slot->suspend_time));
}
ut_error;
@@ -1400,7 +1431,7 @@ srv_sprintf_innodb_monitor(
char* buf_end = buf + len - 2000;
double time_elapsed;
time_t current_time;
- ulint n_reserved;
+ ulint n_reserved;
mutex_enter(&srv_innodb_monitor_mutex);
@@ -1428,7 +1459,7 @@ srv_sprintf_innodb_monitor(
buf += sprintf(buf,
"Per second averages calculated from the last %lu seconds\n",
- (ulint)time_elapsed);
+ (ulong) time_elapsed);
buf += sprintf(buf, "----------\n"
"SEMAPHORES\n"
@@ -1501,9 +1532,24 @@ srv_sprintf_innodb_monitor(
"BUFFER POOL AND MEMORY\n"
"----------------------\n");
buf += sprintf(buf,
- "Total memory allocated %lu; in additional pool allocated %lu\n",
+ "Total memory allocated " ULINTPF
+ "; in additional pool allocated " ULINTPF "\n",
ut_total_allocated_memory,
mem_pool_get_reserved(mem_comm_pool));
+
+ if (mem_out_of_mem_err_msg_count > 0) {
+ buf += sprintf(buf,
+ "Mem allocation has spilled out of additional mem pool" ULINTPF
+ "times\n",
+ mem_out_of_mem_err_msg_count);
+ }
+
+ if (srv_use_awe) {
+ buf += sprintf(buf,
+ "In addition to that %lu MB of AWE memory allocated\n",
+ (ulong) (srv_pool_size / ((1024 * 1024) / UNIV_PAGE_SIZE)));
+ }
+
buf_print_io(buf, buf_end);
buf = buf + strlen(buf);
ut_a(buf < buf_end + 1500);
@@ -1512,30 +1558,32 @@ srv_sprintf_innodb_monitor(
"ROW OPERATIONS\n"
"--------------\n");
buf += sprintf(buf,
- "%ld queries inside InnoDB, %lu queries in queue\n",
- srv_conc_n_threads, srv_conc_n_waiting_threads);
+ "%ld queries inside InnoDB, %lu queries in queue\n",
+ (long) srv_conc_n_threads,
+ (ulong) srv_conc_n_waiting_threads);
- n_reserved = fil_space_get_n_reserved_extents(0);
- if (n_reserved > 0) {
- buf += sprintf(buf,
- "%lu tablespace extents now reserved for B-tree split operations\n",
- n_reserved);
- }
+ n_reserved = fil_space_get_n_reserved_extents(0);
+ if (n_reserved > 0) {
+ buf += sprintf(buf,
+ "%lu tablespace extents now reserved for B-tree split operations\n",
+ (ulong) n_reserved);
+ }
#ifdef UNIV_LINUX
buf += sprintf(buf,
"Main thread process no. %lu, id %lu, state: %.29s\n",
- srv_main_thread_process_no,
- srv_main_thread_id,
- srv_main_thread_op_info);
+ (ulong) srv_main_thread_process_no,
+ (ulong) srv_main_thread_id,
+ srv_main_thread_op_info);
#else
buf += sprintf(buf,
"Main thread id %lu, state: %.29s\n",
- srv_main_thread_id,
+ (ulong) srv_main_thread_id,
srv_main_thread_op_info);
#endif
buf += sprintf(buf,
- "Number of rows inserted %lu, updated %lu, deleted %lu, read %lu\n",
+ "Number of rows inserted " ULINTPF
+ ", updated " ULINTPF ", deleted " ULINTPF ", read " ULINTPF "\n",
srv_n_rows_inserted,
srv_n_rows_updated,
srv_n_rows_deleted,
@@ -1760,8 +1808,13 @@ srv_error_monitor_thread(
os_thread_create */
{
ulint cnt = 0;
+ dulint old_lsn;
+ dulint new_lsn;
UT_NOT_USED(arg);
+
+ old_lsn = srv_start_lsn;
+
#ifdef UNIV_DEBUG_THREAD_CREATION
printf("Error monitor thread starts, id %lu\n",
os_thread_pf(os_thread_get_curr_id()));
@@ -1771,7 +1824,24 @@ loop:
cnt++;
- os_thread_sleep(2000000);
+ /* Try to track a strange bug reported by Harald Fuchs and others,
+ where the lsn seems to decrease at times */
+
+ new_lsn = log_get_lsn();
+
+ if (ut_dulint_cmp(new_lsn, old_lsn) < 0) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Error: old log sequence number %lu %lu was greater\n"
+"InnoDB: than the new log sequence number %lu %lu!\n"
+"InnoDB: Please send a bug report to mysql@lists.mysql.com\n",
+ (ulong) ut_dulint_get_high(old_lsn),
+ (ulong) ut_dulint_get_low(old_lsn),
+ (ulong) ut_dulint_get_high(new_lsn),
+ (ulong) ut_dulint_get_low(new_lsn));
+ }
+
+ old_lsn = new_lsn;
if (difftime(time(NULL), srv_last_monitor_time) > 60) {
/* We referesh InnoDB Monitor values so that averages are
@@ -1788,6 +1858,8 @@ loop:
fflush(stderr);
fflush(stdout);
+ os_thread_sleep(2000000);
+
if (srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE) {
goto loop;
@@ -1955,6 +2027,9 @@ loop:
srv_main_thread_op_info = (char*)"flushing log";
log_buffer_flush_to_disk();
+ srv_main_thread_op_info = (char*)"making checkpoint";
+ log_free_check();
+
/* If there were less than 5 i/os during the
one second sleep, we assume that there is free
disk i/o capacity available, and it makes sense to
@@ -2218,11 +2293,13 @@ flush_loop:
goto loop;
}
mutex_exit(&kernel_mutex);
-
+/*
srv_main_thread_op_info =
(char*)"archiving log (if log archive is on)";
log_archive_do(FALSE, &n_bytes_archived);
+*/
+ n_bytes_archived = 0;
/* Keep looping in the background loop if still work to do */
@@ -2287,7 +2364,7 @@ suspend_thread:
os_thread_exit(NULL);
#ifndef __WIN__
- return(NULL);
+ return(NULL); /* Not reached */
#else
return(0);
#endif
diff --git a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c
index 0491aed29f5..7b50877709b 100644
--- a/innobase/srv/srv0start.c
+++ b/innobase/srv/srv0start.c
@@ -30,6 +30,7 @@ Created 2/16/1996 Heikki Tuuri
#include "page0cur.h"
#include "trx0trx.h"
#include "dict0boot.h"
+#include "dict0load.h"
#include "trx0sys.h"
#include "dict0crea.h"
#include "btr0btr.h"
@@ -55,6 +56,13 @@ Created 2/16/1996 Heikki Tuuri
#include "srv0start.h"
#include "que0que.h"
+/* Log sequence number immediately after startup */
+dulint srv_start_lsn;
+/* Log sequence number at shutdown */
+dulint srv_shutdown_lsn;
+
+ibool srv_start_raw_disk_in_use = FALSE;
+
ibool srv_start_has_been_called = FALSE;
ulint srv_sizeof_trx_t_in_ha_innodb_cc;
@@ -86,13 +94,6 @@ ibool srv_os_test_mutex_is_locked = FALSE;
#define SRV_N_PENDING_IOS_PER_THREAD OS_AIO_N_PENDING_IOS_PER_THREAD
#define SRV_MAX_N_PENDING_SYNC_IOS 100
-/* The following limit may be too big in some old operating systems:
-we may get an assertion failure in os0file.c */
-
-#define SRV_MAX_N_OPEN_FILES 500
-
-#define SRV_LOG_SPACE_FIRST_ID 1000000000
-
/*************************************************************************
Reads the data files and their sizes from a character string given in
the .cnf file. */
@@ -136,7 +137,8 @@ srv_parse_data_file_paths_and_sizes(
while ((*str != ':' && *str != '\0')
|| (*str == ':'
- && (*(str + 1) == '\\' || *(str + 1) == '/'))) {
+ && (*(str + 1) == '\\' || *(str + 1) == '/'
+ || *(str + 1) == ':'))) {
str++;
}
@@ -233,11 +235,15 @@ srv_parse_data_file_paths_and_sizes(
while (*str != '\0') {
path = str;
- /* Note that we must ignore the ':' in a Windows path */
+ /* Note that we must step over the ':' in a Windows path;
+ a Windows path normally looks like C:\ibdata\ibdata1:1G, but
+ a Windows raw partition may have a specification like
+ \\.\C::1Gnewraw or \\.\PHYSICALDRIVE2:1Gnewraw */
while ((*str != ':' && *str != '\0')
|| (*str == ':'
- && (*(str + 1) == '\\' || *(str + 1) == '/'))) {
+ && (*(str + 1) == '\\' || *(str + 1) == '/'
+ || *(str + 1) == ':'))) {
str++;
}
@@ -433,7 +439,7 @@ io_handler_thread(
os_thread_exit(NULL);
#ifndef __WIN__
- return(NULL);
+ return(NULL); /* Not reached */
#else
return(0);
#endif
@@ -451,7 +457,8 @@ Normalizes a directory path for Windows: converts slashes to backslashes. */
void
srv_normalize_path_for_win(
/*=======================*/
- char* str __attribute__((unused))) /* in/out: null-terminated character string */
+ char* str __attribute__((unused))) /* in/out: null-terminated
+ character string */
{
#ifdef __WIN__
ulint i;
@@ -504,7 +511,7 @@ srv_calc_low32(
expressed in bytes */
ulint file_size) /* in: file size in database pages */
{
- return(0xFFFFFFFF & (file_size << UNIV_PAGE_SIZE_SHIFT));
+ return(0xFFFFFFFFUL & (file_size << UNIV_PAGE_SIZE_SHIFT));
}
/*************************************************************************
@@ -539,7 +546,6 @@ open_or_create_log_file(
ulint i) /* in: log file number in group */
{
ibool ret;
- ulint arch_space_id;
ulint size;
ulint size_high;
char name[10000];
@@ -552,12 +558,12 @@ open_or_create_log_file(
srv_log_group_home_dirs[k] = srv_add_path_separator_if_needed(
srv_log_group_home_dirs[k]);
- sprintf(name, "%s%s%lu", srv_log_group_home_dirs[k], "ib_logfile", i);
+ sprintf(name, "%s%s%lu", srv_log_group_home_dirs[k], "ib_logfile", (ulong) i);
files[i] = os_file_create(name, OS_FILE_CREATE, OS_FILE_NORMAL,
OS_LOG_FILE, &ret);
if (ret == FALSE) {
- if (os_file_get_last_error() != OS_FILE_ALREADY_EXISTS) {
+ if (os_file_get_last_error(FALSE) != OS_FILE_ALREADY_EXISTS) {
fprintf(stderr,
"InnoDB: Error in creating or opening %s\n", name);
@@ -582,9 +588,9 @@ open_or_create_log_file(
fprintf(stderr,
"InnoDB: Error: log file %s is of different size %lu %lu bytes\n"
"InnoDB: than specified in the .cnf file %lu %lu bytes!\n",
- name, size_high, size,
- srv_calc_high32(srv_log_file_size),
- srv_calc_low32(srv_log_file_size));
+ name, (ulong) size_high, (ulong) size,
+ (ulong) srv_calc_high32(srv_log_file_size),
+ (ulong) srv_calc_low32(srv_log_file_size));
return(DB_ERROR);
}
@@ -602,7 +608,7 @@ open_or_create_log_file(
}
fprintf(stderr, "InnoDB: Setting log file %s size to %lu MB\n",
- name, srv_log_file_size
+ name, (ulong) srv_log_file_size
>> (20 - UNIV_PAGE_SIZE_SHIFT));
fprintf(stderr,
@@ -634,24 +640,27 @@ open_or_create_log_file(
ut_a(fil_validate());
fil_node_create(name, srv_log_file_size,
- 2 * k + SRV_LOG_SPACE_FIRST_ID);
-
+ 2 * k + SRV_LOG_SPACE_FIRST_ID, FALSE);
+#ifdef notdefined
/* If this is the first log group, create the file space object
- for archived logs */
+ for archived logs.
+ Under MySQL, no archiving is ever done. */
if (k == 0 && i == 0) {
arch_space_id = 2 * k + 1 + SRV_LOG_SPACE_FIRST_ID;
- fil_space_create((char*) "arch_log_space", arch_space_id, FIL_LOG);
+ fil_space_create((char*) "arch_log_space", arch_space_id,
+ FIL_LOG);
} else {
arch_space_id = ULINT_UNDEFINED;
}
-
+#endif
if (i == 0) {
log_group_init(k, srv_n_log_files,
srv_log_file_size * UNIV_PAGE_SIZE,
2 * k + SRV_LOG_SPACE_FIRST_ID,
- arch_space_id);
+ SRV_LOG_SPACE_FIRST_ID + 1); /* dummy arch
+ space id */
}
return(DB_SUCCESS);
@@ -686,7 +695,7 @@ open_or_create_data_files(
if (srv_n_data_files >= 1000) {
fprintf(stderr, "InnoDB: can only have < 1000 data files\n"
"InnoDB: you have defined %lu\n",
- srv_n_data_files);
+ (ulong) srv_n_data_files);
return(DB_ERROR);
}
@@ -702,18 +711,32 @@ open_or_create_data_files(
sprintf(name, "%s%s", srv_data_home, srv_data_file_names[i]);
- files[i] = os_file_create(name, OS_FILE_CREATE,
+ if (srv_data_file_is_raw_partition[i] == 0) {
+
+ /* First we try to create the file: if it already
+ exists, ret will get value FALSE */
+
+ files[i] = os_file_create(name, OS_FILE_CREATE,
OS_FILE_NORMAL, OS_DATA_FILE, &ret);
- if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) {
+ if (ret == FALSE && os_file_get_last_error(FALSE) !=
+ OS_FILE_ALREADY_EXISTS) {
+ fprintf(stderr,
+ "InnoDB: Error in creating or opening %s\n",
+ name);
+
+ return(DB_ERROR);
+ }
+ } else if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) {
/* The partition is opened, not created; then it is
written over */
+ srv_start_raw_disk_in_use = TRUE;
srv_created_new_raw = TRUE;
files[i] = os_file_create(
- name, OS_FILE_OPEN, OS_FILE_NORMAL,
- OS_DATA_FILE, &ret);
+ name, OS_FILE_OPEN_RAW, OS_FILE_NORMAL,
+ OS_DATA_FILE, &ret);
if (!ret) {
fprintf(stderr,
"InnoDB: Error in opening %s\n", name);
@@ -721,19 +744,15 @@ open_or_create_data_files(
return(DB_ERROR);
}
} else if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
+ srv_start_raw_disk_in_use = TRUE;
+
ret = FALSE;
+ } else {
+ ut_a(0);
}
if (ret == FALSE) {
- if (srv_data_file_is_raw_partition[i] != SRV_OLD_RAW
- && os_file_get_last_error() !=
- OS_FILE_ALREADY_EXISTS) {
- fprintf(stderr,
- "InnoDB: Error in creating or opening %s\n",
- name);
-
- return(DB_ERROR);
- }
+ /* We open the data file */
if (one_created) {
fprintf(stderr,
@@ -744,71 +763,81 @@ open_or_create_data_files(
return(DB_ERROR);
}
- files[i] = os_file_create(
- name, OS_FILE_OPEN, OS_FILE_NORMAL,
- OS_DATA_FILE, &ret);
+ if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
+ files[i] = os_file_create(
+ name, OS_FILE_OPEN_RAW, OS_FILE_NORMAL,
+ OS_DATA_FILE, &ret);
+ } else {
+ files[i] = os_file_create(
+ name, OS_FILE_OPEN, OS_FILE_NORMAL,
+ OS_DATA_FILE, &ret);
+ }
+
if (!ret) {
fprintf(stderr,
"InnoDB: Error in opening %s\n", name);
- os_file_get_last_error();
+ os_file_get_last_error(TRUE);
return(DB_ERROR);
}
- if (srv_data_file_is_raw_partition[i] != SRV_OLD_RAW) {
-
- ret = os_file_get_size(files[i], &size,
- &size_high);
- ut_a(ret);
- /* Round size downward to megabytes */
+ if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
+
+ goto skip_size_check;
+ }
+
+ ret = os_file_get_size(files[i], &size, &size_high);
+ ut_a(ret);
+ /* Round size downward to megabytes */
- rounded_size_pages = (size / (1024 * 1024)
+ rounded_size_pages = (size / (1024 * 1024)
+ 4096 * size_high)
<< (20 - UNIV_PAGE_SIZE_SHIFT);
- if (i == srv_n_data_files - 1
+ if (i == srv_n_data_files - 1
&& srv_auto_extend_last_data_file) {
- if (srv_data_file_sizes[i] >
+ if (srv_data_file_sizes[i] >
rounded_size_pages
|| (srv_last_file_size_max > 0
&& srv_last_file_size_max <
rounded_size_pages)) {
- fprintf(stderr,
+ fprintf(stderr,
"InnoDB: Error: auto-extending data file %s is of a different size\n"
"InnoDB: %lu pages (rounded down to MB) than specified in the .cnf file:\n"
"InnoDB: initial %lu pages, max %lu (relevant if non-zero) pages!\n",
- name, rounded_size_pages,
- srv_data_file_sizes[i], srv_last_file_size_max);
+ name, (ulong) rounded_size_pages,
+ (ulong) srv_data_file_sizes[i],
+ (ulong) srv_last_file_size_max);
- return(DB_ERROR);
- }
-
- srv_data_file_sizes[i] =
- rounded_size_pages;
+ return(DB_ERROR);
}
+
+ srv_data_file_sizes[i] = rounded_size_pages;
+ }
- if (rounded_size_pages
- != srv_data_file_sizes[i]) {
+ if (rounded_size_pages != srv_data_file_sizes[i]) {
- fprintf(stderr,
+ fprintf(stderr,
"InnoDB: Error: data file %s is of a different size\n"
"InnoDB: %lu pages (rounded down to MB)\n"
"InnoDB: than specified in the .cnf file %lu pages!\n", name,
- rounded_size_pages,
- srv_data_file_sizes[i]);
+ (ulong) rounded_size_pages,
+ (ulong) srv_data_file_sizes[i]);
- return(DB_ERROR);
- }
+ return(DB_ERROR);
}
-
+skip_size_check:
fil_read_flushed_lsn_and_arch_log_no(files[i],
one_opened,
min_flushed_lsn, min_arch_log_no,
max_flushed_lsn, max_arch_log_no);
one_opened = TRUE;
} else {
+ /* We created the data file and now write it full of
+ zeros */
+
one_created = TRUE;
if (i > 0) {
@@ -826,7 +855,7 @@ open_or_create_data_files(
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Setting file %s size to %lu MB\n",
- name, (srv_data_file_sizes[i]
+ name, (ulong) (srv_data_file_sizes[i]
>> (20 - UNIV_PAGE_SIZE_SHIFT)));
fprintf(stderr,
@@ -856,7 +885,13 @@ open_or_create_data_files(
ut_a(fil_validate());
- fil_node_create(name, srv_data_file_sizes[i], 0);
+ if (srv_data_file_is_raw_partition[i]) {
+
+ fil_node_create(name, srv_data_file_sizes[i], 0, TRUE);
+ } else {
+ fil_node_create(name, srv_data_file_sizes[i], 0,
+ FALSE);
+ }
}
ios = 0;
@@ -877,6 +912,7 @@ innobase_start_or_create_for_mysql(void)
/*====================================*/
/* out: DB_SUCCESS or error code */
{
+ buf_pool_t* ret;
ibool create_new_db;
ibool log_file_created;
ibool log_created = FALSE;
@@ -885,15 +921,24 @@ innobase_start_or_create_for_mysql(void)
dulint max_flushed_lsn;
ulint min_arch_log_no;
ulint max_arch_log_no;
- ibool start_archive;
ulint sum_of_new_sizes;
ulint sum_of_data_file_sizes;
ulint tablespace_size_in_header;
ulint err;
ulint i;
- ulint k;
+ ibool srv_file_per_table_original_value = srv_file_per_table;
mtr_t mtr;
+ if (sizeof(ulint) != sizeof(void*)) {
+ fprintf(stderr,
+"InnoDB: Error: size of InnoDB's ulint is %lu, but size of void* is %lu.\n"
+"InnoDB: The sizes should be the same so that on a 64-bit platform you can\n"
+"InnoDB: allocate more than 4 GB of memory.",
+ (ulong)sizeof(ulint), (ulong)sizeof(void*));
+ }
+
+ srv_file_per_table = FALSE; /* system tables are created in tablespace
+ 0 */
#ifdef UNIV_DEBUG
fprintf(stderr,
"InnoDB: !!!!!!!!!!!!!! UNIV_DEBUG switched on !!!!!!!!!!!!!!!\n");
@@ -914,12 +959,17 @@ innobase_start_or_create_for_mysql(void)
"InnoDB: !!!!!!!!!!!!!! UNIV_MEM_DEBUG switched on !!!!!!!!!!!!!!!\n");
#endif
+#ifdef UNIV_SIMULATE_AWE
+ fprintf(stderr,
+"InnoDB: !!!!!!!!!!!!!! UNIV_SIMULATE_AWE switched on !!!!!!!!!!!!!!!!!\n");
+#endif
if (srv_sizeof_trx_t_in_ha_innodb_cc != (ulint)sizeof(trx_t)) {
fprintf(stderr,
"InnoDB: Error: trx_t size is %lu in ha_innodb.cc but %lu in srv0start.c\n"
"InnoDB: Check that pthread_mutex_t is defined in the same way in these\n"
"InnoDB: compilation modules. Cannot continue.\n",
- srv_sizeof_trx_t_in_ha_innodb_cc, (ulint)sizeof(trx_t));
+ (ulong) srv_sizeof_trx_t_in_ha_innodb_cc,
+ (ulong) sizeof(trx_t));
return(DB_ERROR);
}
@@ -944,6 +994,17 @@ innobase_start_or_create_for_mysql(void)
srv_startup_is_before_trx_rollback_phase = TRUE;
os_aio_use_native_aio = FALSE;
+#if !defined(__WIN2000__) && !defined(UNIV_SIMULATE_AWE)
+ if (srv_use_awe) {
+
+ fprintf(stderr,
+"InnoDB: Error: You have specified innodb_buffer_pool_awe_mem_mb\n"
+"InnoDB: in my.cnf, but AWE can only be used in Windows 2000 and later.\n");
+
+ return(DB_ERROR);
+ }
+#endif
+
#ifdef __WIN__
if (os_get_os_version() == OS_WIN95
|| os_get_os_version() == OS_WIN31
@@ -1006,6 +1067,31 @@ innobase_start_or_create_for_mysql(void)
srv_file_flush_method_str);
return(DB_ERROR);
}
+
+ /* Set the maximum number of threads which can wait for a semaphore
+ inside InnoDB */
+#if defined(__WIN__) || defined(__NETWARE__)
+
+/* Create less event semaphores because Win 98/ME had difficulty creating
+40000 event semaphores.
+Comment from Novell, Inc.: also, these just take a lot of memory on
+NetWare. */
+ srv_max_n_threads = 1000;
+#else
+ if (srv_pool_size >= 8 * 1024) {
+ /* Here we still have srv_pool_size counted
+ in kilobytes, srv_boot converts the value to
+ pages; if buffer pool is less than 8 MB,
+ assume fewer threads. */
+ srv_max_n_threads = 10000;
+ } else {
+ srv_max_n_threads = 1000; /* saves several MB of memory,
+ especially in 64-bit
+ computers */
+ }
+#endif
+ /* Note that the call srv_boot() also changes the values of
+ srv_pool_size etc. to the units used by InnoDB internally */
/* Set the maximum number of threads which can wait for a semaphore
inside InnoDB */
@@ -1044,7 +1130,6 @@ NetWare. */
if (!os_aio_use_native_aio) {
/* In simulated aio we currently have use only for 4 threads */
-
srv_n_file_io_threads = 4;
os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD
@@ -1058,9 +1143,28 @@ NetWare. */
SRV_MAX_N_PENDING_SYNC_IOS);
}
- fil_init(SRV_MAX_N_OPEN_FILES);
+ fil_init(srv_max_n_open_files);
- buf_pool_init(srv_pool_size, srv_pool_size);
+ if (srv_use_awe) {
+ fprintf(stderr,
+"InnoDB: Using AWE: Memory window is %lu MB and AWE memory is %lu MB\n",
+ (ulong) (srv_awe_window_size / ((1024 * 1024) / UNIV_PAGE_SIZE)),
+ (ulong) (srv_pool_size / ((1024 * 1024) / UNIV_PAGE_SIZE)));
+
+ /* We must disable adaptive hash indexes because they do not
+ tolerate remapping of pages in AWE */
+
+ srv_use_adaptive_hash_indexes = FALSE;
+ ret = buf_pool_init(srv_pool_size, srv_pool_size,
+ srv_awe_window_size);
+ } else {
+ ret = buf_pool_init(srv_pool_size, srv_pool_size,
+ srv_pool_size);
+ }
+
+ if (ret == NULL) {
+ return(DB_ERROR);
+ }
fsp_init();
log_init();
@@ -1071,7 +1175,6 @@ NetWare. */
for (i = 0; i < srv_n_file_io_threads; i++) {
n[i] = i;
-
os_thread_create(io_handler_thread, n + i, thread_ids + i);
}
@@ -1084,7 +1187,6 @@ NetWare. */
}
if (srv_n_log_files * srv_log_file_size >= 262144) {
-
fprintf(stderr,
"InnoDB: Error: combined size of log files must be < 4 GB\n");
@@ -1130,42 +1232,25 @@ NetWare. */
return((int) err);
}
- if (!create_new_db) {
- /* If we are using the doublewrite method, we will
- check if there are half-written pages in data files,
- and restore them from the doublewrite buffer if
- possible */
-
- if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) {
-
- trx_sys_doublewrite_restore_corrupt_pages();
- }
- }
-
srv_normalize_path_for_win(srv_arch_dir);
srv_arch_dir = srv_add_path_separator_if_needed(srv_arch_dir);
+
+ for (i = 0; i < srv_n_log_files; i++) {
+ err = open_or_create_log_file(create_new_db, &log_file_created,
+ log_opened, 0, i);
+ if (err != DB_SUCCESS) {
- for (k = 0; k < srv_n_log_groups; k++) {
-
- for (i = 0; i < srv_n_log_files; i++) {
-
- err = open_or_create_log_file(create_new_db,
- &log_file_created,
- log_opened, k, i);
- if (err != DB_SUCCESS) {
-
- return((int) err);
- }
-
- if (log_file_created) {
- log_created = TRUE;
- } else {
- log_opened = TRUE;
- }
+ return((int) err);
+ }
- if ((log_opened && create_new_db)
+ if (log_file_created) {
+ log_created = TRUE;
+ } else {
+ log_opened = TRUE;
+ }
+ if ((log_opened && create_new_db)
|| (log_opened && log_created)) {
- fprintf(stderr,
+ fprintf(stderr,
"InnoDB: Error: all log files must be created at the same time.\n"
"InnoDB: All log files must be created also in database creation.\n"
"InnoDB: If you want bigger or smaller log files, shut down the\n"
@@ -1173,14 +1258,16 @@ NetWare. */
"InnoDB: Then delete the existing log files. Edit the .cnf file\n"
"InnoDB: and start the database again.\n");
- return(DB_ERROR);
- }
-
+ return(DB_ERROR);
}
}
- if (log_created && !create_new_db && !srv_archive_recovery) {
+ /* Open all log files and data files in the system tablespace: we
+ keep them open until database shutdown */
+ fil_open_log_and_system_tablespace_files();
+
+ if (log_created && !create_new_db && !srv_archive_recovery) {
if (ut_dulint_cmp(max_flushed_lsn, min_flushed_lsn) != 0
|| max_arch_log_no != min_arch_log_no) {
fprintf(stderr,
@@ -1205,7 +1292,9 @@ NetWare. */
mutex_enter(&(log_sys->mutex));
- recv_reset_logs(max_flushed_lsn, max_arch_log_no + 1, TRUE);
+ /* Do not + 1 arch_log_no because we do not use log
+ archiving */
+ recv_reset_logs(max_flushed_lsn, max_arch_log_no, TRUE);
mutex_exit(&(log_sys->mutex));
}
@@ -1224,7 +1313,6 @@ NetWare. */
} else if (srv_archive_recovery) {
fprintf(stderr,
"InnoDB: Starting archive recovery from a backup...\n");
-
err = recv_recovery_from_archive_start(
min_flushed_lsn,
srv_archive_recovery_limit_lsn,
@@ -1233,14 +1321,11 @@ NetWare. */
return(DB_ERROR);
}
-
/* Since ibuf init is in dict_boot, and ibuf is needed
in any disk i/o, first call dict_boot */
dict_boot();
-
trx_sys_init_at_db_start();
-
srv_startup_is_before_trx_rollback_phase = FALSE;
/* Initialize the fsp free limit global variable in the log
@@ -1250,7 +1335,7 @@ NetWare. */
recv_recovery_from_archive_finish();
} else {
/* We always try to do a recovery, even if the database had
- been shut down normally */
+ been shut down normally: this is the normal startup path */
err = recv_recovery_from_checkpoint_start(LOG_CHECKPOINT,
ut_dulint_max,
@@ -1296,6 +1381,8 @@ NetWare. */
log_make_checkpoint_at(ut_dulint_max, TRUE);
+#ifdef notdefined
+ /* Archiving is always off under MySQL */
if (!srv_log_archive_on) {
ut_a(DB_SUCCESS == log_archive_noarchivelog());
} else {
@@ -1313,6 +1400,14 @@ NetWare. */
ut_a(DB_SUCCESS == log_archive_archivelog());
}
}
+#endif
+ if (!create_new_db && srv_force_recovery == 0) {
+ /* After a crash recovery we only check that the info in data
+ dictionary is consistent with what we already know about space
+ id's from the call of fil_load_single_table_tablespaces(). */
+
+ dict_check_tablespaces_or_store_max_id(recv_needed_recovery);
+ }
if (srv_measure_contention) {
/* os_thread_create(&test_measure_cont, NULL, thread_ids +
@@ -1326,17 +1421,28 @@ NetWare. */
and prints InnoDB monitor info */
os_thread_create(&srv_lock_timeout_and_monitor_thread, NULL,
- thread_ids + 2 + SRV_MAX_N_IO_THREADS);
+ thread_ids + 2 + SRV_MAX_N_IO_THREADS);
/* Create the thread which warns of long semaphore waits */
os_thread_create(&srv_error_monitor_thread, NULL,
- thread_ids + 3 + SRV_MAX_N_IO_THREADS);
+ thread_ids + 3 + SRV_MAX_N_IO_THREADS);
srv_was_started = TRUE;
srv_is_being_started = FALSE;
+#ifdef UNIV_DEBUG
+ /* Wait a while so that the created threads have time to suspend
+ themselves before we switch sync debugging on; otherwise a thread may
+ execute mutex_enter() before the checks are on, and mutex_exit() after
+ the checks are on, which will cause an assertion failure in sync
+ debug. */
+
+ os_thread_sleep(3000000);
+#endif
sync_order_checks_on = TRUE;
- if (srv_use_doublewrite_buf && trx_doublewrite == NULL) {
+ if (srv_use_doublewrite_buf && trx_doublewrite == NULL) {
+ /* Create the doublewrite buffer to a new tablespace */
+
trx_sys_create_doublewrite_buf();
}
@@ -1346,8 +1452,8 @@ NetWare. */
return((int)DB_ERROR);
}
- /* Create the master thread which monitors the database
- server, and does purge and other utility operations */
+ /* Create the master thread which does purge and other utility
+ operations */
os_thread_create(&srv_master_thread, NULL, thread_ids + 1 +
SRV_MAX_N_IO_THREADS);
@@ -1367,7 +1473,8 @@ NetWare. */
fprintf(stderr,
"InnoDB: Error: tablespace size stored in header is %lu pages, but\n"
"InnoDB: the sum of data file sizes is %lu pages\n",
- tablespace_size_in_header, sum_of_data_file_sizes);
+ (ulong) tablespace_size_in_header,
+ (ulong) sum_of_data_file_sizes);
}
if (srv_auto_extend_last_data_file
@@ -1376,10 +1483,11 @@ NetWare. */
fprintf(stderr,
"InnoDB: Error: tablespace size stored in header is %lu pages, but\n"
"InnoDB: the sum of data file sizes is only %lu pages\n",
- tablespace_size_in_header, sum_of_data_file_sizes);
+ (ulong) tablespace_size_in_header,
+ (ulong) sum_of_data_file_sizes);
}
- /* Check that os_fast_mutexes work as exptected */
+ /* Check that os_fast_mutexes work as expected */
os_fast_mutex_init(&srv_os_test_mutex);
if (0 != os_fast_mutex_trylock(&srv_os_test_mutex)) {
@@ -1397,43 +1505,73 @@ NetWare. */
os_fast_mutex_free(&srv_os_test_mutex);
- /***********************************************************/
- /* Do NOT merge to the 4.1 code base! */
- if (trx_sys_downgrading_from_4_1_1) {
+ if (srv_print_verbose_log) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Started; log sequence number %lu %lu\n",
+ (ulong) ut_dulint_get_high(srv_start_lsn),
+ (ulong) ut_dulint_get_low(srv_start_lsn));
+ }
+
+ if (srv_force_recovery > 0) {
fprintf(stderr,
-"InnoDB: You are downgrading from an InnoDB version which allows multiple\n"
+ "InnoDB: !!! innodb_force_recovery is set to %lu !!!\n",
+ (ulong) srv_force_recovery);
+ }
+
+ fflush(stderr);
+
+ if (trx_doublewrite_must_reset_space_ids) {
+ /* Actually, we did not change the undo log format between
+ 4.0 and 4.1.1, and we would not need to run purge to
+ completion. Note also that the purge algorithm in 4.1.1
+ can process the history list again even after a full
+ purge, because our algorithm does not cut the end of the
+ history list in all cases so that it would become empty
+ after a full purge. That means that we may purge 4.0 type
+ undo log even after this phase.
+
+ The insert buffer record format changed between 4.0 and
+ 4.1.1. It is essential that the insert buffer is emptied
+ here! */
+
+ fprintf(stderr,
+"InnoDB: You are upgrading to an InnoDB version which allows multiple\n"
"InnoDB: tablespaces. Wait that purge and insert buffer merge run to\n"
"InnoDB: completion...\n");
for (;;) {
- os_thread_sleep(10000000);
+ os_thread_sleep(1000000);
if (0 == strcmp(srv_main_thread_op_info,
"waiting for server activity")) {
+
+ ut_a(ibuf_is_empty());
+
break;
}
}
fprintf(stderr,
"InnoDB: Full purge and insert buffer merge completed.\n");
- trx_sys_mark_downgraded_from_4_1_1();
+ trx_sys_mark_upgraded_to_multiple_tablespaces();
fprintf(stderr,
-"InnoDB: Downgraded from >= 4.1.1 to 4.0\n");
+"InnoDB: You have now successfully upgraded to the multiple tablespaces\n"
+"InnoDB: format. You should NOT DOWNGRADE again to an earlier version of\n"
+"InnoDB: InnoDB! But if you absolutely need to downgrade, see section 4.6 of\n"
+"InnoDB: http://www.innodb.com/ibman.php for instructions.\n");
}
- /***********************************************************/
- if (srv_print_verbose_log) {
- ut_print_timestamp(stderr);
- fprintf(stderr, " InnoDB: Started\n");
- }
+ if (srv_force_recovery == 0) {
+ /* In the insert buffer we may have even bigger tablespace
+ id's, because we may have dropped those tablespaces, but
+ insert buffer merge has not had time to clean the records from
+ the ibuf tree. */
- if (srv_force_recovery > 0) {
- fprintf(stderr,
- "InnoDB: !!! innodb_force_recovery is set to %lu !!!\n",
- srv_force_recovery);
+ ibuf_update_max_tablespace_id();
}
- fflush(stderr);
+ srv_file_per_table = srv_file_per_table_original_value;
return((int) DB_SUCCESS);
}
@@ -1452,17 +1590,16 @@ innobase_shutdown_for_mysql(void)
if (srv_is_being_started) {
ut_print_timestamp(stderr);
fprintf(stderr,
- " InnoDB: Warning: shutting down a not properly started\n");
- fprintf(stderr,
- " InnoDB: or created database!\n");
+" InnoDB: Warning: shutting down a not properly started\n"
+" InnoDB: or created database!\n");
}
return(DB_SUCCESS);
}
- /* 1. Flush buffer pool to disk, write the current lsn to
+ /* 1. Flush the buffer pool to disk, write the current lsn to
the tablespace header(s), and copy all log data to archive.
- The step 1 is the real InnoDB shutdown. The remaining steps
+ The step 1 is the real InnoDB shutdown. The remaining steps 2 - ...
just free data structures after the shutdown. */
logs_empty_and_mark_files_at_shutdown();
@@ -1486,16 +1623,16 @@ innobase_shutdown_for_mysql(void)
/* NOTE: IF YOU CREATE THREADS IN INNODB, YOU MUST EXIT THEM
HERE OR EARLIER */
- /* 1. Let the lock timeout thread exit */
+ /* a. Let the lock timeout thread exit */
os_event_set(srv_lock_timeout_thread_event);
- /* 2. srv error monitor thread exits automatically, no need
+ /* b. srv error monitor thread exits automatically, no need
to do anything here */
- /* 3. We wake the master thread so that it exits */
+ /* c. We wake the master thread so that it exits */
srv_wake_master_thread();
- /* 4. Exit the i/o threads */
+ /* d. Exit the i/o threads */
os_aio_wake_all_threads_at_shutdown();
@@ -1523,7 +1660,7 @@ innobase_shutdown_for_mysql(void)
if (i == 1000) {
fprintf(stderr,
"InnoDB: Warning: %lu threads created by InnoDB had not exited at shutdown!\n",
- os_thread_count);
+ (ulong) os_thread_count);
}
/* 3. Free all InnoDB's own mutexes and the os_fast_mutexes inside
@@ -1548,13 +1685,16 @@ innobase_shutdown_for_mysql(void)
fprintf(stderr,
"InnoDB: Warning: some resources were not cleaned up in shutdown:\n"
"InnoDB: threads %lu, events %lu, os_mutexes %lu, os_fast_mutexes %lu\n",
- os_thread_count, os_event_count, os_mutex_count,
- os_fast_mutex_count);
+ (ulong) os_thread_count, (ulong) os_event_count,
+ (ulong) os_mutex_count, (ulong) os_fast_mutex_count);
}
if (srv_print_verbose_log) {
ut_print_timestamp(stderr);
- fprintf(stderr, " InnoDB: Shutdown completed\n");
+ fprintf(stderr,
+" InnoDB: Shutdown completed; log sequence number %lu %lu\n",
+ (ulong) ut_dulint_get_high(srv_shutdown_lsn),
+ (ulong) ut_dulint_get_low(srv_shutdown_lsn));
}
return((int) DB_SUCCESS);
diff --git a/innobase/sync/sync0arr.c b/innobase/sync/sync0arr.c
index 67671299e3d..8082f598b0c 100644
--- a/innobase/sync/sync0arr.c
+++ b/innobase/sync/sync0arr.c
@@ -457,8 +457,9 @@ sync_array_cell_print(
buf += sprintf(buf,
"--Thread %lu has waited at %s line %lu for %.2f seconds the semaphore:\n",
- os_thread_pf(cell->thread), cell->file, cell->line,
- difftime(time(NULL), cell->reservation_time));
+ (ulong) os_thread_pf(cell->thread), cell->file,
+ (ulong) cell->line,
+ difftime(time(NULL), cell->reservation_time));
if (type == SYNC_MUTEX) {
/* We use old_wait_mutex in case the cell has already
@@ -471,12 +472,12 @@ sync_array_cell_print(
"Last time reserved in file %s line %lu, "
#endif /* UNIV_SYNC_DEBUG */
"waiters flag %lu\n",
- mutex, mutex->cfile_name, mutex->cline,
- mutex->lock_word,
+ mutex, mutex->cfile_name, (ulong) mutex->cline,
+ (ulong) mutex->lock_word,
#ifdef UNIV_SYNC_DEBUG
- mutex->file_name, mutex->line,
+ mutex->file_name, (ulong) mutex->line,
#endif /* UNIV_SYNC_DEBUG */
- mutex->waiters);
+ (ulong) mutex->waiters);
} else if (type == RW_LOCK_EX || type == RW_LOCK_SHARED) {
@@ -490,11 +491,12 @@ sync_array_cell_print(
buf += sprintf(buf,
" RW-latch at %lx created in file %s line %lu\n",
- (ulint)rwlock, rwlock->cfile_name, rwlock->cline);
+ (ulong) rwlock, rwlock->cfile_name,
+ (ulong) rwlock->cline);
if (rwlock->writer != RW_LOCK_NOT_LOCKED) {
buf += sprintf(buf,
"a writer (thread id %lu) has reserved it in mode",
- os_thread_pf(rwlock->writer_thread));
+ (ulong) os_thread_pf(rwlock->writer_thread));
if (rwlock->writer == RW_LOCK_EX) {
buf += sprintf(buf, " exclusive\n");
} else {
@@ -504,14 +506,16 @@ sync_array_cell_print(
buf += sprintf(buf,
"number of readers %lu, waiters flag %lu\n",
- rwlock->reader_count, rwlock->waiters);
+ (ulong) rwlock->reader_count,
+ (ulong) rwlock->waiters);
buf += sprintf(buf,
"Last time read locked in file %s line %lu\n",
- rwlock->last_s_file_name, rwlock->last_s_line);
+ rwlock->last_s_file_name,
+ (ulong) rwlock->last_s_line);
buf += sprintf(buf,
"Last time write locked in file %s line %lu\n",
- rwlock->last_x_file_name, rwlock->last_x_line);
+ rwlock->last_x_file_name, (ulong) rwlock->last_x_line);
} else {
ut_error;
}
@@ -660,8 +664,8 @@ sync_array_detect_deadlock(
sync_array_cell_print(buf, cell);
printf(
"Mutex %lx owned by thread %lu file %s line %lu\n%s",
- (ulint)mutex, os_thread_pf(mutex->thread_id),
- mutex->file_name, mutex->line, buf);
+ (ulong) mutex, (ulong) os_thread_pf(mutex->thread_id),
+ mutex->file_name, (ulong) mutex->line, buf);
return(TRUE);
}
@@ -695,7 +699,7 @@ sync_array_detect_deadlock(
depth);
if (ret) {
sync_array_cell_print(buf, cell);
- printf("rw-lock %lx %s ", (ulint) lock, buf);
+ printf("rw-lock %lx %s ", (ulong) lock, buf);
rw_lock_debug_print(debug);
return(TRUE);
}
@@ -727,7 +731,7 @@ sync_array_detect_deadlock(
depth);
if (ret) {
sync_array_cell_print(buf, cell);
- printf("rw-lock %lx %s ", (ulint) lock, buf);
+ printf("rw-lock %lx %s ", (ulong) lock, buf);
rw_lock_debug_print(debug);
return(TRUE);
@@ -991,7 +995,7 @@ sync_array_output_info(
buf += sprintf(buf,
"OS WAIT ARRAY INFO: reservation count %ld, signal count %ld\n",
- arr->res_count, arr->sg_count);
+ (long) arr->res_count, (long) arr->sg_count);
i = 0;
count = 0;
diff --git a/innobase/sync/sync0rw.c b/innobase/sync/sync0rw.c
index 86924c437c7..93fd9f14575 100644
--- a/innobase/sync/sync0rw.c
+++ b/innobase/sync/sync0rw.c
@@ -125,6 +125,11 @@ rw_lock_create_func(
lock->last_x_line = 0;
mutex_enter(&rw_lock_list_mutex);
+
+ if (UT_LIST_GET_LEN(rw_lock_list) > 0) {
+ ut_a(UT_LIST_GET_FIRST(rw_lock_list)->magic_n
+ == RW_LOCK_MAGIC_N);
+ }
UT_LIST_ADD_FIRST(list, rw_lock_list, lock);
@@ -141,7 +146,7 @@ rw_lock_free(
/*=========*/
rw_lock_t* lock) /* in: rw-lock */
{
- ut_ad(rw_lock_validate(lock));
+ ut_a(rw_lock_validate(lock));
ut_a(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED);
ut_a(rw_lock_get_waiters(lock) == 0);
ut_a(rw_lock_get_reader_count(lock) == 0);
@@ -152,6 +157,13 @@ rw_lock_free(
mutex_enter(&rw_lock_list_mutex);
+ if (UT_LIST_GET_PREV(list, lock)) {
+ ut_a(UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N);
+ }
+ if (UT_LIST_GET_NEXT(list, lock)) {
+ ut_a(UT_LIST_GET_NEXT(list, lock)->magic_n == RW_LOCK_MAGIC_N);
+ }
+
UT_LIST_REMOVE(list, rw_lock_list, lock);
mutex_exit(&rw_lock_list_mutex);
@@ -227,8 +239,8 @@ lock_loop:
if (srv_print_latch_waits) {
printf(
"Thread %lu spin wait rw-s-lock at %lx cfile %s cline %lu rnds %lu\n",
- os_thread_pf(os_thread_get_curr_id()), (ulint)lock,
- lock->cfile_name, lock->cline, i);
+ (ulong) os_thread_pf(os_thread_get_curr_id()), (ulong) lock,
+ lock->cfile_name, (ulong) lock->cline, (ulong) i);
}
mutex_enter(rw_lock_get_mutex(lock));
@@ -257,8 +269,8 @@ lock_loop:
if (srv_print_latch_waits) {
printf(
"Thread %lu OS wait rw-s-lock at %lx cfile %s cline %lu\n",
- os_thread_pf(os_thread_get_curr_id()), (ulint)lock,
- lock->cfile_name, lock->cline);
+ (ulong) os_thread_pf(os_thread_get_curr_id()),
+ (ulong) lock, lock->cfile_name, (ulong) lock->cline);
}
rw_s_system_call_count++;
@@ -476,8 +488,8 @@ lock_loop:
if (srv_print_latch_waits) {
printf(
"Thread %lu spin wait rw-x-lock at %lx cfile %s cline %lu rnds %lu\n",
- os_thread_pf(os_thread_get_curr_id()), (ulint)lock,
- lock->cfile_name, lock->cline, i);
+ (ulong) os_thread_pf(os_thread_get_curr_id()), (ulong) lock,
+ lock->cfile_name, (ulong) lock->cline, (ulong) i);
}
rw_x_spin_wait_count++;
@@ -509,8 +521,8 @@ lock_loop:
if (srv_print_latch_waits) {
printf(
"Thread %lu OS wait for rw-x-lock at %lx cfile %s cline %lu\n",
- os_thread_pf(os_thread_get_curr_id()), (ulint)lock,
- lock->cfile_name, lock->cline);
+ (ulong) os_thread_pf(os_thread_get_curr_id()), (ulong) lock,
+ lock->cfile_name, (ulong) lock->cline);
}
rw_x_system_call_count++;
@@ -836,7 +848,8 @@ rw_lock_debug_print(
rwt = info->lock_type;
printf("Locked: thread %ld file %s line %ld ",
- os_thread_pf(info->thread_id), info->file_name, info->line);
+ (ulong) os_thread_pf(info->thread_id), info->file_name,
+ (ulong) info->line);
if (rwt == RW_LOCK_SHARED) {
printf("S-LOCK");
} else if (rwt == RW_LOCK_EX) {
@@ -847,7 +860,7 @@ rw_lock_debug_print(
ut_error;
}
if (info->pass != 0) {
- printf(" pass value %lu", info->pass);
+ printf(" pass value %lu", (ulong) info->pass);
}
printf("\n");
}
diff --git a/innobase/sync/sync0sync.c b/innobase/sync/sync0sync.c
index 4f5d27bcc7c..6ad766d8bc8 100644
--- a/innobase/sync/sync0sync.c
+++ b/innobase/sync/sync0sync.c
@@ -208,6 +208,10 @@ mutex_create_func(
mutex_enter(&mutex_list_mutex);
+ if (UT_LIST_GET_LEN(mutex_list) > 0) {
+ ut_a(UT_LIST_GET_FIRST(mutex_list)->magic_n == MUTEX_MAGIC_N);
+ }
+
UT_LIST_ADD_FIRST(list, mutex_list, mutex);
mutex_exit(&mutex_list_mutex);
@@ -223,7 +227,7 @@ mutex_free(
/*=======*/
mutex_t* mutex) /* in: mutex */
{
- ut_ad(mutex_validate(mutex));
+ ut_a(mutex_validate(mutex));
ut_a(mutex_get_lock_word(mutex) == 0);
ut_a(mutex_get_waiters(mutex) == 0);
@@ -231,6 +235,15 @@ mutex_free(
mutex_enter(&mutex_list_mutex);
+ if (UT_LIST_GET_PREV(list, mutex)) {
+ ut_a(UT_LIST_GET_PREV(list, mutex)->magic_n
+ == MUTEX_MAGIC_N);
+ }
+ if (UT_LIST_GET_NEXT(list, mutex)) {
+ ut_a(UT_LIST_GET_NEXT(list, mutex)->magic_n
+ == MUTEX_MAGIC_N);
+ }
+
UT_LIST_REMOVE(list, mutex_list, mutex);
mutex_exit(&mutex_list_mutex);
@@ -354,8 +367,8 @@ spin_loop:
if (srv_print_latch_waits) {
printf(
"Thread %lu spin wait mutex at %lx cfile %s cline %lu rnds %lu\n",
- os_thread_pf(os_thread_get_curr_id()), (ulint)mutex,
- mutex->cfile_name, mutex->cline, i);
+ (ulong) os_thread_pf(os_thread_get_curr_id()), (ulong) mutex,
+ mutex->cfile_name, (ulong) mutex->cline, (ulong) i);
}
mutex_spin_round_count += i;
@@ -414,7 +427,8 @@ spin_loop:
if (srv_print_latch_waits) {
printf(
"Thread %lu spin wait succeeds at 2: mutex at %lx\n",
- os_thread_pf(os_thread_get_curr_id()), (ulint)mutex);
+ (ulong) os_thread_pf(os_thread_get_curr_id()),
+ (ulong) mutex);
}
return;
@@ -432,8 +446,8 @@ spin_loop:
if (srv_print_latch_waits) {
printf(
"Thread %lu OS wait mutex at %lx cfile %s cline %lu rnds %lu\n",
- os_thread_pf(os_thread_get_curr_id()), (ulint)mutex,
- mutex->cfile_name, mutex->cline, i);
+ (ulong) os_thread_pf(os_thread_get_curr_id()), (ulong) mutex,
+ mutex->cfile_name, (ulong) mutex->cline, (ulong) i);
}
mutex_system_call_count++;
@@ -735,11 +749,11 @@ sync_thread_levels_g(
printf(
"InnoDB error: sync levels should be > %lu but a level is %lu\n",
- limit, slot->level);
+ (ulong) limit, (ulong) slot->level);
if (mutex->magic_n == MUTEX_MAGIC_N) {
printf("Mutex created at %s %lu\n", mutex->cfile_name,
- mutex->cline);
+ (ulong) mutex->cline);
if (mutex_get_lock_word(mutex) != 0) {
#ifdef UNIV_SYNC_DEBUG
@@ -752,7 +766,7 @@ sync_thread_levels_g(
fprintf(stderr,
"InnoDB: Locked mutex: addr %p thread %ld file %s line %ld\n",
- mutex, os_thread_pf(thread_id), file_name, line);
+ mutex, (ulong) os_thread_pf(thread_id), file_name, (ulong) line);
#else /* UNIV_SYNC_DEBUG */
fprintf(stderr,
"InnoDB: Locked mutex: addr %p\n", mutex);
@@ -938,7 +952,7 @@ sync_thread_add_level(
}
array = thread_slot->levels;
-
+
/* NOTE that there is a problem with _NODE and _LEAF levels: if the
B-tree height changes, then a leaf can change to an internal node
or the other way around. We do not know at present if this can cause
@@ -1226,10 +1240,13 @@ sync_print_wait_info(
sprintf(buf,
"Mutex spin waits %lu, rounds %lu, OS waits %lu\n"
"RW-shared spins %lu, OS waits %lu; RW-excl spins %lu, OS waits %lu\n",
- mutex_spin_wait_count, mutex_spin_round_count,
- mutex_os_wait_count,
- rw_s_spin_wait_count, rw_s_os_wait_count,
- rw_x_spin_wait_count, rw_x_os_wait_count);
+ (ulong) mutex_spin_wait_count,
+ (ulong) mutex_spin_round_count,
+ (ulong) mutex_os_wait_count,
+ (ulong) rw_s_spin_wait_count,
+ (ulong) rw_s_os_wait_count,
+ (ulong) rw_x_spin_wait_count,
+ (ulong) rw_x_os_wait_count);
}
/***********************************************************************
diff --git a/innobase/trx/trx0purge.c b/innobase/trx/trx0purge.c
index 6a509b163b3..558a0825fd7 100644
--- a/innobase/trx/trx0purge.c
+++ b/innobase/trx/trx0purge.c
@@ -277,7 +277,7 @@ trx_purge_add_update_undo_to_history(
if (undo->id >= TRX_RSEG_N_SLOTS) {
fprintf(stderr,
- "InnoDB: Error: undo->id is %lu\n", undo->id);
+ "InnoDB: Error: undo->id is %lu\n", (ulong) undo->id);
ut_error;
}
@@ -920,7 +920,7 @@ trx_purge_fetch_next_rec(
if (srv_print_thread_releases) {
printf(
"Purge: No logs left in the history list; pages handled %lu\n",
- purge_sys->n_pages_handled);
+ (ulong) purge_sys->n_pages_handled);
}
mutex_exit(&(purge_sys->mutex));
@@ -1072,7 +1072,8 @@ trx_purge(void)
if (srv_print_thread_releases) {
printf(
- "Purge ends; pages handled %lu\n", purge_sys->n_pages_handled);
+ "Purge ends; pages handled %lu\n",
+ (ulong) purge_sys->n_pages_handled);
}
return(purge_sys->n_pages_handled - old_pages_handled);
@@ -1089,16 +1090,16 @@ trx_purge_sys_print(void)
read_view_print(purge_sys->view);
fprintf(stderr, "InnoDB: Purge trx n:o %lu %lu, undo n_o %lu %lu\n",
- ut_dulint_get_high(purge_sys->purge_trx_no),
- ut_dulint_get_low(purge_sys->purge_trx_no),
- ut_dulint_get_high(purge_sys->purge_undo_no),
- ut_dulint_get_low(purge_sys->purge_undo_no));
+ (ulong) ut_dulint_get_high(purge_sys->purge_trx_no),
+ (ulong) ut_dulint_get_low(purge_sys->purge_trx_no),
+ (ulong) ut_dulint_get_high(purge_sys->purge_undo_no),
+ (ulong) ut_dulint_get_low(purge_sys->purge_undo_no));
fprintf(stderr,
"InnoDB: Purge next stored %lu, page_no %lu, offset %lu,\n"
"InnoDB: Purge hdr_page_no %lu, hdr_offset %lu\n",
- purge_sys->next_stored,
- purge_sys->page_no,
- purge_sys->offset,
- purge_sys->hdr_page_no,
- purge_sys->hdr_offset);
+ (ulong) purge_sys->next_stored,
+ (ulong) purge_sys->page_no,
+ (ulong) purge_sys->offset,
+ (ulong) purge_sys->hdr_page_no,
+ (ulong) purge_sys->hdr_offset);
}
diff --git a/innobase/trx/trx0rec.c b/innobase/trx/trx0rec.c
index 9d944e16a1e..bd37a4b506b 100644
--- a/innobase/trx/trx0rec.c
+++ b/innobase/trx/trx0rec.c
@@ -825,15 +825,16 @@ trx_undo_update_rec_get_update(
fprintf(stderr,
"InnoDB: Error: trying to access update undo rec field %lu in table %s\n"
"InnoDB: index %s, but index has only %lu fields\n",
- field_no, index->table_name, index->name,
- dict_index_get_n_fields(index));
+ (ulong) field_no, index->table_name, index->name,
+ (ulong) dict_index_get_n_fields(index));
fprintf(stderr,
"InnoDB: Send a detailed bug report to mysql@lists.mysql.com");
fprintf(stderr,
"InnoDB: Run also CHECK TABLE on table %s\n", index->table_name);
fprintf(stderr,
- "InnoDB: n_fields = %lu, i = %lu, ptr %lx\n", n_fields, i, (ulint)ptr);
+ "InnoDB: n_fields = %lu, i = %lu, ptr %lx\n", (ulong) n_fields, (ulong) i,
+ (ulong) ptr);
return(NULL);
}
@@ -1336,17 +1337,18 @@ trx_undo_prev_version_build(
fprintf(stderr,
"InnoDB: Table name %s, index name %s, n_uniq %lu\n",
index->table_name, index->name,
- dict_index_get_n_unique(index));
+ (ulong) dict_index_get_n_unique(index));
fprintf(stderr,
"InnoDB: undo rec address %lx, type %lu cmpl_info %lu\n",
- (ulint)undo_rec, type, cmpl_info);
+ (ulong) undo_rec, (ulong) type,
+ (ulong) cmpl_info);
fprintf(stderr,
"InnoDB: undo rec table id %lu %lu, index table id %lu %lu\n",
- ut_dulint_get_high(table_id),
- ut_dulint_get_low(table_id),
- ut_dulint_get_high(index->table->id),
- ut_dulint_get_low(index->table->id));
+ (ulong) ut_dulint_get_high(table_id),
+ (ulong) ut_dulint_get_low(table_id),
+ (ulong) ut_dulint_get_high(index->table->id),
+ (ulong) ut_dulint_get_low(index->table->id));
ut_sprintf_buf(err_buf, undo_rec, 150);
@@ -1360,17 +1362,17 @@ trx_undo_prev_version_build(
fprintf(stderr,
"InnoDB: Record trx id %lu %lu, update rec trx id %lu %lu\n",
- ut_dulint_get_high(rec_trx_id),
- ut_dulint_get_low(rec_trx_id),
- ut_dulint_get_high(trx_id),
- ut_dulint_get_low(trx_id));
+ (ulong) ut_dulint_get_high(rec_trx_id),
+ (ulong) ut_dulint_get_low(rec_trx_id),
+ (ulong) ut_dulint_get_high(trx_id),
+ (ulong) ut_dulint_get_low(trx_id));
fprintf(stderr,
"InnoDB: Roll ptr in rec %lu %lu, in update rec %lu %lu\n",
- ut_dulint_get_high(old_roll_ptr),
- ut_dulint_get_low(old_roll_ptr),
- ut_dulint_get_high(roll_ptr),
- ut_dulint_get_low(roll_ptr));
+ (ulong) ut_dulint_get_high(old_roll_ptr),
+ (ulong) ut_dulint_get_low(old_roll_ptr),
+ (ulong) ut_dulint_get_high(roll_ptr),
+ (ulong) ut_dulint_get_low(roll_ptr));
trx_purge_sys_print();
diff --git a/innobase/trx/trx0roll.c b/innobase/trx/trx0roll.c
index c00c6f0c862..6a25304c7ef 100644
--- a/innobase/trx/trx0roll.c
+++ b/innobase/trx/trx0roll.c
@@ -392,8 +392,8 @@ loop:
if (trx->conc_state == TRX_COMMITTED_IN_MEMORY) {
fprintf(stderr, "InnoDB: Cleaning up trx with id %lu %lu\n",
- ut_dulint_get_high(trx->id),
- ut_dulint_get_low(trx->id));
+ (ulong) ut_dulint_get_high(trx->id),
+ (ulong) ut_dulint_get_low(trx->id));
trx_cleanup_at_db_startup(trx);
@@ -428,9 +428,9 @@ loop:
fprintf(stderr,
"InnoDB: Rolling back trx with id %lu %lu, %lu%s rows to undo",
- ut_dulint_get_high(trx->id),
- ut_dulint_get_low(trx->id),
- (ulint)rows_to_undo, unit);
+ (ulong) ut_dulint_get_high(trx->id),
+ (ulong) ut_dulint_get_low(trx->id),
+ (ulong) rows_to_undo, unit);
mutex_exit(&kernel_mutex);
if (trx->dict_operation) {
@@ -447,7 +447,7 @@ loop:
fprintf(stderr,
"InnoDB: Waiting for rollback of trx id %lu to end\n",
- ut_dulint_get_low(trx->id));
+ (ulong) ut_dulint_get_low(trx->id));
os_thread_sleep(100000);
mutex_enter(&kernel_mutex);
@@ -461,8 +461,8 @@ loop:
fprintf(stderr,
"InnoDB: Dropping table with id %lu %lu in recovery if it exists\n",
- ut_dulint_get_high(trx->table_id),
- ut_dulint_get_low(trx->table_id));
+ (ulong) ut_dulint_get_high(trx->table_id),
+ (ulong) ut_dulint_get_low(trx->table_id));
table = dict_table_get_on_id_low(trx->table_id, trx);
@@ -481,8 +481,8 @@ loop:
}
fprintf(stderr, "\nInnoDB: Rolling back of trx id %lu %lu completed\n",
- ut_dulint_get_high(trx->id),
- ut_dulint_get_low(trx->id));
+ (ulong) ut_dulint_get_high(trx->id),
+ (ulong) ut_dulint_get_low(trx->id));
mem_heap_free(heap);
goto loop;
@@ -855,10 +855,10 @@ try_again:
if (progress_pct != trx_roll_progress_printed_pct) {
if (trx_roll_progress_printed_pct == 0) {
fprintf(stderr,
- "\nInnoDB: Progress in percents: %lu", progress_pct);
+ "\nInnoDB: Progress in percents: %lu", (ulong) progress_pct);
} else {
fprintf(stderr,
- " %lu", progress_pct);
+ " %lu", (ulong) progress_pct);
}
fflush(stderr);
trx_roll_progress_printed_pct = progress_pct;
@@ -1142,7 +1142,7 @@ trx_finish_rollback_off_kernel(
if (lock_print_waits) {
printf("Trx %lu rollback finished\n",
- ut_dulint_get_low(trx->id));
+ (ulong) ut_dulint_get_low(trx->id));
}
trx_commit_off_kernel(trx);
diff --git a/innobase/trx/trx0sys.c b/innobase/trx/trx0sys.c
index d4c14a5509c..e7439935b7e 100644
--- a/innobase/trx/trx0sys.c
+++ b/innobase/trx/trx0sys.c
@@ -26,6 +26,17 @@ Created 3/26/1996 Heikki Tuuri
trx_sys_t* trx_sys = NULL;
trx_doublewrite_t* trx_doublewrite = NULL;
+/* The following is set to TRUE when we are upgrading from the old format data
+files to the new (>= 4.1.x) multiple tablespaces format data files */
+
+ibool trx_doublewrite_must_reset_space_ids = FALSE;
+
+/* The following is TRUE when we are using the database in the new format,
+i.e., we have successfully upgraded, or have created a new database
+installation */
+
+ibool trx_sys_multiple_tablespace_format = FALSE;
+
/* In a MySQL replication slave, in crash recovery we store the master log
file name and position here. We have successfully got the updates to InnoDB
up to this position. If .._pos is -1, it means no crash recovery was needed,
@@ -34,45 +45,6 @@ or there was no master log position info inside InnoDB. */
char trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
ib_longlong trx_sys_mysql_master_log_pos = -1;
-/* Do NOT merge this to the 4.1 code base! */
-ibool trx_sys_downgrading_from_4_1_1 = FALSE;
-
-/********************************************************************
-Do NOT merge this to the 4.1 code base!
-Marks the trx sys header when we have successfully downgraded from the >= 4.1.1
-multiple tablespace format back to the 4.0 format. */
-
-void
-trx_sys_mark_downgraded_from_4_1_1(void)
-/*====================================*/
-{
- page_t* page;
- byte* doublewrite;
- mtr_t mtr;
-
- /* Let us mark to the trx_sys header that the downgrade has been
- done. */
-
- mtr_start(&mtr);
-
- page = buf_page_get(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
-#ifdef UNIV_SYNC_DEBUG
- buf_page_dbg_add_level(page, SYNC_NO_ORDER_CHECK);
-#endif /* UNIV_SYNC_DEBUG */
-
- doublewrite = page + TRX_SYS_DOUBLEWRITE;
-
- mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
- TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N + 1,
- MLOG_4BYTES, &mtr);
- mtr_commit(&mtr);
-
- /* Flush the modified pages to disk and make a checkpoint */
- log_make_checkpoint_at(ut_dulint_max, TRUE);
-
- trx_sys_downgrading_from_4_1_1 = FALSE;
-}
-
/********************************************************************
Determines if a page number is located inside the doublewrite buffer. */
@@ -114,11 +86,11 @@ trx_doublewrite_init(
{
trx_doublewrite = mem_alloc(sizeof(trx_doublewrite_t));
- /* When we have the doublewrite buffer in use, we do not need to
- call os_file_flush (Unix fsync) after every write. */
-
+ /* Since we now start to use the doublewrite buffer, no need to call
+ fsync() after every write to a data file */
+
os_do_not_call_flush_at_each_write = TRUE;
-
+
mutex_create(&(trx_doublewrite->mutex));
mutex_set_level(&(trx_doublewrite->mutex), SYNC_DOUBLEWRITE);
@@ -144,7 +116,41 @@ trx_doublewrite_init(
}
/********************************************************************
-Creates the doublewrite buffer at a database start. The header of the
+Records in the trx sys header that the upgrade to the >= 4.1.x multiple
+tablespace format has been completed successfully. */
+
+void
+trx_sys_mark_upgraded_to_multiple_tablespaces(void)
+/*===============================================*/
+{
+ mtr_t mtr;
+ page_t* sys_page;
+ byte* dblwr_hdr;
+
+ /* The upgrade to 4.1.x has reset the space id fields in the
+ doublewrite buffer. Write the marker value to the trx_sys header to
+ tell that the upgrade has been done. */
+
+ mtr_start(&mtr);
+
+ sys_page = buf_page_get(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
+ buf_page_dbg_add_level(sys_page, SYNC_NO_ORDER_CHECK);
+
+ dblwr_hdr = sys_page + TRX_SYS_DOUBLEWRITE;
+
+ mlog_write_ulint(dblwr_hdr + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+ MLOG_4BYTES, &mtr);
+ mtr_commit(&mtr);
+
+ /* Make a checkpoint, flushing the modified pages to disk */
+ log_make_checkpoint_at(ut_dulint_max, TRUE);
+
+ trx_sys_multiple_tablespace_format = TRUE;
+}
+
+/********************************************************************
+Creates the doublewrite buffer in a new InnoDB installation. The header of the
doublewrite buffer is placed on the trx system header page. */
void
@@ -179,31 +185,6 @@ start_again:
if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
== TRX_SYS_DOUBLEWRITE_MAGIC_N) {
- /* Do NOT merge to the 4.1 code base! */
- if (mach_read_from_4(doublewrite
- + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
- == TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
-
- fprintf(stderr,
-"InnoDB: You are downgrading from the multiple tablespace format of\n"
-"InnoDB: >= MySQL-4.1.1 back to the old format of MySQL-4.0.\n"
-"InnoDB:\n"
-"InnoDB: MAKE SURE that the mysqld server is idle, and purge and the insert\n"
-"InnoDB: buffer merge have run to completion under >= 4.1.1 before trying to\n"
-"InnoDB: downgrade! You can determine this by looking at SHOW INNODB STATUS:\n"
-"InnoDB: if the Main thread is 'waiting for server activity' and SHOW\n"
-"InnoDB: PROCESSLIST shows that you have ended all other connections\n"
-"InnoDB: to mysqld, then purge and the insert buffer merge have been\n"
-"InnoDB: completed.\n"
-"InnoDB: If you have already created tables in >= 4.1.1, then those\n"
-"InnoDB: tables cannot be used under 4.0.\n"
-"InnoDB: NOTE THAT this downgrade procedure has not been properly tested!\n"
-"InnoDB: The safe way to downgrade is to dump all InnoDB tables and recreate\n"
-"InnoDB: the whole tablespace.\n");
-
- trx_sys_downgrading_from_4_1_1 = TRUE;
- }
-
/* The doublewrite buffer has already been created:
just read in some numbers */
@@ -313,10 +294,15 @@ start_again:
}
mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
- TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr);
+ TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr);
mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
+ TRX_SYS_DOUBLEWRITE_REPEAT,
- TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr);
+ TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr);
+
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+ MLOG_4BYTES, &mtr);
mtr_commit(&mtr);
/* Flush the modified pages to disk and make a checkpoint */
@@ -324,23 +310,31 @@ start_again:
fprintf(stderr, "InnoDB: Doublewrite buffer created\n");
+ trx_sys_multiple_tablespace_format = TRUE;
+
goto start_again;
}
}
/********************************************************************
-At a database startup uses a possible doublewrite buffer to restore
+At a database startup initializes the doublewrite buffer memory structure if
+we already have a doublewrite buffer created in the data files. If we are
+upgrading to an InnoDB version which supports multiple tablespaces, then this
+function performs the necessary update operations. If we are in a crash
+recovery, this function uses a possible doublewrite buffer to restore
half-written pages in the data files. */
void
-trx_sys_doublewrite_restore_corrupt_pages(void)
-/*===========================================*/
+trx_sys_doublewrite_init_or_restore_pages(
+/*======================================*/
+ ibool restore_corrupt_pages)
{
byte* buf;
byte* read_buf;
byte* unaligned_read_buf;
ulint block1;
ulint block2;
+ ulint source_page_no;
byte* page;
byte* doublewrite;
ulint space_id;
@@ -352,43 +346,17 @@ trx_sys_doublewrite_restore_corrupt_pages(void)
unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
read_buf = ut_align(unaligned_read_buf, UNIV_PAGE_SIZE);
- /* Read the trx sys header to check if we are using the
- doublewrite buffer */
+ /* Read the trx sys header to check if we are using the doublewrite
+ buffer */
fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, TRX_SYS_PAGE_NO, 0,
UNIV_PAGE_SIZE, read_buf, NULL);
-
doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
== TRX_SYS_DOUBLEWRITE_MAGIC_N) {
/* The doublewrite buffer has been created */
- /* Do NOT merge to the 4.1 code base! */
- if (mach_read_from_4(doublewrite
- + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
- == TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
-
- fprintf(stderr,
-"InnoDB: You are downgrading from the multiple tablespace format of\n"
-"InnoDB: >= MySQL-4.1.1 back to the old format of MySQL-4.0.\n"
-"InnoDB:\n"
-"InnoDB: MAKE SURE that the mysqld server is idle, and purge and the insert\n"
-"InnoDB: buffer merge have run to completion under >= 4.1.1 before trying to\n"
-"InnoDB: downgrade! You can determine this by looking at SHOW INNODB STATUS:\n"
-"InnoDB: if the Main thread is 'waiting for server activity' and SHOW\n"
-"InnoDB: PROCESSLIST shows that you have ended all other connections\n"
-"InnoDB: to mysqld, then purge and the insert buffer merge have been\n"
-"InnoDB: completed.\n"
-"InnoDB: If you have already created tables in >= 4.1.1, then those\n"
-"InnoDB: tables cannot be used under 4.0.\n"
-"InnoDB: NOTE THAT this downgrade procedure has not been properly tested!\n"
-"InnoDB: The safe way to downgrade is to dump all InnoDB tables and recreate\n"
-"InnoDB: the whole tablespace.\n");
-
- trx_sys_downgrading_from_4_1_1 = TRUE;
- }
-
trx_doublewrite_init(doublewrite);
block1 = trx_doublewrite->block1;
@@ -399,6 +367,23 @@ trx_sys_doublewrite_restore_corrupt_pages(void)
goto leave_func;
}
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
+ != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
+
+ /* We are upgrading from a version < 4.1.x to a version where
+ multiple tablespaces are supported. We must reset the space id
+ field in the pages in the doublewrite buffer because starting
+ from this version the space id is stored to
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
+
+ trx_doublewrite_must_reset_space_ids = TRUE;
+
+ fprintf(stderr,
+"InnoDB: Resetting space id's in the doublewrite buffer\n");
+ } else {
+ trx_sys_multiple_tablespace_format = TRUE;
+ }
+
/* Read the pages from the doublewrite buffer to memory */
fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, block1, 0,
@@ -416,13 +401,46 @@ trx_sys_doublewrite_restore_corrupt_pages(void)
for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
- space_id = 0;
- if (!fil_check_adress_in_tablespace(space_id, page_no)) {
+ if (trx_doublewrite_must_reset_space_ids) {
+
+ space_id = 0;
+ mach_write_to_4(page
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0);
+ /* We do not need to calculate new checksums for the
+ pages because the field .._SPACE_ID does not affect
+ them. Write the page back to where we read it from. */
+
+ if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ source_page_no = block1 + i;
+ } else {
+ source_page_no = block2
+ + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ }
+
+ fil_io(OS_FILE_WRITE, TRUE, 0, source_page_no, 0,
+ UNIV_PAGE_SIZE, page, NULL);
+ /* printf("Resetting space id in page %lu\n",
+ source_page_no); */
+ } else {
+ space_id = mach_read_from_4(
+ page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ }
+
+ if (!restore_corrupt_pages) {
+ /* The database was shut down gracefully: no need to
+ restore pages */
+
+ } else if (!fil_tablespace_exists_in_mem(space_id)) {
+ /* Maybe we have dropped the single-table tablespace
+ and this page once belonged to it: do nothing */
+
+ } else if (!fil_check_adress_in_tablespace(space_id,
+ page_no)) {
fprintf(stderr,
- "InnoDB: Warning: an inconsistent page in the doublewrite buffer\n"
- "InnoDB: space id %lu page number %lu, %lu'th page in dblwr buf.\n",
- space_id, page_no, i);
+"InnoDB: Warning: a page in the doublewrite buffer is not within space\n"
+"InnoDB: bounds; space id %lu page number %lu, page %lu in doublewrite buf.\n",
+ (ulong) space_id, (ulong) page_no, (ulong) i);
} else if (space_id == TRX_SYS_SPACE
&& ( (page_no >= block1
@@ -445,7 +463,7 @@ trx_sys_doublewrite_restore_corrupt_pages(void)
fprintf(stderr,
"InnoDB: Warning: database page corruption or a failed\n"
- "InnoDB: file read of page %lu.\n", page_no);
+ "InnoDB: file read of page %lu.\n", (ulong) page_no);
fprintf(stderr,
"InnoDB: Trying to recover it from the doublewrite buffer.\n");
@@ -579,8 +597,8 @@ trx_sys_update_mysql_binlog_offset(
if (0 != ut_memcmp(sys_header + field + TRX_SYS_MYSQL_LOG_NAME,
file_name, 1 + ut_strlen(file_name))) {
- mlog_write_string((byte*) (sys_header + field
- + TRX_SYS_MYSQL_LOG_NAME),
+ mlog_write_string(sys_header + field
+ + TRX_SYS_MYSQL_LOG_NAME,
(byte*) file_name, 1 + ut_strlen(file_name), mtr);
}
@@ -596,8 +614,8 @@ trx_sys_update_mysql_binlog_offset(
mlog_write_ulint(sys_header + field
+ TRX_SYS_MYSQL_LOG_OFFSET_LOW,
- (ulint)(offset & 0xFFFFFFFF),
- MLOG_4BYTES, mtr);
+ (ulint)(offset & 0xFFFFFFFFUL),
+ MLOG_4BYTES, mtr);
}
/*********************************************************************
@@ -620,9 +638,9 @@ trx_sys_print_mysql_binlog_offset_from_page(
printf(
"ibbackup: Last MySQL binlog file position %lu %lu, file name %s\n",
- mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+ (ulong) mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
- mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+ (ulong) mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+ TRX_SYS_MYSQL_LOG_OFFSET_LOW),
sys_header + TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_NAME);
}
@@ -654,9 +672,9 @@ trx_sys_print_mysql_binlog_offset(void)
fprintf(stderr,
"InnoDB: Last MySQL binlog file position %lu %lu, file name %s\n",
- mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+ (ulong) mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
- mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+ (ulong) mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+ TRX_SYS_MYSQL_LOG_OFFSET_LOW),
sys_header + TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_NAME);
@@ -690,9 +708,9 @@ trx_sys_print_mysql_master_log_pos(void)
fprintf(stderr,
"InnoDB: In a MySQL replication slave the last master binlog file\n"
"InnoDB: position %lu %lu, file name %s\n",
- mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ (ulong) mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
- mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ (ulong) mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ TRX_SYS_MYSQL_LOG_OFFSET_LOW),
sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ TRX_SYS_MYSQL_LOG_NAME);
@@ -867,12 +885,12 @@ trx_sys_init_at_db_start(void)
fprintf(stderr,
"InnoDB: %lu transaction(s) which must be rolled back or cleaned up\n"
"InnoDB: in total %lu%s row operations to undo\n",
- UT_LIST_GET_LEN(trx_sys->trx_list),
- (ulint)rows_to_undo, unit);
+ (ulong) UT_LIST_GET_LEN(trx_sys->trx_list),
+ (ulong) rows_to_undo, unit);
fprintf(stderr, "InnoDB: Trx id counter is %lu %lu\n",
- ut_dulint_get_high(trx_sys->max_trx_id),
- ut_dulint_get_low(trx_sys->max_trx_id));
+ (ulong) ut_dulint_get_high(trx_sys->max_trx_id),
+ (ulong) ut_dulint_get_low(trx_sys->max_trx_id));
}
UT_LIST_INIT(trx_sys->view_list);
diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c
index 9b6e6904537..b509d80e452 100644
--- a/innobase/trx/trx0trx.c
+++ b/innobase/trx/trx0trx.c
@@ -107,7 +107,7 @@ trx_create(
trx->mysql_log_file_name = NULL;
trx->mysql_log_offset = 0;
- trx->mysql_master_log_file_name = (char*) "";
+ trx->mysql_master_log_file_name = (char*)"";
trx->mysql_master_log_pos = 0;
mutex_create(&(trx->undo_mutex));
@@ -1573,26 +1573,26 @@ trx_print(
char* start_of_line;
buf += sprintf(buf, "TRANSACTION %lu %lu",
- ut_dulint_get_high(trx->id),
- ut_dulint_get_low(trx->id));
+ (ulong) ut_dulint_get_high(trx->id),
+ (ulong) ut_dulint_get_low(trx->id));
switch (trx->conc_state) {
case TRX_NOT_STARTED: buf += sprintf(buf,
", not started"); break;
case TRX_ACTIVE: buf += sprintf(buf,
", ACTIVE %lu sec",
- (ulint)difftime(time(NULL), trx->start_time)); break;
+ (ulong) difftime(time(NULL), trx->start_time)); break;
case TRX_COMMITTED_IN_MEMORY: buf += sprintf(buf,
", COMMITTED IN MEMORY");
break;
- default: buf += sprintf(buf, " state %lu", trx->conc_state);
+ default: buf += sprintf(buf, " state %lu", (ulong) trx->conc_state);
}
#ifdef UNIV_LINUX
buf += sprintf(buf, ", process no %lu", trx->mysql_process_no);
#endif
buf += sprintf(buf, ", OS thread id %lu",
- os_thread_pf(trx->mysql_thread_id));
+ (ulong) os_thread_pf(trx->mysql_thread_id));
if (ut_strlen(trx->op_info) > 0) {
buf += sprintf(buf, " %s", trx->op_info);
@@ -1604,18 +1604,18 @@ trx_print(
if (trx->declared_to_be_inside_innodb) {
buf += sprintf(buf, ", thread declared inside InnoDB %lu",
- trx->n_tickets_to_enter_innodb);
+ (ulong) trx->n_tickets_to_enter_innodb);
}
buf += sprintf(buf, "\n");
+
+ if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
- if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
+ buf += sprintf(buf, "mysql tables in use %lu, locked %lu\n",
+ (ulong) trx->n_mysql_tables_in_use,
+ (ulong) trx->mysql_n_tables_locked);
+ }
- buf += sprintf(buf, "mysql tables in use %lu, locked %lu\n",
- trx->n_mysql_tables_in_use,
- trx->mysql_n_tables_locked);
- }
-
start_of_line = buf;
switch (trx->que_state) {
@@ -1626,7 +1626,7 @@ trx_print(
"ROLLING BACK "); break;
case TRX_QUE_COMMITTING: buf += sprintf(buf,
"COMMITTING "); break;
- default: buf += sprintf(buf, "que state %lu", trx->que_state);
+ default: buf += sprintf(buf, "que state %lu", (ulong) trx->que_state);
}
if (0 < UT_LIST_GET_LEN(trx->trx_locks) ||
@@ -1634,8 +1634,8 @@ trx_print(
buf += sprintf(buf,
"%lu lock struct(s), heap size %lu",
- UT_LIST_GET_LEN(trx->trx_locks),
- mem_heap_get_size(trx->lock_heap));
+ (ulong) UT_LIST_GET_LEN(trx->trx_locks),
+ (ulong) mem_heap_get_size(trx->lock_heap));
}
if (trx->has_search_latch) {
@@ -1644,7 +1644,7 @@ trx_print(
if (ut_dulint_cmp(trx->undo_no, ut_dulint_zero) != 0) {
buf += sprintf(buf, ", undo log entries %lu",
- ut_dulint_get_low(trx->undo_no));
+ (ulong) ut_dulint_get_low(trx->undo_no));
}
if (buf != start_of_line) {
diff --git a/innobase/trx/trx0undo.c b/innobase/trx/trx0undo.c
index de3da382e83..f2b7227d84a 100644
--- a/innobase/trx/trx0undo.c
+++ b/innobase/trx/trx0undo.c
@@ -387,6 +387,7 @@ trx_undo_seg_create(
page_t* undo_page;
trx_upagef_t* page_hdr;
trx_usegf_t* seg_hdr;
+ ulint n_reserved;
ibool success;
ut_ad(mtr && id && rseg_hdr);
@@ -413,8 +414,8 @@ trx_undo_seg_create(
space = buf_frame_get_space_id(rseg_hdr);
- success = fsp_reserve_free_extents(space, 2, FSP_UNDO, mtr);
-
+ success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO,
+ mtr);
if (!success) {
return(NULL);
@@ -424,7 +425,7 @@ trx_undo_seg_create(
undo_page = fseg_create_general(space, 0,
TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, TRUE, mtr);
- fil_space_release_free_extents(space, 2);
+ fil_space_release_free_extents(space, n_reserved);
if (undo_page == NULL) {
/* No space left */
@@ -737,6 +738,7 @@ trx_undo_add_page(
page_t* new_page;
trx_rseg_t* rseg;
ulint page_no;
+ ulint n_reserved;
ibool success;
#ifdef UNIV_SYNC_DEBUG
@@ -754,8 +756,8 @@ trx_undo_add_page(
header_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr);
- success = fsp_reserve_free_extents(undo->space, 1, FSP_UNDO, mtr);
-
+ success = fsp_reserve_free_extents(&n_reserved, undo->space, 1,
+ FSP_UNDO, mtr);
if (!success) {
return(FIL_NULL);
@@ -766,7 +768,7 @@ trx_undo_add_page(
undo->top_page_no + 1, FSP_UP,
TRUE, mtr);
- fil_space_release_free_extents(undo->space, 1);
+ fil_space_release_free_extents(undo->space, n_reserved);
if (page_no == FIL_NULL) {
@@ -1127,7 +1129,7 @@ trx_undo_mem_create_at_db_start(
if (id >= TRX_RSEG_N_SLOTS) {
fprintf(stderr,
- "InnoDB: Error: undo->id is %lu\n", id);
+ "InnoDB: Error: undo->id is %lu\n", (ulong) id);
ut_error;
}
@@ -1285,7 +1287,7 @@ trx_undo_mem_create(
if (id >= TRX_RSEG_N_SLOTS) {
fprintf(stderr,
- "InnoDB: Error: undo->id is %lu\n", id);
+ "InnoDB: Error: undo->id is %lu\n", (ulong) id);
ut_error;
}
@@ -1330,7 +1332,8 @@ trx_undo_mem_init_for_reuse(
#endif /* UNIV_SYNC_DEBUG */
if (undo->id >= TRX_RSEG_N_SLOTS) {
- fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", undo->id);
+ fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
mem_analyze_corruption((byte*)undo);
ut_error;
@@ -1356,7 +1359,7 @@ trx_undo_mem_free(
{
if (undo->id >= TRX_RSEG_N_SLOTS) {
fprintf(stderr,
- "InnoDB: Error: undo->id is %lu\n", undo->id);
+ "InnoDB: Error: undo->id is %lu\n", (ulong) undo->id);
ut_error;
}
@@ -1466,7 +1469,8 @@ trx_undo_reuse_cached(
ut_ad(undo->size == 1);
if (undo->id >= TRX_RSEG_N_SLOTS) {
- fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", undo->id);
+ fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
mem_analyze_corruption((byte*)undo);
ut_error;
}
@@ -1602,7 +1606,8 @@ trx_undo_set_state_at_finish(
ut_ad(trx && undo && mtr);
if (undo->id >= TRX_RSEG_N_SLOTS) {
- fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", undo->id);
+ fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
mem_analyze_corruption((byte*)undo);
ut_error;
}
diff --git a/innobase/ut/ut0byte.c b/innobase/ut/ut0byte.c
index 02bdf2065ee..74198419560 100644
--- a/innobase/ut/ut0byte.c
+++ b/innobase/ut/ut0byte.c
@@ -18,7 +18,7 @@ Created 5/11/1994 Heikki Tuuri
dulint ut_dulint_zero = {0, 0};
/* Maximum value for a dulint */
-dulint ut_dulint_max = {0xFFFFFFFF, 0xFFFFFFFF};
+dulint ut_dulint_max = {0xFFFFFFFFUL, 0xFFFFFFFFUL};
/****************************************************************
Sort function for dulint arrays. */
diff --git a/innobase/ut/ut0mem.c b/innobase/ut/ut0mem.c
index eca738f0924..65229335a09 100644
--- a/innobase/ut/ut0mem.c
+++ b/innobase/ut/ut0mem.c
@@ -77,8 +77,9 @@ ut_malloc_low(
ret = malloc(n + sizeof(ut_mem_block_t));
if (ret == NULL) {
+ ut_print_timestamp(stderr);
fprintf(stderr,
- "InnoDB: Fatal error: cannot allocate %lu bytes of\n"
+ " InnoDB: Fatal error: cannot allocate %lu bytes of\n"
"InnoDB: memory with malloc! Total allocated memory\n"
"InnoDB: by InnoDB %lu bytes. Operating system errno: %lu\n"
"InnoDB: Cannot continue operation!\n"
@@ -88,11 +89,11 @@ ut_malloc_low(
"InnoDB: a big enough maximum process size.\n"
"InnoDB: We now intentionally generate a seg fault so that\n"
"InnoDB: on Linux we get a stack trace.\n",
- n, ut_total_allocated_memory,
+ (ulong) n, (ulong) ut_total_allocated_memory,
#ifdef __WIN__
- (ulint)GetLastError()
+ (ulong) GetLastError()
#else
- (ulint)errno
+ (ulong) errno
#endif
);
@@ -106,7 +107,7 @@ ut_malloc_low(
/* Make an intentional seg fault so that we get a stack
trace */
- printf("%lu\n", *ut_mem_null_ptr);
+ printf("%lu\n", (ulong) *ut_mem_null_ptr);
}
if (set_to_zero) {
@@ -141,6 +142,42 @@ ut_malloc(
}
/**************************************************************************
+Tests if malloc of n bytes would succeed. ut_malloc() asserts if memory runs
+out, so it cannot be used where we want to return an error to the caller
+instead. If the allocation fails, prints an error message to stderr. The
+test block is freed again before returning. */
+
+ibool
+ut_test_malloc(
+/*===========*/
+			/* out: TRUE if succeeded */
+	ulint	n)	/* in: try to allocate this many bytes */
+{
+	void*	ret;
+	int	saved_errno;
+
+	ret = malloc(n);
+
+	if (ret == NULL) {
+		/* Save errno at once: the timestamp and message printing
+		below call library functions which may overwrite it */
+		saved_errno = errno;
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+		" InnoDB: Error: cannot allocate %lu bytes of memory for\n"
+		"InnoDB: a BLOB with malloc! Total allocated memory\n"
+		"InnoDB: by InnoDB %lu bytes. Operating system errno: %d\n"
+		"InnoDB: Check if you should increase the swap file or\n"
+		"InnoDB: ulimits of your operating system.\n"
+		"InnoDB: On FreeBSD check you have compiled the OS with\n"
+		"InnoDB: a big enough maximum process size.\n",
+			(ulong) n,
+			(ulong) ut_total_allocated_memory,
+			saved_errno);
+
+		return(FALSE);
+	}
+
+	free(ret);
+
+	return(TRUE);
+}
+
+/**************************************************************************
Frees a memory block allocated with ut_malloc. */
void
@@ -190,7 +227,7 @@ ut_free_all_mem(void)
if (ut_total_allocated_memory != 0) {
fprintf(stderr,
"InnoDB: Warning: after shutdown total allocated memory is %lu\n",
- ut_total_allocated_memory);
+ (ulong) ut_total_allocated_memory);
}
}
diff --git a/innobase/ut/ut0rnd.c b/innobase/ut/ut0rnd.c
index 3335861384f..85d2e6094c3 100644
--- a/innobase/ut/ut0rnd.c
+++ b/innobase/ut/ut0rnd.c
@@ -71,9 +71,8 @@ ut_find_prime(
/* Found a prime */
break;
- next_n: ;
+next_n: ;
}
return(n);
}
-
diff --git a/innobase/ut/ut0ut.c b/innobase/ut/ut0ut.c
index be311764261..77f7a997777 100644
--- a/innobase/ut/ut0ut.c
+++ b/innobase/ut/ut0ut.c
@@ -20,26 +20,224 @@ Created 5/11/1994 Heikki Tuuri
ibool ut_always_false = FALSE;
/************************************************************
-Uses vsprintf to emulate sprintf so that the function always returns
-the printed length. Apparently in some old SCO Unixes sprintf did not
-return the printed length but a pointer to the end of the printed string. */
+Prints to stdout like printf(). On the 64-bit Windows we substitute the
+format string
+%l -> %I64
+because we define ulint as unsigned __int64 and lint as __int64 on Windows,
+and both the Microsoft and Intel C compilers require the format string
+%I64 in that case instead of %l. On other platforms the format string is
+copied unchanged. */
+
+int
+ut_printf(
+/*======*/
+				/* out: the number of characters written, or
+				negative in case of an error (including
+				running out of memory for the format copy) */
+	const char*	format,	/* in: format of prints */
+	...)			/* in: arguments to be printed */
+{
+	va_list		args;
+	size_t		len;
+	const char*	format_end;
+	char*		newformat;
+	const char*	ptr;
+	char*		newptr;
+	int		ret;
+	/* Room for a rewritten format of at most 250 chars: every "%l"
+	(2 chars) may grow to "%I64" (4 chars), plus the terminating NUL */
+	char		format_buf_in_stack[2 * 250 + 1];
+
+	len = strlen(format);
+
+	if (len > 250) {
+		newformat = malloc(2 * len + 1);
+
+		if (newformat == NULL) {
+			/* Nothing has been allocated or started yet */
+
+			return(-1);
+		}
+	} else {
+		newformat = format_buf_in_stack;
+	}
-ulint
+	format_end = format + len;
+
+	ptr = format;
+	newptr = newformat;
+
+#if defined(__WIN__) && (defined(WIN64) || defined(_WIN64))
+	/* Replace %l with %I64 if it is not preceded with '\'.
+	NOTE(review): only a literal "%l" is matched; a flag or width
+	between '%' and 'l' (e.g. "%5lu") is left as it is, and the 'l'
+	in "%%l" is rewritten too. */
+
+	while (ptr < format_end) {
+		/* Reading *(ptr + 1) is safe: at the last char it is the
+		terminating NUL of format */
+		if (*ptr == '%' && *(ptr + 1) == 'l'
+		    && (ptr == format || *(ptr - 1) != '\\')) {
+
+			memcpy(newptr, "%I64", 4);
+			ptr += 2;
+			newptr += 4;
+		} else {
+			*newptr = *ptr;
+			ptr++;
+			newptr++;
+		}
+	}
+
+	*newptr = '\0';
+
+	/* The NUL sits at offset <= 2 * len, inside the 2 * len + 1 buffer */
+	ut_a(newptr <= newformat + 2 * len);
+#else
+	strcpy(newformat, format);
+#endif
+	va_start(args, format);
+
+	ret = vprintf(newformat, args);
+
+	va_end(args);
+
+	if (newformat != format_buf_in_stack) {
+		free(newformat);
+	}
+
+	return(ret);
+}
+
+/************************************************************
+Prints to a buffer like sprintf(). On the 64-bit Windows we substitute the
+format string
+%l -> %I64
+because we define ulint as unsigned __int64 and lint as __int64 on Windows,
+and both the Microsoft and Intel C compilers require the format string
+%I64 in that case instead of %l. On other platforms the format string is
+copied unchanged. */
+
+int
ut_sprintf(
/*=======*/
- char* buf, /* in/out: buffer where to print */
+				/* out: the number of characters written, or
+				negative in case of an error (including
+				running out of memory for the format copy) */
+	char*		buf,	/* out: buffer where to print; the caller
+				must make it big enough, no bound is
+				checked here */
const char* format, /* in: format of prints */
...) /* in: arguments to be printed */
{
- va_list args;
-
+	va_list		args;
+	size_t		len;
+	const char*	format_end;
+	char*		newformat;
+	const char*	ptr;
+	char*		newptr;
+	int		ret;
+	/* Room for a rewritten format of at most 250 chars: every "%l"
+	(2 chars) may grow to "%I64" (4 chars), plus the terminating NUL */
+	char		format_buf_in_stack[2 * 250 + 1];
+
+	len = strlen(format);
+
+	if (len > 250) {
+		newformat = malloc(2 * len + 1);
+
+		if (newformat == NULL) {
+			/* Nothing has been allocated or started yet */
+
+			return(-1);
+		}
+	} else {
+		newformat = format_buf_in_stack;
+	}
+
+	format_end = format + len;
+
+	ptr = format;
+	newptr = newformat;
+
+#if defined(__WIN__) && (defined(WIN64) || defined(_WIN64))
+	/* Replace %l with %I64 if it is not preceded with '\'.
+	NOTE(review): only a literal "%l" is matched; a flag or width
+	between '%' and 'l' (e.g. "%5lu") is left as it is, and the 'l'
+	in "%%l" is rewritten too. */
+
+	while (ptr < format_end) {
+		/* Reading *(ptr + 1) is safe: at the last char it is the
+		terminating NUL of format */
+		if (*ptr == '%' && *(ptr + 1) == 'l'
+		    && (ptr == format || *(ptr - 1) != '\\')) {
+
+			memcpy(newptr, "%I64", 4);
+			ptr += 2;
+			newptr += 4;
+		} else {
+			*newptr = *ptr;
+			ptr++;
+			newptr++;
+		}
+	}
+
+	*newptr = '\0';
+
+	/* The NUL sits at offset <= 2 * len, inside the 2 * len + 1 buffer */
+	ut_a(newptr <= newformat + 2 * len);
+#else
+	strcpy(newformat, format);
+#endif
va_start(args, format);
- vsprintf(buf, format, args);
+	ret = vsprintf(buf, newformat, args);
va_end(args);
- return((ulint)strlen(buf));
+	if (newformat != format_buf_in_stack) {
+		free(newformat);
+	}
+
+	return(ret);
+}
+
+/************************************************************
+Prints to a stream like fprintf(). On the 64-bit Windows we substitute the
+format string
+%l -> %I64
+because we define ulint as unsigned __int64 and lint as __int64 on Windows,
+and both the Microsoft and Intel C compilers require the format string
+%I64 in that case instead of %l. On other platforms the format string is
+copied unchanged. */
+
+int
+ut_fprintf(
+/*=======*/
+				/* out: the number of characters written, or
+				negative in case of an error (including
+				running out of memory for the format copy) */
+	FILE*		stream,	/* in: stream where to print */
+	const char*	format,	/* in: format of prints */
+	...)			/* in: arguments to be printed */
+{
+	va_list		args;
+	size_t		len;
+	const char*	format_end;
+	char*		newformat;
+	const char*	ptr;
+	char*		newptr;
+	int		ret;
+	/* Room for a rewritten format of at most 250 chars: every "%l"
+	(2 chars) may grow to "%I64" (4 chars), plus the terminating NUL */
+	char		format_buf_in_stack[2 * 250 + 1];
+
+	len = strlen(format);
+
+	if (len > 250) {
+		newformat = malloc(2 * len + 1);
+
+		if (newformat == NULL) {
+			/* Nothing has been allocated or started yet */
+
+			return(-1);
+		}
+	} else {
+		newformat = format_buf_in_stack;
+	}
+
+	format_end = format + len;
+
+	ptr = format;
+	newptr = newformat;
+
+#if defined(__WIN__) && (defined(WIN64) || defined(_WIN64))
+	/* Replace %l with %I64 if it is not preceded with '\'.
+	NOTE(review): only a literal "%l" is matched; a flag or width
+	between '%' and 'l' (e.g. "%5lu") is left as it is, and the 'l'
+	in "%%l" is rewritten too. */
+
+	while (ptr < format_end) {
+		/* Reading *(ptr + 1) is safe: at the last char it is the
+		terminating NUL of format */
+		if (*ptr == '%' && *(ptr + 1) == 'l'
+		    && (ptr == format || *(ptr - 1) != '\\')) {
+
+			memcpy(newptr, "%I64", 4);
+			ptr += 2;
+			newptr += 4;
+		} else {
+			*newptr = *ptr;
+			ptr++;
+			newptr++;
+		}
+	}
+
+	*newptr = '\0';
+
+	/* The NUL sits at offset <= 2 * len, inside the 2 * len + 1 buffer */
+	ut_a(newptr <= newformat + 2 * len);
+#else
+	strcpy(newformat, format);
+#endif
+	va_start(args, format);
+
+	ret = vfprintf(stream, newformat, args);
+
+	va_end(args);
+
+	if (newformat != format_buf_in_stack) {
+		free(newformat);
+	}
+
+	return(ret);
}
/************************************************************
@@ -63,7 +261,7 @@ ut_get_high32(
}
/************************************************************
-The following function returns a clock time in milliseconds. */
+The following function returns elapsed CPU time in milliseconds. */
ulint
ut_clock(void)
@@ -182,6 +380,50 @@ ut_sprintf_timestamp(
}
/**************************************************************
+Sprintfs a timestamp to a buffer with no spaces and with ':' characters
+replaced by '_'. The result has the form YYMMDD_HH_MM_SS in local time,
+every field zero-padded to two digits. */
+
+void
+ut_sprintf_timestamp_without_extra_chars(
+/*=====================================*/
+	char*	buf)	/* out: buffer where to sprintf; needs room for
+			15 characters and the terminating NUL */
+{
+#ifdef __WIN__
+	SYSTEMTIME	cal_tm;
+
+	GetLocalTime(&cal_tm);
+
+	/* The hour must be zero-padded like the other fields: a plain
+	%2d would put a space into the result for hours 0..9 */
+	sprintf(buf, "%02d%02d%02d_%02d_%02d_%02d",
+		(int)cal_tm.wYear % 100,
+		(int)cal_tm.wMonth,
+		(int)cal_tm.wDay,
+		(int)cal_tm.wHour,
+		(int)cal_tm.wMinute,
+		(int)cal_tm.wSecond);
+#else
+	struct tm	cal_tm;
+	struct tm*	cal_tm_ptr;
+	time_t		tm;
+
+	time(&tm);
+
+#ifdef HAVE_LOCALTIME_R
+	localtime_r(&tm, &cal_tm);
+	cal_tm_ptr = &cal_tm;
+#else
+	cal_tm_ptr = localtime(&tm);
+#endif
+	/* The hour must be zero-padded like the other fields: a plain
+	%2d would put a space into the result for hours 0..9 */
+	sprintf(buf, "%02d%02d%02d_%02d_%02d_%02d",
+		cal_tm_ptr->tm_year % 100,
+		cal_tm_ptr->tm_mon + 1,
+		cal_tm_ptr->tm_mday,
+		cal_tm_ptr->tm_hour,
+		cal_tm_ptr->tm_min,
+		cal_tm_ptr->tm_sec);
+#endif
+}
+
+/**************************************************************
Returns current year, month, day. */
void
@@ -232,7 +474,7 @@ ut_delay(
}
if (ut_always_false) {
- printf("%lu", j);
+ printf("%lu", (ulong) j);
}
return(j);
@@ -250,12 +492,12 @@ ut_print_buf(
byte* data;
ulint i;
- printf(" len %lu; hex ", len);
+ printf(" len %lu; hex ", (ulong) len);
data = buf;
for (i = 0; i < len; i++) {
- printf("%02lx", (ulint)*data);
+ printf("%02lx", (ulong) *data);
data++;
}
@@ -290,12 +532,12 @@ ut_sprintf_buf(
n = 0;
- n += sprintf(str + n, " len %lu; hex ", len);
+ n += sprintf(str + n, " len %lu; hex ", (ulong) len);
data = buf;
for (i = 0; i < len; i++) {
- n += sprintf(str + n, "%02lx", (ulint)*data);
+ n += sprintf(str + n, "%02lx", (ulong) *data);
data++;
}