From 039b8782d4794f34c5f0219d8a8d21f6e21d74f1 Mon Sep 17 00:00:00 2001
From: Eugene Kosov
Date: Wed, 12 Jun 2019 22:36:43 +0300
Subject: MDEV-13631 Make use of clang-format

Explicitly mention every option in .clang-format to protect us from
possible future changes.

Remove separate InnoDB style.

Change style to look more like this script:

for x in $@
do
  indent -kr -bl -bli0 -l79 -i2 -nut -c48 -dj -cp0 $x
  sed -ri -e 's/ = /= /g'\
      -e '/switch.*\)$/{N;s/\n[ ]+/ /}' $x
done

A significant difference is that 'switch' and '{' are put on different
lines, because it's impossible in clang-format to set formatting rules
just for the 'switch' statement.
---
 .clang-format                  | 123 +++++++++++++++++++++++++++++++++++++----
 storage/innobase/.clang-format |  10 ----
 2 files changed, 111 insertions(+), 22 deletions(-)
 delete mode 100644 storage/innobase/.clang-format

diff --git a/.clang-format b/.clang-format
index 3b735b16d74..1ad93ead80a 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,18 +1,117 @@
-SpaceBeforeAssignmentOperators: false -SpaceAfterCStyleCast: true - -BreakBeforeBraces: Custom +--- +Language: Cpp +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Right +AlignOperands: true +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: MultiLine +BinPackArguments: true +BinPackParameters: true BraceWrapping: - AfterClass: true + AfterCaseLabel: true + AfterClass: true AfterControlStatement: true - AfterEnum: true - AfterFunction: true - AfterNamespace: true - AfterStruct: true - AfterUnion: true + AfterEnum: true + AfterFunction: true + AfterNamespace: true + AfterStruct: true + AfterUnion: true AfterExternBlock: true - BeforeCatch: true - BeforeElse: true + BeforeCatch: true + BeforeElse: true + IndentBraces: false SplitEmptyFunction: true SplitEmptyRecord: true SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakStringLiterals: true +ColumnLimit: 79 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: - foreach - Q_FOREACH - BOOST_FOREACH +IncludeBlocks: Preserve +IncludeCategories: - Regex: '^"(llvm|llvm-c|clang|clang-c)/' Priority: 2 - Regex: '^(<|"(gtest|gmock|isl|json)/)' Priority: 3 - Regex: '.*' Priority: 1 +IncludeIsMainRegex: '(Test)?$' +IndentCaseLabels: false +IndentPPDirectives: None +IndentWidth: 2 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: true +Language: Cpp +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 
+NamespaceIndentation: None +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Right +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: true +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseTab: Never +... diff --git a/storage/innobase/.clang-format b/storage/innobase/.clang-format deleted file mode 100644 index f7a72f3cf24..00000000000 --- a/storage/innobase/.clang-format +++ /dev/null @@ -1,10 +0,0 @@ -UseTab: Always -TabWidth: 8 -IndentWidth: 8 -BreakBeforeBinaryOperators: All -PointerAlignment: Left -AlwaysBreakAfterReturnType: TopLevel -BreakBeforeBraces: Custom -BraceWrapping: - AfterFunction: true -AccessModifierOffset: -8 -- cgit v1.2.1 From 65e0c9b91b46e2dfb4388c8c5c1bc76dd9f8fbd8 Mon Sep 17 00:00:00 2001 From: Alexey Botchkov Date: Sat, 15 Jun 2019 01:02:55 +0400 Subject: MDEV-18661 loading the audit plugin causes performance regression. Plugin fixed to not lock the LOCK_operations when not active. Server fixed to lock the LOCK_plugin less - do it once per thread and then only if a plugin was installed/uninstalled. 
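As a quick illustration of the MDEV-13631 style change above: the .clang-format options in that patch amount to two-space indentation, a 79-column limit, no space before '=' in assignments, case labels flush with their 'switch', and opening braces on their own line, including after control statements. The following hypothetical C snippet (not part of either patch) shows roughly what code formatted under those rules looks like:

#include <stdio.h>

static int sign(int n)
{
  int result= 0;
  if (n > 0)
  {
    result= 1;
  }
  else if (n < 0)
  {
    result= -1;
  }
  switch (result)
  {
  case 0:
    printf("zero\n");
    break;
  default:
    printf("nonzero\n");
    break;
  }
  return result;
}

int main(void)
{
  return sign(42) == 1 ? 0 : 1;
}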
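The server side of the MDEV-18661 fix hinges on a version counter: global_plugin_version is bumped under LOCK_plugin whenever a plugin is installed or uninstalled, each THD caches the version it last synchronized with, and the per-command path only calls mysql_audit_release() (and so takes LOCK_plugin) when the two diverge. Below is a minimal C sketch of that pattern; the names mirror the patch, but thd_stub and the audit_release_* functions are simplified stand-ins for the real THD and sql_audit.cc code:

#include <stdbool.h>
#include <stddef.h>

/* Bumped under LOCK_plugin whenever a plugin is installed or uninstalled. */
static volatile int global_plugin_version= 1;

/* Stand-in for the audit-related part of THD. */
struct thd_stub
{
  int audit_plugin_version;   /* last version this thread synced with */
};

/* Cheap per-command check: no mutex is taken when nothing changed. */
static bool audit_release_required(const struct thd_stub *thd)
{
  return thd != NULL && thd->audit_plugin_version != global_plugin_version;
}

/* Slow path, entered only when the plugin set actually changed: the real
   code releases its plugin references under LOCK_plugin here, then
   invalidates the cached version so the next audit event re-acquires the
   plugins it needs. */
static void audit_release(struct thd_stub *thd)
{
  thd->audit_plugin_version= -1;
}

int main(void)
{
  struct thd_stub thd= { -1 };
  if (audit_release_required(&thd))
    audit_release(&thd);
  return 0;
}

On the plugin side, the same commit turns internal_stop_logging into a counter updated under a small dedicated mutex (the ADD_ATOMIC macro in the diff below), so lock_operations no longer has to be held around every audited event.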
--- plugin/server_audit/server_audit.c | 112 ++++++++++++++++++++++--------------- sql/sql_audit.cc | 16 ++++++ sql/sql_audit.h | 1 + sql/sql_class.cc | 4 +- sql/sql_class.h | 1 + sql/sql_connect.cc | 3 +- sql/sql_plugin.cc | 3 + sql/sql_plugin.h | 1 + sql/threadpool_common.cc | 3 +- 9 files changed, 97 insertions(+), 47 deletions(-) diff --git a/plugin/server_audit/server_audit.c b/plugin/server_audit/server_audit.c index c9e7e3a532a..3da52643787 100644 --- a/plugin/server_audit/server_audit.c +++ b/plugin/server_audit/server_audit.c @@ -15,7 +15,7 @@ #define PLUGIN_VERSION 0x104 -#define PLUGIN_STR_VERSION "1.4.4" +#define PLUGIN_STR_VERSION "1.4.7" #define _my_thread_var loc_thread_var @@ -295,7 +295,7 @@ static unsigned long long file_rotate_size; static unsigned int rotations; static my_bool rotate= TRUE; static char logging; -static int internal_stop_logging= 0; +static volatile int internal_stop_logging= 0; static char incl_user_buffer[1024]; static char excl_user_buffer[1024]; static char *big_buffer= NULL; @@ -533,16 +533,20 @@ static struct st_mysql_show_var audit_status[]= #if defined(HAVE_PSI_INTERFACE) && !defined(FLOGGER_NO_PSI) /* These belong to the service initialization */ static PSI_mutex_key key_LOCK_operations; +static PSI_mutex_key key_LOCK_atomic; static PSI_mutex_key key_LOCK_bigbuffer; static PSI_mutex_info mutex_key_list[]= { { &key_LOCK_operations, "SERVER_AUDIT_plugin::lock_operations", PSI_FLAG_GLOBAL}, + { &key_LOCK_atomic, "SERVER_AUDIT_plugin::lock_atomic", + PSI_FLAG_GLOBAL}, { &key_LOCK_bigbuffer, "SERVER_AUDIT_plugin::lock_bigbuffer", PSI_FLAG_GLOBAL} }; #endif static mysql_mutex_t lock_operations; +static mysql_mutex_t lock_atomic; static mysql_mutex_t lock_bigbuffer; /* The Percona server and partly MySQL don't support */ @@ -553,6 +557,14 @@ static mysql_mutex_t lock_bigbuffer; /* worths doing. 
*/ #define CLIENT_ERROR if (!started_mysql) my_printf_error +#define ADD_ATOMIC(x, a) \ + do { \ + flogger_mutex_lock(&lock_atomic); \ + x+= a; \ + flogger_mutex_unlock(&lock_atomic); \ + } while (0) + + static uchar *getkey_user(const char *entry, size_t *length, my_bool nu __attribute__((unused)) ) { @@ -731,20 +743,20 @@ static int user_coll_fill(struct user_coll *c, char *users, if (cmp_user && take_over_cmp) { - internal_stop_logging= 1; + ADD_ATOMIC(internal_stop_logging, 1); CLIENT_ERROR(1, "User '%.*s' was removed from the" " server_audit_excl_users.", MYF(ME_JUST_WARNING), (int) cmp_length, users); - internal_stop_logging= 0; + ADD_ATOMIC(internal_stop_logging, -1); blank_user(cmp_user); refill_cmp_coll= 1; } else if (cmp_user) { - internal_stop_logging= 1; + ADD_ATOMIC(internal_stop_logging, 1); CLIENT_ERROR(1, "User '%.*s' is in the server_audit_incl_users, " "so wasn't added.", MYF(ME_JUST_WARNING), (int) cmp_length, users); - internal_stop_logging= 0; + ADD_ATOMIC(internal_stop_logging, -1); remove_user(users); continue; } @@ -1252,23 +1264,30 @@ static void change_connection(struct connection_info *cn, event->ip, event->ip_length); } -static int write_log(const char *message, int len) +static int write_log(const char *message, size_t len, int take_lock) { + int result= 0; + if (take_lock) + flogger_mutex_lock(&lock_operations); + if (output_type == OUTPUT_FILE) { if (logfile && - (is_active= (logger_write(logfile, message, len) == len))) - return 0; + (is_active= (logger_write(logfile, message, len) == (int) len))) + goto exit; ++log_write_failures; - return 1; + result= 1; } else if (output_type == OUTPUT_SYSLOG) { syslog(syslog_facility_codes[syslog_facility] | syslog_priority_codes[syslog_priority], - "%s %.*s", syslog_info, len, message); + "%s %.*s", syslog_info, (int) len, message); } - return 0; +exit: + if (take_lock) + flogger_mutex_unlock(&lock_operations); + return result; } @@ -1327,7 +1346,7 @@ static int log_connection(const struct connection_info *cn, csize+= my_snprintf(message+csize, sizeof(message) - 1 - csize, ",%.*s,,%d", cn->db_length, cn->db, event->status); message[csize]= '\n'; - return write_log(message, csize + 1); + return write_log(message, csize + 1, 1); } @@ -1348,7 +1367,7 @@ static int log_connection_event(const struct mysql_event_connection *event, csize+= my_snprintf(message+csize, sizeof(message) - 1 - csize, ",%.*s,,%d", event->database_length, event->database, event->status); message[csize]= '\n'; - return write_log(message, csize + 1); + return write_log(message, csize + 1, 1); } @@ -1477,21 +1496,28 @@ no_password: -static int do_log_user(const char *name) +static int do_log_user(const char *name, int take_lock) { size_t len; + int result; if (!name) return 0; len= strlen(name); - if (incl_user_coll.n_users) - return coll_search(&incl_user_coll, name, len) != 0; + if (take_lock) + flogger_mutex_lock(&lock_operations); - if (excl_user_coll.n_users) - return coll_search(&excl_user_coll, name, len) == 0; + if (incl_user_coll.n_users) + result= coll_search(&incl_user_coll, name, len) != 0; + else if (excl_user_coll.n_users) + result= coll_search(&excl_user_coll, name, len) == 0; + else + result= 1; - return 1; + if (take_lock) + flogger_mutex_unlock(&lock_operations); + return result; } @@ -1588,7 +1614,7 @@ not_in_list: static int log_statement_ex(const struct connection_info *cn, time_t ev_time, unsigned long thd_id, const char *query, unsigned int query_len, - int error_code, const char *type) + int error_code, const char *type, int 
take_lock) { size_t csize; char message_loc[1024]; @@ -1736,7 +1762,7 @@ do_log_query: csize+= my_snprintf(message+csize, message_size - 1 - csize, "\',%d", error_code); message[csize]= '\n'; - result= write_log(message, csize + 1); + result= write_log(message, csize + 1, take_lock); if (message == big_buffer) flogger_mutex_unlock(&lock_bigbuffer); @@ -1750,7 +1776,7 @@ static int log_statement(const struct connection_info *cn, { return log_statement_ex(cn, event->general_time, event->general_thread_id, event->general_query, event->general_query_length, - event->general_error_code, type); + event->general_error_code, type, 1); } @@ -1772,7 +1798,7 @@ static int log_table(const struct connection_info *cn, ",%.*s,%.*s,",event->database_length, event->database, event->table_length, event->table); message[csize]= '\n'; - return write_log(message, csize + 1); + return write_log(message, csize + 1, 1); } @@ -1796,7 +1822,7 @@ static int log_rename(const struct connection_info *cn, event->new_database_length, event->new_database, event->new_table_length, event->new_table); message[csize]= '\n'; - return write_log(message, csize + 1); + return write_log(message, csize + 1, 1); } @@ -1988,8 +2014,6 @@ void auditing(MYSQL_THD thd, unsigned int event_class, const void *ev) if (!thd || internal_stop_logging) return; - flogger_mutex_lock(&lock_operations); - if (maria_55_started && debug_server_started && event_class == MYSQL_AUDIT_GENERAL_CLASS) { @@ -2024,7 +2048,7 @@ void auditing(MYSQL_THD thd, unsigned int event_class, const void *ev) goto exit_func; if (event_class == MYSQL_AUDIT_GENERAL_CLASS && FILTER(EVENT_QUERY) && - cn && do_log_user(cn->user)) + cn && do_log_user(cn->user, 1)) { const struct mysql_event_general *event = (const struct mysql_event_general *) ev; @@ -2043,7 +2067,7 @@ void auditing(MYSQL_THD thd, unsigned int event_class, const void *ev) { const struct mysql_event_table *event = (const struct mysql_event_table *) ev; - if (do_log_user(event->user)) + if (do_log_user(event->user, 1)) { switch (event->event_subclass) { @@ -2109,7 +2133,6 @@ exit_func: } if (cn) cn->log_always= 0; - flogger_mutex_unlock(&lock_operations); } @@ -2377,6 +2400,7 @@ static int server_audit_init(void *p __attribute__((unused))) PSI_server->register_mutex("server_audit", mutex_key_list, 1); #endif flogger_mutex_init(key_LOCK_operations, &lock_operations, MY_MUTEX_INIT_FAST); + flogger_mutex_init(key_LOCK_operations, &lock_atomic, MY_MUTEX_INIT_FAST); flogger_mutex_init(key_LOCK_operations, &lock_bigbuffer, MY_MUTEX_INIT_FAST); coll_init(&incl_user_coll); @@ -2464,6 +2488,7 @@ static int server_audit_deinit(void *p __attribute__((unused))) (void) free(big_buffer); flogger_mutex_destroy(&lock_operations); + flogger_mutex_destroy(&lock_atomic); flogger_mutex_destroy(&lock_bigbuffer); error_header(); @@ -2553,10 +2578,10 @@ static void log_current_query(MYSQL_THD thd) return; cn= get_loc_info(thd); if (!ci_needs_setup(cn) && cn->query_length && - FILTER(EVENT_QUERY) && do_log_user(cn->user)) + FILTER(EVENT_QUERY) && do_log_user(cn->user, 0)) { log_statement_ex(cn, cn->query_time, thd_get_thread_id(thd), - cn->query, cn->query_length, 0, "QUERY"); + cn->query, cn->query_length, 0, "QUERY", 0); cn->log_always= 1; } } @@ -2568,12 +2593,13 @@ static void update_file_path(MYSQL_THD thd, { char *new_name= (*(char **) save) ? 
*(char **) save : empty_str; - if (!maria_55_started || !debug_server_started) - flogger_mutex_lock(&lock_operations); - internal_stop_logging= 1; + ADD_ATOMIC(internal_stop_logging, 1); error_header(); fprintf(stderr, "Log file name was changed to '%s'.\n", new_name); + if (!maria_55_started || !debug_server_started) + flogger_mutex_lock(&lock_operations); + if (logging) log_current_query(thd); @@ -2582,7 +2608,6 @@ static void update_file_path(MYSQL_THD thd, char *sav_path= file_path; file_path= new_name; - internal_stop_logging= 1; stop_logging(); if (start_logging()) { @@ -2598,16 +2623,15 @@ static void update_file_path(MYSQL_THD thd, } goto exit_func; } - internal_stop_logging= 0; } strncpy(path_buffer, new_name, sizeof(path_buffer)-1); path_buffer[sizeof(path_buffer)-1]= 0; file_path= path_buffer; exit_func: - internal_stop_logging= 0; if (!maria_55_started || !debug_server_started) flogger_mutex_unlock(&lock_operations); + ADD_ATOMIC(internal_stop_logging, -1); } @@ -2692,8 +2716,8 @@ static void update_output_type(MYSQL_THD thd, if (output_type == new_output_type) return; + ADD_ATOMIC(internal_stop_logging, 1); flogger_mutex_lock(&lock_operations); - internal_stop_logging= 1; if (logging) { log_current_query(thd); @@ -2707,8 +2731,8 @@ static void update_output_type(MYSQL_THD thd, if (logging) start_logging(); - internal_stop_logging= 0; flogger_mutex_unlock(&lock_operations); + ADD_ATOMIC(internal_stop_logging, -1); } @@ -2756,9 +2780,9 @@ static void update_logging(MYSQL_THD thd, if (new_logging == logging) return; + ADD_ATOMIC(internal_stop_logging, 1); if (!maria_55_started || !debug_server_started) flogger_mutex_lock(&lock_operations); - internal_stop_logging= 1; if ((logging= new_logging)) { start_logging(); @@ -2773,9 +2797,9 @@ static void update_logging(MYSQL_THD thd, stop_logging(); } - internal_stop_logging= 0; if (!maria_55_started || !debug_server_started) flogger_mutex_unlock(&lock_operations); + ADD_ATOMIC(internal_stop_logging, -1); } @@ -2787,16 +2811,16 @@ static void update_mode(MYSQL_THD thd __attribute__((unused)), if (mode_readonly || new_mode == mode) return; + ADD_ATOMIC(internal_stop_logging, 1); if (!maria_55_started || !debug_server_started) flogger_mutex_lock(&lock_operations); - internal_stop_logging= 1; mark_always_logged(thd); error_header(); fprintf(stderr, "Logging mode was changed from %d to %d.\n", mode, new_mode); mode= new_mode; - internal_stop_logging= 0; if (!maria_55_started || !debug_server_started) flogger_mutex_unlock(&lock_operations); + ADD_ATOMIC(internal_stop_logging, -1); } diff --git a/sql/sql_audit.cc b/sql/sql_audit.cc index dd98e3cf9b1..cee0ac2287c 100644 --- a/sql/sql_audit.cc +++ b/sql/sql_audit.cc @@ -212,6 +212,7 @@ void mysql_audit_acquire_plugins(THD *thd, ulong *event_class_mask) { plugin_foreach(thd, acquire_plugins, MYSQL_AUDIT_PLUGIN, event_class_mask); add_audit_mask(thd->audit_class_mask, event_class_mask); + thd->audit_plugin_version= global_plugin_version; } DBUG_VOID_RETURN; } @@ -241,6 +242,20 @@ void mysql_audit_notify(THD *thd, uint event_class, uint event_subtype, ...) } +/** + Check if there were changes in the state of plugins + so we need to do the mysql_audit_release asap. + + @param[in] thd + +*/ + +my_bool mysql_audit_release_required(THD *thd) +{ + return thd && (thd->audit_plugin_version != global_plugin_version); +} + + /** Release any resources associated with the current thd. 
@@ -276,6 +291,7 @@ void mysql_audit_release(THD *thd) /* Reset the state of thread values */ reset_dynamic(&thd->audit_class_plugins); bzero(thd->audit_class_mask, sizeof(thd->audit_class_mask)); + thd->audit_plugin_version= -1; } diff --git a/sql/sql_audit.h b/sql/sql_audit.h index 550b2a50290..9a746757201 100644 --- a/sql/sql_audit.h +++ b/sql/sql_audit.h @@ -60,6 +60,7 @@ static inline void mysql_audit_notify(THD *thd, uint event_class, #define mysql_audit_connection_enabled() 0 #define mysql_audit_table_enabled() 0 #endif +extern my_bool mysql_audit_release_required(THD *thd); extern void mysql_audit_release(THD *thd); #define MAX_USER_HOST_SIZE 512 diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 8fabcd52913..6bcff6d1fca 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -776,6 +776,9 @@ THD::THD(bool is_wsrep_applier) waiting_on_group_commit(FALSE), has_waiter(FALSE), spcont(NULL), m_parser_state(NULL), +#ifndef EMBEDDED_LIBRARY + audit_plugin_version(-1), +#endif #if defined(ENABLED_DEBUG_SYNC) debug_sync_control(0), #endif /* defined(ENABLED_DEBUG_SYNC) */ @@ -1562,7 +1565,6 @@ THD::~THD() mdl_context.destroy(); ha_close_connection(this); - mysql_audit_release(this); plugin_thdvar_cleanup(this); main_security_ctx.destroy(); diff --git a/sql/sql_class.h b/sql/sql_class.h index 1cb516c0656..63923945dd5 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -2978,6 +2978,7 @@ public: added to the list of audit plugins which are currently in use. */ unsigned long audit_class_mask[MYSQL_AUDIT_CLASS_MASK_SIZE]; + int audit_plugin_version; #endif #if defined(ENABLED_DEBUG_SYNC) diff --git a/sql/sql_connect.cc b/sql/sql_connect.cc index 4dbb53fa544..a6a01b140cf 100644 --- a/sql/sql_connect.cc +++ b/sql/sql_connect.cc @@ -1326,7 +1326,8 @@ void do_handle_one_connection(THD *thd_arg) while (thd_is_connection_alive(thd)) { - mysql_audit_release(thd); + if (mysql_audit_release_required(thd)) + mysql_audit_release(thd); if (do_command(thd)) break; } diff --git a/sql/sql_plugin.cc b/sql/sql_plugin.cc index 21093e3240f..48131b10951 100644 --- a/sql/sql_plugin.cc +++ b/sql/sql_plugin.cc @@ -228,6 +228,7 @@ static DYNAMIC_ARRAY plugin_array; static HASH plugin_hash[MYSQL_MAX_PLUGIN_TYPE_NUM]; static MEM_ROOT plugin_mem_root; static bool reap_needed= false; +volatile int global_plugin_version= 1; static bool initialized= 0; ulong dlopen_count; @@ -2181,6 +2182,7 @@ bool mysql_install_plugin(THD *thd, const LEX_STRING *name, reap_plugins(); } err: + global_plugin_version++; mysql_mutex_unlock(&LOCK_plugin); if (argv) free_defaults(argv); @@ -2327,6 +2329,7 @@ bool mysql_uninstall_plugin(THD *thd, const LEX_STRING *name, } reap_plugins(); + global_plugin_version++; mysql_mutex_unlock(&LOCK_plugin); DBUG_RETURN(error); diff --git a/sql/sql_plugin.h b/sql/sql_plugin.h index 7f741144f00..3bde06a992c 100644 --- a/sql/sql_plugin.h +++ b/sql/sql_plugin.h @@ -37,6 +37,7 @@ enum enum_plugin_load_option { PLUGIN_OFF, PLUGIN_ON, PLUGIN_FORCE, PLUGIN_FORCE_PLUS_PERMANENT }; extern const char *global_plugin_typelib_names[]; +extern volatile int global_plugin_version; extern ulong dlopen_count; #include diff --git a/sql/threadpool_common.cc b/sql/threadpool_common.cc index b4066bd7603..b8be7083624 100644 --- a/sql/threadpool_common.cc +++ b/sql/threadpool_common.cc @@ -266,7 +266,8 @@ int threadpool_process_request(THD *thd) { Vio *vio; thd->net.reading_or_writing= 0; - mysql_audit_release(thd); + if (mysql_audit_release_required(thd)) + mysql_audit_release(thd); if ((retval= do_command(thd)) != 0) goto 
end; -- cgit v1.2.1 From 91be2212c69d727d0b104f70cb32c4639397338c Mon Sep 17 00:00:00 2001 From: Sergei Petrunia Date: Sat, 15 Jun 2019 19:55:57 +0300 Subject: MDEV-17045: MyRocks tables cannot be updated when binlog_format=MIXED. --- storage/rocksdb/ha_rocksdb.cc | 53 ++++++++++++++++------ storage/rocksdb/ha_rocksdb.h | 26 +---------- .../rocksdb/r/mariadb_misc_binlog.result | 11 ++++- .../mysql-test/rocksdb/r/rpl_statement.result | 4 +- .../mysql-test/rocksdb/t/mariadb_misc_binlog.test | 12 ++++- .../mysql-test/rocksdb/t/rpl_statement.test | 4 +- 6 files changed, 65 insertions(+), 45 deletions(-) diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index 3331b94c62c..cb289fb0077 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -11025,6 +11025,41 @@ void ha_rocksdb::read_thd_vars(THD *const thd) { m_checksums_pct = THDVAR(thd, checksums_pct); } +ulonglong ha_rocksdb::table_flags() const +{ + DBUG_ENTER_FUNC(); + + /* + HA_BINLOG_STMT_CAPABLE + Upstream: MyRocks advertises itself as it supports SBR, but has additional + checks in ha_rocksdb::external_lock()/ start_stmt() which will return an + error if one tries to run the statement. + Exceptions: @@rocksdb_unsafe_for_binlog or we are an SQL slave thread. + + MariaDB: Inform the upper layer we don't support SBR, so it switches to RBR + if possible. The exceptions are the same as with the upstream. + + HA_REC_NOT_IN_SEQ + If we don't set it, filesort crashes, because it assumes rowids are + 1..8 byte numbers + HA_PRIMARY_KEY_IN_READ_INDEX + This flag is always set, even for tables that: + - have no PK + - have some (or all) of PK that can't be decoded from the secondary + index. + */ + THD *thd= ha_thd(); + DBUG_RETURN(HA_BINLOG_ROW_CAPABLE | + ((thd && (THDVAR(thd, unsafe_for_binlog) ||thd->rgi_slave))? 
+ HA_BINLOG_STMT_CAPABLE : 0) | + HA_REC_NOT_IN_SEQ | HA_CAN_INDEX_BLOBS | + HA_PRIMARY_KEY_IN_READ_INDEX | + HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | HA_NULL_IN_KEY | + HA_PARTIAL_COLUMN_READ | + HA_TABLE_SCAN_ON_INDEX); +} + + /** @return @@ -11037,6 +11072,9 @@ int ha_rocksdb::external_lock(THD *const thd, int lock_type) { DBUG_ASSERT(thd != nullptr); int res = HA_EXIT_SUCCESS; +#if 0 + // MariaDB uses a different way to implement this, see ha_rocksdb::table_flags + int binlog_format = my_core::thd_binlog_format(thd); bool unsafe_for_binlog = THDVAR(ha_thd(), unsafe_for_binlog); @@ -11065,6 +11103,7 @@ int ha_rocksdb::external_lock(THD *const thd, int lock_type) { my_error(ER_REQUIRE_ROW_BINLOG_FORMAT, MYF(0)); DBUG_RETURN(HA_ERR_UNSUPPORTED); } +#endif if (lock_type == F_UNLCK) { Rdb_transaction *const tx = get_tx_from_thd(thd); @@ -11167,20 +11206,6 @@ int ha_rocksdb::external_lock(THD *const thd, int lock_type) { int ha_rocksdb::start_stmt(THD *const thd, thr_lock_type lock_type) { DBUG_ENTER_FUNC(); - /* - MariaDB: the following is a copy of the check in ha_rocksdb::external_lock: - */ - int binlog_format = my_core::thd_binlog_format(thd); - bool unsafe_for_binlog = THDVAR(ha_thd(), unsafe_for_binlog); - if (lock_type >= TL_WRITE_ALLOW_WRITE && - !thd->rgi_slave && !unsafe_for_binlog && - binlog_format != BINLOG_FORMAT_ROW && - binlog_format != BINLOG_FORMAT_UNSPEC && - my_core::thd_binlog_filter_ok(thd)) { - my_error(ER_REQUIRE_ROW_BINLOG_FORMAT, MYF(0)); - DBUG_RETURN(HA_ERR_UNSUPPORTED); - } - DBUG_ASSERT(thd != nullptr); Rdb_transaction *const tx = get_or_create_tx(thd); diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h index aae823fe4e1..a48d745fcf2 100644 --- a/storage/rocksdb/ha_rocksdb.h +++ b/storage/rocksdb/ha_rocksdb.h @@ -863,31 +863,7 @@ public: This is a list of flags that indicate what functionality the storage engine implements. The current table flags are documented in handler.h */ - ulonglong table_flags() const override { - DBUG_ENTER_FUNC(); - - /* - HA_BINLOG_STMT_CAPABLE - We are saying that this engine is just statement capable to have - an engine that can only handle statement-based logging. This is - used in testing. - HA_REC_NOT_IN_SEQ - If we don't set it, filesort crashes, because it assumes rowids are - 1..8 byte numbers - HA_PRIMARY_KEY_IN_READ_INDEX - This flag is always set, even for tables that: - - have no PK - - have some (or all) of PK that can't be decoded from the secondary - index. - */ - DBUG_RETURN(HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE | - HA_REC_NOT_IN_SEQ | HA_CAN_INDEX_BLOBS | - HA_PRIMARY_KEY_IN_READ_INDEX | - HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | HA_NULL_IN_KEY | - HA_PARTIAL_COLUMN_READ | - HA_TABLE_SCAN_ON_INDEX); - } - + ulonglong table_flags() const override ; private: bool init_with_fields(); /* no 'override' in MariaDB */ public: diff --git a/storage/rocksdb/mysql-test/rocksdb/r/mariadb_misc_binlog.result b/storage/rocksdb/mysql-test/rocksdb/r/mariadb_misc_binlog.result index e4ac62aa481..c37ab9461af 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/mariadb_misc_binlog.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/mariadb_misc_binlog.result @@ -18,7 +18,16 @@ set @tmp_bf= @@binlog_format; set binlog_format='STATEMENT'; lock tables t1 write; insert into t1 values(1); -ERROR HY000: Can't execute updates on master with binlog_format != ROW. 
+ERROR HY000: Cannot execute statement: impossible to write to binary log since BINLOG_FORMAT = STATEMENT and at least one table uses a storage engine limited to row-based logging. unlock tables; set @@binlog_format=@tmp_bf; drop table t1; +# +# MDEV-17045: MyRocks tables cannot be updated when binlog_format=MIXED. +# +set @tmp_bf= @@binlog_format; +set binlog_format='MIXED'; +create table t1 (pk int primary key) engine=rocksdb; +insert into t1 values (1); +drop table t1; +set @@binlog_format=@tmp_bf; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/rpl_statement.result b/storage/rocksdb/mysql-test/rocksdb/r/rpl_statement.result index cdf0c37e339..df1a60519db 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/rpl_statement.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/rpl_statement.result @@ -8,7 +8,7 @@ select @@binlog_format; STATEMENT create table t1 (pk int primary key) engine=rocksdb; insert into t1 values (1),(2),(3); -ERROR HY000: Can't execute updates on master with binlog_format != ROW. +ERROR HY000: Cannot execute statement: impossible to write to binary log since BINLOG_FORMAT = STATEMENT and at least one table uses a storage engine limited to row-based logging. set session rocksdb_unsafe_for_binlog=on; insert into t1 values (1),(2),(3); select * from t1; @@ -19,7 +19,7 @@ pk delete from t1; set session rocksdb_unsafe_for_binlog=off; insert into t1 values (1),(2),(3); -ERROR HY000: Can't execute updates on master with binlog_format != ROW. +ERROR HY000: Cannot execute statement: impossible to write to binary log since BINLOG_FORMAT = STATEMENT and at least one table uses a storage engine limited to row-based logging. set binlog_format=row; insert into t1 values (1),(2),(3); include/sync_slave_sql_with_master.inc diff --git a/storage/rocksdb/mysql-test/rocksdb/t/mariadb_misc_binlog.test b/storage/rocksdb/mysql-test/rocksdb/t/mariadb_misc_binlog.test index fb150f7a791..e32679e88a2 100644 --- a/storage/rocksdb/mysql-test/rocksdb/t/mariadb_misc_binlog.test +++ b/storage/rocksdb/mysql-test/rocksdb/t/mariadb_misc_binlog.test @@ -21,10 +21,20 @@ unlock tables; set @tmp_bf= @@binlog_format; set binlog_format='STATEMENT'; lock tables t1 write; ---error ER_REQUIRE_ROW_BINLOG_FORMAT +--error ER_BINLOG_STMT_MODE_AND_ROW_ENGINE insert into t1 values(1); unlock tables; set @@binlog_format=@tmp_bf; drop table t1; +--echo # +--echo # MDEV-17045: MyRocks tables cannot be updated when binlog_format=MIXED. 
+--echo # +set @tmp_bf= @@binlog_format; +set binlog_format='MIXED'; +create table t1 (pk int primary key) engine=rocksdb; +insert into t1 values (1); +drop table t1; +set @@binlog_format=@tmp_bf; + diff --git a/storage/rocksdb/mysql-test/rocksdb/t/rpl_statement.test b/storage/rocksdb/mysql-test/rocksdb/t/rpl_statement.test index 29671308e9c..cb5f5e04b00 100644 --- a/storage/rocksdb/mysql-test/rocksdb/t/rpl_statement.test +++ b/storage/rocksdb/mysql-test/rocksdb/t/rpl_statement.test @@ -12,7 +12,7 @@ connection master; select @@binlog_format; create table t1 (pk int primary key) engine=rocksdb; ---error ER_REQUIRE_ROW_BINLOG_FORMAT +--error ER_BINLOG_STMT_MODE_AND_ROW_ENGINE insert into t1 values (1),(2),(3); set session rocksdb_unsafe_for_binlog=on; @@ -21,7 +21,7 @@ select * from t1; delete from t1; set session rocksdb_unsafe_for_binlog=off; ---error ER_REQUIRE_ROW_BINLOG_FORMAT +--error ER_BINLOG_STMT_MODE_AND_ROW_ENGINE insert into t1 values (1),(2),(3); set binlog_format=row; -- cgit v1.2.1 From 5173e396ffce706954f7302e9854126aadab4c54 Mon Sep 17 00:00:00 2001 From: Sergei Petrunia Date: Sat, 15 Jun 2019 21:29:46 +0300 Subject: Copy of commit dcd9379eb5707bc7514a2ff4d9127790356505cb Author: Manuel Ung Date: Fri Jun 14 10:38:17 2019 -0700 Skip valgrind for rocksdb.force_shutdown Summary: This test does unclean shutdown, and leaks memory. Squash with: D15749084 Reviewed By: hermanlee Differential Revision: D15828957 fbshipit-source-id: 30541455d74 --- storage/rocksdb/.clang-format | 96 +- storage/rocksdb/CMakeLists.txt | 24 +- storage/rocksdb/event_listener.cc | 9 +- storage/rocksdb/event_listener.h | 6 +- storage/rocksdb/ha_rocksdb.cc | 3991 +++++++++++--------- storage/rocksdb/ha_rocksdb.h | 644 +--- storage/rocksdb/ha_rocksdb_proto.h | 4 +- storage/rocksdb/logger.h | 6 +- storage/rocksdb/myrocks_hotbackup | 20 +- storage/rocksdb/mysql-test/rocksdb/combinations | 1 - .../mysql-test/rocksdb/include/bulk_load.inc | 16 +- .../rocksdb/include/bulk_load_unsorted.inc | 5 +- .../rocksdb/include/bypass_create_table.inc | 298 ++ .../rocksdb/include/locking_issues_case3.inc | 5 +- .../rocksdb/include/locking_issues_case4.inc | 5 +- .../rocksdb/include/locking_issues_case5.inc | 5 +- .../rocksdb/include/locking_issues_case6.inc | 5 +- .../rocksdb/include/rocksdb_concurrent_delete.inc | 53 - .../rocksdb/include/use_direct_io_option.inc | 23 + .../r/add_index_inplace_sstfilewriter.result | 2 +- .../mysql-test/rocksdb/r/autoinc_vars.result | 21 + .../mysql-test/rocksdb/r/blind_delete_rc.result | 87 + .../mysql-test/rocksdb/r/blind_delete_rr.result | 87 + .../rocksdb/r/blind_delete_without_tx_api.result | 85 - .../mysql-test/rocksdb/r/bloomfilter3.result | 5 +- .../mysql-test/rocksdb/r/bloomfilter5.result | 25 +- .../rocksdb/mysql-test/rocksdb/r/bulk_load.result | 14 +- .../mysql-test/rocksdb/r/bulk_load_rev_cf.result | 14 +- .../rocksdb/r/bulk_load_rev_cf_and_data.result | 14 +- .../mysql-test/rocksdb/r/bulk_load_rev_data.result | 14 +- .../mysql-test/rocksdb/r/bulk_load_unsorted.result | 9 +- .../rocksdb/r/bulk_load_unsorted_rev.result | 9 +- .../rocksdb/r/bypass_select_basic.result | 693 ++++ .../rocksdb/r/bypass_select_basic_bloom.result | 693 ++++ .../mysql-test/rocksdb/r/check_flags.result | 47 + .../rocksdb/r/cons_snapshot_read_committed.result | 12 +- .../rocksdb/r/cons_snapshot_repeatable_read.result | 2 +- .../rocksdb/r/corrupted_data_reads_debug.result | 10 +- .../rocksdb/r/create_no_primary_key_table.result | 16 +- .../mysql-test/rocksdb/r/ddl_high_priority.result | 154 +- 
.../mysql-test/rocksdb/r/deadlock_stats.result | 3 +- .../mysql-test/rocksdb/r/delete_before_lock.result | 15 - .../mysql-test/rocksdb/r/force_shutdown.result | 38 + .../mysql-test/rocksdb/r/group_min_max.result | 3503 +++++++++++++++++ .../mysql-test/rocksdb/r/ha_extra_keyread.result | 10 + .../mysql-test/rocksdb/r/insert_with_keys.result | 199 +- .../rocksdb/mysql-test/rocksdb/r/issue255.result | 14 +- .../rocksdb/mysql-test/rocksdb/r/issue884.result | 79 + .../rocksdb/mysql-test/rocksdb/r/issue896.result | 17 + .../rocksdb/mysql-test/rocksdb/r/issue900.result | 11 + .../mysql-test/rocksdb/r/iterator_bounds.result | 15 + .../rocksdb/r/mysqlbinlog_blind_replace.result | 128 + .../rocksdb/mysql-test/rocksdb/r/mysqldump.result | 82 +- .../r/optimize_myrocks_replace_into_base.result | 98 + .../r/optimize_myrocks_replace_into_lock.result | 46 + .../rocksdb/r/prefix_extractor_override.result | 7 + .../rocksdb/mysql-test/rocksdb/r/rocksdb.result | 21 +- .../mysql-test/rocksdb/r/rocksdb_checksums.result | 2 +- .../rocksdb/r/rocksdb_concurrent_delete.result | 477 ++- .../rocksdb/r/rocksdb_read_free_rpl.result | 335 ++ .../rocksdb/r/rocksdb_read_free_rpl_stress.result | 35 + .../rocksdb/r/rocksdb_timeout_rollback.result | 72 + .../mysql-test/rocksdb/r/rpl_read_free.result | 321 -- .../rocksdb/r/rpl_row_not_found_rc.result | 56 + .../mysql-test/rocksdb/r/show_engine.result | 16 + .../mysql-test/rocksdb/r/show_table_status.result | 6 +- .../rocksdb/r/skip_core_dump_on_error.result | 31 + .../rocksdb/mysql-test/rocksdb/r/statistics.result | 18 +- .../rocksdb/r/tbl_opt_data_index_dir.result | 8 +- .../mysql-test/rocksdb/r/truncate_partition.result | 620 +++ .../mysql-test/rocksdb/r/ttl_rows_examined.result | 45 + .../mysql-test/rocksdb/r/type_decimal.result | 6 +- .../use_direct_io_for_flush_and_compaction.result | 18 + .../rocksdb/mysql-test/rocksdb/t/autoinc_vars.test | 20 + .../mysql-test/rocksdb/t/blind_delete_rc.cnf | 11 + .../mysql-test/rocksdb/t/blind_delete_rc.test | 3 + .../mysql-test/rocksdb/t/blind_delete_rr.cnf | 11 + .../mysql-test/rocksdb/t/blind_delete_rr.test | 3 + .../rocksdb/t/blind_delete_without_tx_api.cnf | 11 - .../rocksdb/t/blind_delete_without_tx_api.inc | 131 + .../rocksdb/t/blind_delete_without_tx_api.test | 129 - .../mysql-test/rocksdb/t/bloomfilter3-master.opt | 1 + .../rocksdb/mysql-test/rocksdb/t/bloomfilter3.test | 4 + .../mysql-test/rocksdb/t/bloomfilter5-master.opt | 2 +- .../rocksdb/mysql-test/rocksdb/t/bloomfilter5.test | 27 +- .../rocksdb/mysql-test/rocksdb/t/bulk_load.test | 1 + .../mysql-test/rocksdb/t/bulk_load_errors.test | 23 +- .../mysql-test/rocksdb/t/bulk_load_rev_cf.test | 1 + .../rocksdb/t/bulk_load_rev_cf_and_data.test | 1 + .../mysql-test/rocksdb/t/bulk_load_rev_data.test | 1 + .../mysql-test/rocksdb/t/bypass_select_basic.inc | 213 ++ .../mysql-test/rocksdb/t/bypass_select_basic.test | 3 + .../rocksdb/t/bypass_select_basic_bloom-master.opt | 3 + .../rocksdb/t/bypass_select_basic_bloom.test | 3 + .../rocksdb/mysql-test/rocksdb/t/check_flags.test | 117 + .../rocksdb/mysql-test/rocksdb/t/com_rpc_tx.test | 3 + .../rocksdb/t/cons_snapshot_read_committed.opt | 1 + .../rocksdb/t/cons_snapshot_repeatable_read.opt | 1 + .../rocksdb/t/cons_snapshot_serializable.opt | 1 + .../rocksdb/t/create_no_primary_key_table.test | 21 +- .../mysql-test/rocksdb/t/delete_before_lock.test | 36 - .../mysql-test/rocksdb/t/force_shutdown.test | 97 + .../mysql-test/rocksdb/t/group_min_max-master.opt | 1 + .../mysql-test/rocksdb/t/group_min_max.test | 8 + 
.../mysql-test/rocksdb/t/ha_extra_keyread.test | 15 + .../mysql-test/rocksdb/t/insert_with_keys.test | 104 +- storage/rocksdb/mysql-test/rocksdb/t/issue255.test | 16 +- storage/rocksdb/mysql-test/rocksdb/t/issue884.test | 43 + storage/rocksdb/mysql-test/rocksdb/t/issue896.test | 17 + storage/rocksdb/mysql-test/rocksdb/t/issue900.test | 13 + .../rocksdb/t/iterator_bounds-master.opt | 2 + .../mysql-test/rocksdb/t/iterator_bounds.test | 29 + .../rocksdb/t/level_read_uncommitted.opt | 1 + .../rocksdb/t/mysqlbinlog_blind_replace.test | 62 + .../rocksdb/mysql-test/rocksdb/t/mysqldump.test | 8 +- .../t/optimize_myrocks_replace_into_base.test | 96 + .../t/optimize_myrocks_replace_into_lock.test | 88 + .../rocksdb/t/prefix_extractor_override.test | 4 + storage/rocksdb/mysql-test/rocksdb/t/rocksdb.test | 9 +- .../rocksdb/t/rocksdb_concurrent_delete.inc | 106 + .../rocksdb/t/rocksdb_concurrent_delete.test | 52 +- .../rocksdb/t/rocksdb_concurrent_delete_main.inc | 30 + .../rocksdb/t/rocksdb_concurrent_delete_range.inc | 85 + .../rocksdb/t/rocksdb_concurrent_delete_sk.inc | 82 + .../mysql-test/rocksdb/t/rocksdb_locks.test | 5 +- .../mysql-test/rocksdb/t/rocksdb_read_free_rpl.cnf | 16 + .../rocksdb/t/rocksdb_read_free_rpl.test | 414 ++ .../rocksdb/t/rocksdb_read_free_rpl_stress.cnf | 17 + .../rocksdb/t/rocksdb_read_free_rpl_stress.inc | 69 + .../rocksdb/t/rocksdb_read_free_rpl_stress.test | 22 + .../rocksdb/t/rocksdb_timeout_rollback-master.opt | 1 + .../rocksdb/t/rocksdb_timeout_rollback.test | 78 + .../rocksdb/mysql-test/rocksdb/t/rpl_read_free.cnf | 14 - .../mysql-test/rocksdb/t/rpl_read_free.test | 302 -- .../mysql-test/rocksdb/t/rpl_row_not_found_rc.cnf | 11 + .../mysql-test/rocksdb/t/rpl_row_not_found_rc.test | 4 + .../mysql-test/rocksdb/t/rpl_row_triggers.cnf | 4 +- .../rocksdb/mysql-test/rocksdb/t/rqg_examples.test | 4 + .../rocksdb/mysql-test/rocksdb/t/rqg_runtime.test | 4 + .../mysql-test/rocksdb/t/rqg_transactions.test | 4 + .../rocksdb/t/skip_core_dump_on_error-master.opt | 1 + .../rocksdb/t/skip_core_dump_on_error.test | 53 + .../mysql-test/rocksdb/t/truncate_partition.inc | 102 + .../mysql-test/rocksdb/t/truncate_partition.test | 82 + .../mysql-test/rocksdb/t/ttl_rows_examined.test | 56 + .../rocksdb/mysql-test/rocksdb/t/unique_check.test | 3 +- .../t/use_direct_io_for_flush_and_compaction.test | 5 + .../mysql-test/rocksdb/t/use_direct_reads.test | 36 +- .../rocksdb/t/use_direct_reads_writes.test | 1 + .../rocksdb_hotbackup/include/clean_tmpfiles.sh | 8 + .../rocksdb_hotbackup/include/stream_run.sh | 30 +- .../rocksdb_hotbackup/r/xbstream_direct.result | 21 + .../mysql-test/rocksdb_hotbackup/t/xbstream.inc | 25 + .../mysql-test/rocksdb_hotbackup/t/xbstream.test | 30 +- .../rocksdb_hotbackup/t/xbstream_direct-master.opt | 1 + .../rocksdb_hotbackup/t/xbstream_direct.test | 7 + .../rocksdb/mysql-test/rocksdb_rpl/combinations | 6 +- .../rocksdb_rpl/include/rpl_gtid_crash_safe.inc | 37 + .../r/optimize_myrocks_replace_into.result | 282 ++ ...sdb_slave_check_before_image_consistency.result | 165 + .../r/rpl_gtid_crash_safe_optimized.result | 361 ++ .../r/rpl_rocksdb_slave_gtid_info_optimized.result | 43 + .../rocksdb/mysql-test/rocksdb_rpl/t/combinations | 2 - .../t/optimize_myrocks_replace_into.test | 149 + ..._slave_check_before_image_consistency-slave.opt | 1 + ...cksdb_slave_check_before_image_consistency.test | 22 + .../rocksdb_rpl/t/rpl_gtid_crash_safe.test | 39 +- .../t/rpl_gtid_crash_safe_optimized-master.opt | 1 + .../t/rpl_gtid_crash_safe_optimized-slave.opt | 2 + 
.../t/rpl_gtid_crash_safe_optimized.test | 11 + ...pl_rocksdb_slave_gtid_info_optimized-master.opt | 1 + ...rpl_rocksdb_slave_gtid_info_optimized-slave.opt | 1 + .../t/rpl_rocksdb_slave_gtid_info_optimized.test | 51 + .../r/rocksdb_cache_dump_basic.result | 19 + .../rocksdb_cache_high_pri_pool_ratio_basic.result | 22 + ...ndex_and_filter_with_high_priority_basic.result | 19 + ...sdb_commit_time_batch_for_recovery_basic.result | 28 +- .../r/rocksdb_delete_cf_basic.result | 6 + ..._enable_insert_with_update_caching_basic.result | 75 + .../r/rocksdb_read_free_rpl_basic.result | 58 + .../r/rocksdb_read_free_rpl_tables_basic.result | 50 +- .../r/rocksdb_rollback_on_timeout_basic.result | 97 + .../rocksdb_skip_unique_check_tables_basic.result | 2 + .../r/rocksdb_stats_level_basic.result | 85 + .../r/rocksdb_update_cf_options_basic.result | 4 + .../t/rocksdb_cache_dump_basic.test | 21 + .../t/rocksdb_cache_high_pri_pool_ratio_basic.test | 24 + ..._index_and_filter_with_high_priority_basic.test | 21 + .../t/rocksdb_delete_cf_basic-master.opt | 1 + .../t/rocksdb_delete_cf_basic.test | 75 + ...db_enable_insert_with_update_caching_basic.test | 21 + .../t/rocksdb_read_free_rpl_basic.test | 19 + .../t/rocksdb_read_free_rpl_tables_basic.test | 7 +- .../t/rocksdb_rollback_on_timeout_basic.test | 21 + .../t/rocksdb_skip_unique_check_tables_basic.test | 3 + .../t/rocksdb_stats_level_basic.test | 21 + .../t/rocksdb_update_cf_options_basic.test | 11 + storage/rocksdb/nosql_access.cc | 52 + storage/rocksdb/nosql_access.h | 35 + storage/rocksdb/properties_collector.cc | 96 +- storage/rocksdb/properties_collector.h | 53 +- storage/rocksdb/rdb_buff.h | 138 +- storage/rocksdb/rdb_cf_manager.cc | 112 +- storage/rocksdb/rdb_cf_manager.h | 12 +- storage/rocksdb/rdb_cf_options.cc | 70 +- storage/rocksdb/rdb_cf_options.h | 18 +- storage/rocksdb/rdb_compact_filter.h | 32 +- storage/rocksdb/rdb_comparator.h | 6 +- storage/rocksdb/rdb_converter.cc | 829 ++++ storage/rocksdb/rdb_converter.h | 247 ++ storage/rocksdb/rdb_datadic.cc | 1889 ++++----- storage/rocksdb/rdb_datadic.h | 518 ++- storage/rocksdb/rdb_global.h | 392 ++ storage/rocksdb/rdb_i_s.cc | 117 +- storage/rocksdb/rdb_i_s.h | 2 +- storage/rocksdb/rdb_index_merge.cc | 20 +- storage/rocksdb/rdb_index_merge.h | 15 +- storage/rocksdb/rdb_io_watchdog.cc | 11 +- storage/rocksdb/rdb_io_watchdog.h | 14 +- storage/rocksdb/rdb_mutex_wrapper.cc | 20 +- storage/rocksdb/rdb_mutex_wrapper.h | 40 +- storage/rocksdb/rdb_perf_context.cc | 24 +- storage/rocksdb/rdb_perf_context.h | 13 +- storage/rocksdb/rdb_psi.cc | 16 +- storage/rocksdb/rdb_psi.h | 3 +- storage/rocksdb/rdb_sst_info.cc | 233 +- storage/rocksdb/rdb_sst_info.h | 119 +- storage/rocksdb/rdb_threads.cc | 10 +- storage/rocksdb/rdb_threads.h | 20 +- storage/rocksdb/rdb_utils.cc | 26 +- storage/rocksdb/rdb_utils.h | 62 +- 231 files changed, 18466 insertions(+), 5443 deletions(-) create mode 100644 storage/rocksdb/mysql-test/rocksdb/include/bypass_create_table.inc delete mode 100644 storage/rocksdb/mysql-test/rocksdb/include/rocksdb_concurrent_delete.inc create mode 100644 storage/rocksdb/mysql-test/rocksdb/include/use_direct_io_option.inc create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/blind_delete_rc.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/blind_delete_rr.result delete mode 100644 storage/rocksdb/mysql-test/rocksdb/r/blind_delete_without_tx_api.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/bypass_select_basic.result create mode 100644 
storage/rocksdb/mysql-test/rocksdb/r/bypass_select_basic_bloom.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/check_flags.result delete mode 100644 storage/rocksdb/mysql-test/rocksdb/r/delete_before_lock.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/force_shutdown.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/group_min_max.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/ha_extra_keyread.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/issue884.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/issue896.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/issue900.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/iterator_bounds.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/mysqlbinlog_blind_replace.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/optimize_myrocks_replace_into_base.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/optimize_myrocks_replace_into_lock.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/rocksdb_read_free_rpl.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/rocksdb_read_free_rpl_stress.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/rocksdb_timeout_rollback.result delete mode 100644 storage/rocksdb/mysql-test/rocksdb/r/rpl_read_free.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/rpl_row_not_found_rc.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/skip_core_dump_on_error.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/truncate_partition.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/ttl_rows_examined.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/use_direct_io_for_flush_and_compaction.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/blind_delete_rc.cnf create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/blind_delete_rc.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/blind_delete_rr.cnf create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/blind_delete_rr.test delete mode 100644 storage/rocksdb/mysql-test/rocksdb/t/blind_delete_without_tx_api.cnf create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/blind_delete_without_tx_api.inc delete mode 100644 storage/rocksdb/mysql-test/rocksdb/t/blind_delete_without_tx_api.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/bypass_select_basic.inc create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/bypass_select_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/bypass_select_basic_bloom-master.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/bypass_select_basic_bloom.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/check_flags.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/cons_snapshot_read_committed.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/cons_snapshot_repeatable_read.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/cons_snapshot_serializable.opt delete mode 100644 storage/rocksdb/mysql-test/rocksdb/t/delete_before_lock.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/force_shutdown.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/group_min_max-master.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/group_min_max.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/ha_extra_keyread.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/issue884.test create mode 100644 
storage/rocksdb/mysql-test/rocksdb/t/issue896.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/issue900.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/iterator_bounds-master.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/iterator_bounds.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/level_read_uncommitted.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/mysqlbinlog_blind_replace.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/optimize_myrocks_replace_into_base.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/optimize_myrocks_replace_into_lock.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/rocksdb_concurrent_delete.inc create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/rocksdb_concurrent_delete_main.inc create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/rocksdb_concurrent_delete_range.inc create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/rocksdb_concurrent_delete_sk.inc create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/rocksdb_read_free_rpl.cnf create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/rocksdb_read_free_rpl.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/rocksdb_read_free_rpl_stress.cnf create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/rocksdb_read_free_rpl_stress.inc create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/rocksdb_read_free_rpl_stress.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/rocksdb_timeout_rollback-master.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/rocksdb_timeout_rollback.test delete mode 100644 storage/rocksdb/mysql-test/rocksdb/t/rpl_read_free.cnf delete mode 100644 storage/rocksdb/mysql-test/rocksdb/t/rpl_read_free.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/rpl_row_not_found_rc.cnf create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/rpl_row_not_found_rc.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/skip_core_dump_on_error-master.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/skip_core_dump_on_error.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/truncate_partition.inc create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/truncate_partition.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/ttl_rows_examined.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/use_direct_io_for_flush_and_compaction.test create mode 100755 storage/rocksdb/mysql-test/rocksdb_hotbackup/include/clean_tmpfiles.sh create mode 100644 storage/rocksdb/mysql-test/rocksdb_hotbackup/r/xbstream_direct.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_hotbackup/t/xbstream.inc create mode 100644 storage/rocksdb/mysql-test/rocksdb_hotbackup/t/xbstream_direct-master.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb_hotbackup/t/xbstream_direct.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/include/rpl_gtid_crash_safe.inc create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/r/optimize_myrocks_replace_into.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/r/rocksdb_slave_check_before_image_consistency.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/r/rpl_gtid_crash_safe_optimized.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/r/rpl_rocksdb_slave_gtid_info_optimized.result delete mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/t/combinations create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/t/optimize_myrocks_replace_into.test create 
mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/t/rocksdb_slave_check_before_image_consistency-slave.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/t/rocksdb_slave_check_before_image_consistency.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_gtid_crash_safe_optimized-master.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_gtid_crash_safe_optimized-slave.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_gtid_crash_safe_optimized.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_rocksdb_slave_gtid_info_optimized-master.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_rocksdb_slave_gtid_info_optimized-slave.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_rocksdb_slave_gtid_info_optimized.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_cache_dump_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_cache_high_pri_pool_ratio_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_cache_index_and_filter_with_high_priority_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_delete_cf_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_enable_insert_with_update_caching_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_read_free_rpl_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_rollback_on_timeout_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_stats_level_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_cache_dump_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_cache_high_pri_pool_ratio_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_cache_index_and_filter_with_high_priority_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_delete_cf_basic-master.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_delete_cf_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_enable_insert_with_update_caching_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_read_free_rpl_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_rollback_on_timeout_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_stats_level_basic.test create mode 100644 storage/rocksdb/nosql_access.cc create mode 100644 storage/rocksdb/nosql_access.h create mode 100644 storage/rocksdb/rdb_converter.cc create mode 100644 storage/rocksdb/rdb_converter.h create mode 100644 storage/rocksdb/rdb_global.h diff --git a/storage/rocksdb/.clang-format b/storage/rocksdb/.clang-format index d80b012dd4b..b1df76bdf2d 100644 --- a/storage/rocksdb/.clang-format +++ b/storage/rocksdb/.clang-format @@ -1,23 +1,49 @@ ---- +# Copyright (c) 2016, 2018, Oracle and/or its affiliates. All rights reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License, version 2.0, +# as published by the Free Software Foundation. 
+# +# This program is also distributed with certain software (including +# but not limited to OpenSSL) that is licensed under separate terms, +# as designated in a particular file or component or in included license +# documentation. The authors of MySQL hereby grant you an additional +# permission to link the program and your derivative works with the +# separately licensed software that they have included with MySQL. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License, version 2.0, for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +# This is the output of clang-format-5.0 --style=google --dump-config, +# except for changes mentioned below. We lock the style so that any newer +# version of clang-format will give the same result; as time goes, we may +# update this list, requiring newer versions of clang-format. + Language: Cpp -# BasedOnStyle: LLVM -AccessModifierOffset: -2 +# BasedOnStyle: Google +AccessModifierOffset: -1 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false -AlignEscapedNewlinesLeft: false +AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: true AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: All -AllowShortIfStatementsOnASingleLine: false -AllowShortLoopsOnASingleLine: false +AllowShortIfStatementsOnASingleLine: true +AllowShortLoopsOnASingleLine: true AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: false -AlwaysBreakTemplateDeclarations: false +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: true BinPackArguments: true BinPackParameters: true BraceWrapping: @@ -32,62 +58,80 @@ BraceWrapping: BeforeCatch: false BeforeElse: false IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true BreakBeforeBinaryOperators: None BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon BreakAfterJavaFieldAnnotations: false BreakStringLiterals: true ColumnLimit: 80 CommentPragmas: '^ IWYU pragma:' -ConstructorInitializerAllOnOneLineOrOnePerLine: false +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true -DerivePointerAlignment: false DisableFormat: false ExperimentalAutoDetectBinPacking: false -ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH IncludeCategories: - - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + - Regex: '^<.*\.h>' + Priority: 1 + - Regex: '^<.*' Priority: 2 - - Regex: '^(<|"(gtest|isl|json)/)' - Priority: 3 - Regex: '.*' - Priority: 1 -IncludeIsMainRegex: '$' -IndentCaseLabels: false + Priority: 3 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IndentCaseLabels: true IndentWidth: 2 IndentWrappedFunctionNames: false -KeepEmptyLinesAtTheStartOfBlocks: true +JavaScriptQuotes: Leave 
+JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCBlockIndentWidth: 2 ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true -PenaltyBreakBeforeFirstCallParameter: 19 +ObjCSpaceBeforeProtocolList: false +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 60 -PointerAlignment: Right +PenaltyReturnTypeOnItsOwnLine: 200 ReflowComments: true SortIncludes: true +SortUsingDeclarations: true SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true SpaceBeforeAssignmentOperators: true SpaceBeforeParens: ControlStatements SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 1 +SpacesBeforeTrailingComments: 2 SpacesInAngles: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false -Standard: Cpp11 TabWidth: 8 UseTab: Never -JavaScriptQuotes: Leave -... + +# We declare one specific pointer style since right alignment is dominant in +# the MySQL code base (default --style=google has DerivePointerAlignment true). +DerivePointerAlignment: false +PointerAlignment: Right + +# MySQL source code is allowed to use C++11 features. +Standard: Cpp11 diff --git a/storage/rocksdb/CMakeLists.txt b/storage/rocksdb/CMakeLists.txt index 54b0b1fab6a..1facdf1f148 100644 --- a/storage/rocksdb/CMakeLists.txt +++ b/storage/rocksdb/CMakeLists.txt @@ -60,6 +60,7 @@ SET(ROCKSDB_SOURCES rdb_datadic.cc rdb_datadic.h rdb_cf_options.cc rdb_cf_options.h rdb_cf_manager.cc rdb_cf_manager.h + rdb_converter.cc rdb_converter.h properties_collector.cc properties_collector.h event_listener.cc event_listener.h rdb_i_s.cc rdb_i_s.h @@ -71,6 +72,7 @@ SET(ROCKSDB_SOURCES rdb_sst_info.cc rdb_sst_info.h rdb_utils.cc rdb_utils.h rdb_buff.h rdb_threads.cc rdb_threads.h + nosql_access.cc nosql_access.h ${ROCKSDB_LIB_SOURCES} ) @@ -81,6 +83,25 @@ ELSE() ENDIF() SET(rocksdb_static_libs ) + +IF (WITH_JEMALLOC) + FIND_LIBRARY(JEMALLOC_LIBRARY + NAMES libjemalloc${PIC_EXT}.a jemalloc + HINTS ${WITH_JEMALLOC}/lib) + SET(rocksdb_static_libs ${rocksdb_static_libs} + ${JEMALLOC_LIBRARY}) + ADD_DEFINITIONS(-DROCKSDB_JEMALLOC) + ADD_DEFINITIONS(-DROCKSDB_MALLOC_USABLE_SIZE) +ENDIF() + +IF (WITH_UNWIND) + FIND_LIBRARY(UNWIND_LIBRARY + NAMES libunwind${PIC_EXT}.a unwind + HINTS ${WITH_UNWIND}/lib) + SET(rocksdb_static_libs ${rocksdb_static_libs} + ${UNWIND_LIBRARY}) +ENDIF() + IF (WITH_SNAPPY) FIND_LIBRARY(SNAPPY_LIBRARY NAMES libsnappy${PIC_EXT}.a snappy @@ -111,6 +132,7 @@ ENDIF() IF (WITH_ZSTD) SET(rocksdb_static_libs ${rocksdb_static_libs} ${ZSTD_LIBRARY}) ADD_DEFINITIONS(-DZSTD) + ADD_DEFINITIONS(-DZSTD_STATIC_LINKING_ONLY) ENDIF() IF (WITH_TBB) @@ -129,7 +151,7 @@ IF (HAVE_ZLIB_H) SET(rocksdb_static_libs ${rocksdb_static_libs} ${ZLIB_LIBRARY}) ENDIF() -SET(rocksdb_static_libs ${rocksdb_static_libs} "-lrt") +SET(rocksdb_static_libs ${rocksdb_static_libs} "-lrt" "-ldl" "-lpthread") MYSQL_ADD_PLUGIN(rocksdb_se ${ROCKSDB_SOURCES} STORAGE_ENGINE DEFAULT STATIC_ONLY LINK_LIBRARIES ${rocksdb_static_libs} diff --git a/storage/rocksdb/event_listener.cc b/storage/rocksdb/event_listener.cc index 2da77a16f68..56be8719de8 100644 --- a/storage/rocksdb/event_listener.cc +++ b/storage/rocksdb/event_listener.cc @@ -32,9 +32,9 @@ namespace myrocks { -static std::vector -extract_index_stats(const 
std::vector &files, - const rocksdb::TablePropertiesCollection &props) { +static std::vector extract_index_stats( + const std::vector &files, + const rocksdb::TablePropertiesCollection &props) { std::vector ret; for (auto fn : files) { const auto it = props.find(fn); @@ -85,10 +85,11 @@ void Rdb_event_listener::OnExternalFileIngested( void Rdb_event_listener::OnBackgroundError( rocksdb::BackgroundErrorReason reason, rocksdb::Status *status) { rdb_log_status_error(*status, "Error detected in background"); + // NO_LINT_DEBUG sql_print_error("RocksDB: BackgroundErrorReason: %d", (int)reason); if (status->IsCorruption()) { rdb_persist_corruption_marker(); abort(); } } -} // namespace myrocks +} // namespace myrocks diff --git a/storage/rocksdb/event_listener.h b/storage/rocksdb/event_listener.h index 8772105de36..7daba433c17 100644 --- a/storage/rocksdb/event_listener.h +++ b/storage/rocksdb/event_listener.h @@ -22,7 +22,7 @@ namespace myrocks { class Rdb_ddl_manager; class Rdb_event_listener : public rocksdb::EventListener { -public: + public: Rdb_event_listener(const Rdb_event_listener &) = delete; Rdb_event_listener &operator=(const Rdb_event_listener &) = delete; @@ -40,10 +40,10 @@ public: void OnBackgroundError(rocksdb::BackgroundErrorReason reason, rocksdb::Status *status) override; -private: + private: Rdb_ddl_manager *m_ddl_manager; void update_index_stats(const rocksdb::TableProperties &props); }; -} // namespace myrocks +} // namespace myrocks diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index c7d7cc0bdc4..cb7e24b162f 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -15,7 +15,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifdef USE_PRAGMA_IMPLEMENTATION -#pragma implementation // gcc: Class implementation +#pragma implementation // gcc: Class implementation #endif #define MYSQL_SERVER 1 @@ -24,8 +24,8 @@ #include "./ha_rocksdb.h" /* C++ standard header files */ -#include #include +#include #include #include #include @@ -34,24 +34,21 @@ #include /* MySQL includes */ +#include +#include +#include #include "./debug_sync.h" #include "./my_bit.h" #include "./my_stacktrace.h" #include "./my_sys.h" #include "./sql_audit.h" #include "./sql_table.h" -#include -#include -#include - -// Both MySQL and RocksDB define the same constant. To avoid compilation errors -// till we make the fix in RocksDB, we'll temporary undefine it here. 
-#undef CACHE_LINE_SIZE /* RocksDB includes */ #include "monitoring/histogram.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" +#include "rocksdb/memory_allocator.h" #include "rocksdb/persistent_cache.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/slice_transform.h" @@ -60,14 +57,17 @@ #include "rocksdb/utilities/convenience.h" #include "rocksdb/utilities/memory_util.h" #include "rocksdb/utilities/sim_cache.h" +#include "rocksdb/utilities/write_batch_with_index.h" #include "util/stop_watch.h" /* MyRocks includes */ #include "./event_listener.h" #include "./ha_rocksdb_proto.h" #include "./logger.h" +#include "./nosql_access.h" #include "./rdb_cf_manager.h" #include "./rdb_cf_options.h" +#include "./rdb_converter.h" #include "./rdb_datadic.h" #include "./rdb_i_s.h" #include "./rdb_index_merge.h" @@ -88,17 +88,19 @@ void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all); * Get the user thread's binary logging format * @param thd user thread * @return Value to be used as index into the binlog_format_names array -*/ + */ int thd_binlog_format(const MYSQL_THD thd); /** * Check if binary logging is filtered for thread's current db. * @param thd Thread handle * @retval 1 the query is not filtered, 0 otherwise. -*/ + */ bool thd_binlog_filter_ok(const MYSQL_THD thd); } +extern my_bool opt_core_file; + namespace myrocks { static st_global_stats global_stats; @@ -110,21 +112,13 @@ const std::string DEFAULT_CF_NAME("default"); const std::string DEFAULT_SYSTEM_CF_NAME("__system__"); const std::string PER_INDEX_CF_NAME("$per_index_cf"); -class Rdb_explicit_snapshot; - -std::mutex explicit_snapshot_mutex; -ulonglong explicit_snapshot_counter = 0; -std::unordered_map> - explicit_snapshots; static std::vector rdb_indexes_to_recalc; class Rdb_explicit_snapshot : public explicit_snapshot { - std::unique_ptr snapshot; - public: - static std::shared_ptr - create(snapshot_info_st *ss_info, rocksdb::DB *db, - const rocksdb::Snapshot *snapshot) { + static std::shared_ptr create( + snapshot_info_st *ss_info, rocksdb::DB *db, + const rocksdb::Snapshot *snapshot) { std::lock_guard lock(explicit_snapshot_mutex); auto s = std::unique_ptr( new rocksdb::ManagedSnapshot(db, snapshot)); @@ -140,8 +134,24 @@ class Rdb_explicit_snapshot : public explicit_snapshot { return ret; } - static std::shared_ptr - get(const ulonglong snapshot_id) { + static std::string dump_snapshots() { + std::string str; + std::lock_guard lock(explicit_snapshot_mutex); + for (const auto &elem : explicit_snapshots) { + const auto &ss = elem.second.lock(); + DBUG_ASSERT(ss != nullptr); + const auto &info = ss->ss_info; + str += "\nSnapshot ID: " + std::to_string(info.snapshot_id) + + "\nBinlog File: " + info.binlog_file + + "\nBinlog Pos: " + std::to_string(info.binlog_pos) + + "\nGtid Executed: " + info.gtid_executed + "\n"; + } + + return str; + } + + static std::shared_ptr get( + const ulonglong snapshot_id) { std::lock_guard lock(explicit_snapshot_mutex); auto elem = explicit_snapshots.find(snapshot_id); if (elem == explicit_snapshots.end()) { @@ -153,25 +163,39 @@ class Rdb_explicit_snapshot : public explicit_snapshot { rocksdb::ManagedSnapshot *get_snapshot() { return snapshot.get(); } Rdb_explicit_snapshot(snapshot_info_st ss_info, - std::unique_ptr snapshot) + std::unique_ptr &&snapshot) : explicit_snapshot(ss_info), snapshot(std::move(snapshot)) {} virtual ~Rdb_explicit_snapshot() { std::lock_guard lock(explicit_snapshot_mutex); explicit_snapshots.erase(ss_info.snapshot_id); } + + private: + std::unique_ptr 
snapshot; + + static std::mutex explicit_snapshot_mutex; + static ulonglong explicit_snapshot_counter; + static std::unordered_map> + explicit_snapshots; }; +std::mutex Rdb_explicit_snapshot::explicit_snapshot_mutex; +ulonglong Rdb_explicit_snapshot::explicit_snapshot_counter = 0; +std::unordered_map> + Rdb_explicit_snapshot::explicit_snapshots; + /** Updates row counters based on the table type and operation type. */ void ha_rocksdb::update_row_stats(const operation_type &type) { DBUG_ASSERT(type < ROWS_MAX); // Find if we are modifying system databases. - if (table->s && m_tbl_def->m_is_mysql_system_table) + if (table->s && m_tbl_def->m_is_mysql_system_table) { global_stats.system_rows[type].inc(); - else + } else { global_stats.rows[type].inc(); + } } void dbug_dump_database(rocksdb::DB *db); @@ -179,8 +203,8 @@ static handler *rocksdb_create_handler(my_core::handlerton *hton, my_core::TABLE_SHARE *table_arg, my_core::MEM_ROOT *mem_root); -static rocksdb::CompactRangeOptions -getCompactRangeOptions(int concurrency = 0) { +static rocksdb::CompactRangeOptions getCompactRangeOptions( + int concurrency = 0) { rocksdb::CompactRangeOptions compact_range_options; compact_range_options.bottommost_level_compaction = rocksdb::BottommostLevelCompaction::kForce; @@ -239,32 +263,69 @@ static void rocksdb_flush_all_memtables() { } } +static void rocksdb_delete_column_family_stub( + THD *const /* thd */, struct st_mysql_sys_var *const /* var */, + void *const /* var_ptr */, const void *const /* save */) {} + +static int rocksdb_delete_column_family( + THD *const /* thd */, struct st_mysql_sys_var *const /* var */, + void *const /* var_ptr */, struct st_mysql_value *const value) { + // Return failure for now until the race condition between creating + // CF and deleting CF is resolved + return HA_EXIT_FAILURE; + + char buff[STRING_BUFFER_USUAL_SIZE]; + int len = sizeof(buff); + + DBUG_ASSERT(value != nullptr); + + if (const char *const cf = value->val_str(value, buff, &len)) { + auto &cf_manager = rdb_get_cf_manager(); + auto ret = cf_manager.drop_cf(cf); + if (ret == HA_EXIT_SUCCESS) { + // NO_LINT_DEBUG + sql_print_information("RocksDB: Dropped column family: %s\n", cf); + } else { + // NO_LINT_DEBUG + sql_print_error("RocksDB: Failed to drop column family: %s, error: %d\n", + cf, ret); + } + + return ret; + } + + return HA_EXIT_SUCCESS; +} + /////////////////////////////////////////////////////////// // Hash map: table name => open table handler /////////////////////////////////////////////////////////// -namespace // anonymous namespace = not visible outside this source file +namespace // anonymous namespace = not visible outside this source file { const ulong TABLE_HASH_SIZE = 32; -struct Rdb_open_tables_map { +class Rdb_open_tables_map { + private: /* Hash table used to track the handlers of open tables */ - my_core::HASH m_hash; + std::unordered_map m_table_map; + /* The mutex used to protect the hash table */ mutable mysql_mutex_t m_mutex; - void init_hash(void) { - (void)my_hash_init(&m_hash, my_core::system_charset_info, TABLE_HASH_SIZE, - 0, 0, (my_hash_get_key)Rdb_open_tables_map::get_hash_key, - 0, 0); + public: + void init() { + m_table_map.clear(); + mysql_mutex_init(rdb_psi_open_tbls_mutex_key, &m_mutex, MY_MUTEX_INIT_FAST); } - void free_hash(void) { my_hash_free(&m_hash); } + void free() { + m_table_map.clear(); + mysql_mutex_destroy(&m_mutex); + } - static uchar *get_hash_key(Rdb_table_handler *const table_handler, - size_t *const length, - my_bool not_used 
MY_ATTRIBUTE((__unused__))); + size_t count() { return m_table_map.size(); } Rdb_table_handler *get_table_handler(const char *const table_name); void release_table_handler(Rdb_table_handler *const table_handler); @@ -272,7 +333,7 @@ struct Rdb_open_tables_map { std::vector get_table_names(void) const; }; -} // anonymous namespace +} // anonymous namespace static Rdb_open_tables_map rdb_open_tables; @@ -307,6 +368,7 @@ static int rocksdb_create_checkpoint( status = checkpoint->CreateCheckpoint(checkpoint_dir.c_str()); delete checkpoint; if (status.ok()) { + // NO_LINT_DEBUG sql_print_information( "RocksDB: created checkpoint in directory : %s\n", checkpoint_dir.c_str()); @@ -336,6 +398,7 @@ static void rocksdb_force_flush_memtable_now_stub( static int rocksdb_force_flush_memtable_now( THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr, struct st_mysql_value *const value) { + // NO_LINT_DEBUG sql_print_information("RocksDB: Manual memtable flush."); rocksdb_flush_all_memtables(); return HA_EXIT_SUCCESS; @@ -348,6 +411,7 @@ static void rocksdb_force_flush_memtable_and_lzero_now_stub( static int rocksdb_force_flush_memtable_and_lzero_now( THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr, struct st_mysql_value *const value) { + // NO_LINT_DEBUG sql_print_information("RocksDB: Manual memtable and L0 flush."); rocksdb_flush_all_memtables(); @@ -356,29 +420,46 @@ static int rocksdb_force_flush_memtable_and_lzero_now( rocksdb::ColumnFamilyMetaData metadata; rocksdb::ColumnFamilyDescriptor cf_descr; + int i, max_attempts = 3, num_errors = 0; + for (const auto &cf_handle : cf_manager.get_all_cf()) { - rdb->GetColumnFamilyMetaData(cf_handle, &metadata); - cf_handle->GetDescriptor(&cf_descr); - c_options.output_file_size_limit = cf_descr.options.target_file_size_base; + for (i = 0; i < max_attempts; i++) { + rdb->GetColumnFamilyMetaData(cf_handle, &metadata); + cf_handle->GetDescriptor(&cf_descr); + c_options.output_file_size_limit = cf_descr.options.target_file_size_base; + + DBUG_ASSERT(metadata.levels[0].level == 0); + std::vector file_names; + for (auto &file : metadata.levels[0].files) { + file_names.emplace_back(file.db_path + file.name); + } - DBUG_ASSERT(metadata.levels[0].level == 0); - std::vector file_names; - for (auto &file : metadata.levels[0].files) { - file_names.emplace_back(file.db_path + file.name); - } + if (file_names.empty()) { + break; + } - if (!file_names.empty()) { rocksdb::Status s; s = rdb->CompactFiles(c_options, cf_handle, file_names, 1); + // Due to a race, it's possible for CompactFiles to collide + // with auto compaction, causing an error to return + // regarding file not found. In that case, retry. + if (s.IsInvalidArgument()) { + continue; + } + if (!s.ok() && !s.IsAborted()) { rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL); return HA_EXIT_FAILURE; } + break; + } + if (i == max_attempts) { + num_errors++; } } - return HA_EXIT_SUCCESS; + return num_errors == 0 ? 
HA_EXIT_SUCCESS : HA_EXIT_FAILURE; } static void rocksdb_drop_index_wakeup_thread( @@ -449,11 +530,9 @@ static void rocksdb_set_update_cf_options(THD *thd, struct st_mysql_sys_var *var, void *var_ptr, const void *save); -static int rocksdb_check_bulk_load(THD *const thd, - struct st_mysql_sys_var *var - MY_ATTRIBUTE((__unused__)), - void *save, - struct st_mysql_value *value); +static int rocksdb_check_bulk_load( + THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), + void *save, struct st_mysql_value *value); static int rocksdb_check_bulk_load_allow_unsorted( THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), @@ -480,6 +559,8 @@ static int rocksdb_validate_set_block_cache_size( static long long rocksdb_block_cache_size; static long long rocksdb_sim_cache_size; static my_bool rocksdb_use_clock_cache; +static double rocksdb_cache_high_pri_pool_ratio; +static my_bool rocksdb_cache_dump; /* Use unsigned long long instead of uint64_t because of MySQL compatibility */ static unsigned long long // NOLINT(runtime/int) rocksdb_rate_limiter_bytes_per_sec; @@ -499,8 +580,10 @@ static my_bool rocksdb_force_compute_memtable_stats; static uint32_t rocksdb_force_compute_memtable_stats_cachetime; static my_bool rocksdb_debug_optimizer_no_zero_cardinality; static uint32_t rocksdb_wal_recovery_mode; +static uint32_t rocksdb_stats_level; static uint32_t rocksdb_access_hint_on_compaction_start; static char *rocksdb_compact_cf_name; +static char *rocksdb_delete_cf_name; static char *rocksdb_checkpoint_name; static my_bool rocksdb_signal_drop_index_thread; static my_bool rocksdb_strict_collation_check = 1; @@ -531,10 +614,21 @@ static my_bool rocksdb_large_prefix = 0; static my_bool rocksdb_allow_to_start_after_corruption = 0; static uint64_t rocksdb_write_policy = rocksdb::TxnDBWritePolicy::WRITE_COMMITTED; +char *rocksdb_read_free_rpl_tables; +std::mutex rocksdb_read_free_rpl_tables_mutex; +#if defined(HAVE_PSI_INTERFACE) +Regex_list_handler rdb_read_free_regex_handler(key_rwlock_read_free_rpl_tables); +#else +Regex_list_handler rdb_read_free_regex_handler; +#endif +enum read_free_rpl_type { OFF = 0, PK_ONLY, PK_SK }; +static uint64_t rocksdb_read_free_rpl = read_free_rpl_type::OFF; static my_bool rocksdb_error_on_suboptimal_collation = 1; static uint32_t rocksdb_stats_recalc_rate = 0; static uint32_t rocksdb_debug_manual_compaction_delay = 0; static uint32_t rocksdb_max_manual_compactions = 0; +static my_bool rocksdb_rollback_on_timeout = FALSE; +static my_bool rocksdb_enable_insert_with_update_caching = TRUE; std::atomic rocksdb_row_lock_deadlocks(0); std::atomic rocksdb_row_lock_wait_timeouts(0); @@ -542,6 +636,9 @@ std::atomic rocksdb_snapshot_conflict_errors(0); std::atomic rocksdb_wal_group_syncs(0); std::atomic rocksdb_manual_compactions_processed(0); std::atomic rocksdb_manual_compactions_running(0); +#ifndef DBUG_OFF +std::atomic rocksdb_num_get_for_update_calls(0); +#endif static std::unique_ptr rdb_init_rocksdb_db_options(void) { auto o = std::unique_ptr(new rocksdb::DBOptions()); @@ -550,7 +647,7 @@ static std::unique_ptr rdb_init_rocksdb_db_options(void) { o->listeners.push_back(std::make_shared(&ddl_manager)); o->info_log_level = rocksdb::InfoLogLevel::INFO_LEVEL; o->max_subcompactions = DEFAULT_SUBCOMPACTIONS; - o->max_open_files = -2; // auto-tune to 50% open_files_limit + o->max_open_files = -2; // auto-tune to 50% open_files_limit o->two_write_queues = true; o->manual_wal_flush = true; @@ -574,6 +671,13 @@ static TYPELIB write_policy_typelib = 
{array_elements(write_policy_names) - 1, "write_policy_typelib", write_policy_names, nullptr}; +/* This array needs to be kept up to date with myrocks::read_free_rpl_type */ +static const char *read_free_rpl_names[] = {"OFF", "PK_ONLY", "PK_SK", NullS}; + +static TYPELIB read_free_rpl_typelib = {array_elements(read_free_rpl_names) - 1, + "read_free_rpl_typelib", + read_free_rpl_names, nullptr}; + /* This enum needs to be kept up to date with rocksdb::InfoLogLevel */ static const char *info_log_level_names[] = {"debug_level", "info_level", "warn_level", "error_level", @@ -595,6 +699,23 @@ static void rocksdb_set_rocksdb_info_log_level( RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); } +static void rocksdb_set_rocksdb_stats_level(THD *const thd, + struct st_mysql_sys_var *const var, + void *const var_ptr, + const void *const save) { + DBUG_ASSERT(save != nullptr); + + RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex); + rocksdb_db_options->statistics->set_stats_level( + static_cast( + *static_cast(save))); + // Actual stats level is defined at rocksdb dbopt::statistics::stats_level_ + // so adjusting rocksdb_stats_level here to make sure it points to + // the correct stats level. + rocksdb_stats_level = rocksdb_db_options->statistics->get_stats_level(); + RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); +} + static void rocksdb_set_reset_stats( my_core::THD *const /* unused */, my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)), @@ -716,7 +837,7 @@ static MYSQL_THDVAR_ULONG(deadlock_detect_depth, PLUGIN_VAR_RQCMDARG, static MYSQL_THDVAR_BOOL( commit_time_batch_for_recovery, PLUGIN_VAR_RQCMDARG, "TransactionOptions::commit_time_batch_for_recovery for RocksDB", nullptr, - nullptr, FALSE); + nullptr, TRUE); static MYSQL_THDVAR_BOOL( trace_sst_api, PLUGIN_VAR_RQCMDARG, @@ -751,10 +872,11 @@ static MYSQL_THDVAR_STR(tmpdir, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC, "Directory for temporary files during DDL operations.", nullptr, nullptr, ""); +#define DEFAULT_SKIP_UNIQUE_CHECK_TABLES ".*" static MYSQL_THDVAR_STR( skip_unique_check_tables, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, "Skip unique constraint checking for the specified tables", nullptr, - nullptr, ".*"); + nullptr, DEFAULT_SKIP_UNIQUE_CHECK_TABLES); static MYSQL_THDVAR_BOOL( commit_in_the_middle, PLUGIN_VAR_RQCMDARG, @@ -768,11 +890,80 @@ static MYSQL_THDVAR_BOOL( " Blind delete is disabled if the table has secondary key", nullptr, nullptr, FALSE); -static MYSQL_THDVAR_STR( - read_free_rpl_tables, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, +static const char *DEFAULT_READ_FREE_RPL_TABLES = ".*"; + +static int rocksdb_validate_read_free_rpl_tables( + THD *thd MY_ATTRIBUTE((__unused__)), + struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), void *save, + struct st_mysql_value *value) { + char buff[STRING_BUFFER_USUAL_SIZE]; + int length = sizeof(buff); + const char *wlist_buf = value->val_str(value, buff, &length); + const auto wlist = wlist_buf ? 
wlist_buf : DEFAULT_READ_FREE_RPL_TABLES; + +#if defined(HAVE_PSI_INTERFACE) + Regex_list_handler regex_handler(key_rwlock_read_free_rpl_tables); +#else + Regex_list_handler regex_handler; +#endif + + if (!regex_handler.set_patterns(wlist)) { + warn_about_bad_patterns(®ex_handler, "rocksdb_read_free_rpl_tables"); + return HA_EXIT_FAILURE; + } + + *static_cast(save) = my_strdup(wlist, MYF(MY_WME)); + return HA_EXIT_SUCCESS; +} + +static void rocksdb_update_read_free_rpl_tables( + THD *thd MY_ATTRIBUTE((__unused__)), + struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), void *var_ptr, + const void *save) { + const auto wlist = *static_cast(save); + DBUG_ASSERT(wlist != nullptr); + + // This is bound to succeed since we've already checked for bad patterns in + // rocksdb_validate_read_free_rpl_tables + rdb_read_free_regex_handler.set_patterns(wlist); + + // update all table defs + struct Rdb_read_free_rpl_updater : public Rdb_tables_scanner { + int add_table(Rdb_tbl_def *tdef) override { + tdef->check_and_set_read_free_rpl_table(); + return HA_EXIT_SUCCESS; + } + } updater; + ddl_manager.scan_for_tables(&updater); + + if (wlist == DEFAULT_READ_FREE_RPL_TABLES) { + // If running SET var = DEFAULT, then rocksdb_validate_read_free_rpl_tables + // isn't called, and memory is never allocated for the value. Allocate it + // here. + *static_cast(var_ptr) = my_strdup(wlist, MYF(MY_WME)); + } else { + // Otherwise, we just reuse the value allocated from + // rocksdb_validate_read_free_rpl_tables. + *static_cast(var_ptr) = wlist; + } +} + +static MYSQL_SYSVAR_STR( + read_free_rpl_tables, rocksdb_read_free_rpl_tables, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC | PLUGIN_VAR_ALLOCATED, "List of tables that will use read-free replication on the slave " "(i.e. not lookup a row during replication)", - nullptr, nullptr, ""); + rocksdb_validate_read_free_rpl_tables, rocksdb_update_read_free_rpl_tables, + DEFAULT_READ_FREE_RPL_TABLES); + +static MYSQL_SYSVAR_ENUM( + read_free_rpl, rocksdb_read_free_rpl, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, + "Use read-free replication on the slave (i.e. no row lookup during " + "replication). Default is OFF, PK_SK will enable it on all tables with " + "primary key. PK_ONLY will enable it on tables where the only key is the " + "primary key (i.e. no secondary keys).", + nullptr, nullptr, read_free_rpl_type::OFF, &read_free_rpl_typelib); static MYSQL_THDVAR_BOOL(skip_bloom_filter_on_read, PLUGIN_VAR_RQCMDARG, "Skip using bloom filter for reads", nullptr, nullptr, @@ -940,6 +1131,14 @@ static MYSQL_SYSVAR_UINT( /* min */ (uint)rocksdb::WALRecoveryMode::kTolerateCorruptedTailRecords, /* max */ (uint)rocksdb::WALRecoveryMode::kSkipAnyCorruptedRecords, 0); +static MYSQL_SYSVAR_UINT( + stats_level, rocksdb_stats_level, PLUGIN_VAR_RQCMDARG, + "Statistics Level for RocksDB. 
Default is 0 (kExceptHistogramOrTimers)", + nullptr, rocksdb_set_rocksdb_stats_level, + /* default */ (uint)rocksdb::StatsLevel::kExceptHistogramOrTimers, + /* min */ (uint)rocksdb::StatsLevel::kExceptHistogramOrTimers, + /* max */ (uint)rocksdb::StatsLevel::kAll, 0); + static MYSQL_SYSVAR_ULONG(compaction_readahead_size, rocksdb_db_options->compaction_readahead_size, PLUGIN_VAR_RQCMDARG, @@ -1014,7 +1213,8 @@ static MYSQL_SYSVAR_ULONG( persistent_cache_size_mb, rocksdb_persistent_cache_size_mb, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Size of cache in MB for BlockBasedTableOptions::persistent_cache " - "for RocksDB", nullptr, nullptr, rocksdb_persistent_cache_size_mb, + "for RocksDB", + nullptr, nullptr, rocksdb_persistent_cache_size_mb, /* min */ 0L, /* max */ ULONG_MAX, 0); static MYSQL_SYSVAR_ULONG( @@ -1194,7 +1394,7 @@ static MYSQL_SYSVAR_LONGLONG(block_cache_size, rocksdb_block_cache_size, rocksdb_validate_set_block_cache_size, nullptr, /* default */ RDB_DEFAULT_BLOCK_CACHE_SIZE, /* min */ RDB_MIN_BLOCK_CACHE_SIZE, - /* max */ LONGLONG_MAX, + /* max */ LLONG_MAX, /* Block size */ RDB_MIN_BLOCK_CACHE_SIZE); static MYSQL_SYSVAR_LONGLONG(sim_cache_size, rocksdb_sim_cache_size, @@ -1203,15 +1403,26 @@ static MYSQL_SYSVAR_LONGLONG(sim_cache_size, rocksdb_sim_cache_size, nullptr, /* default */ 0, /* min */ 0, - /* max */ LONGLONG_MAX, + /* max */ LLONG_MAX, /* Block size */ 0); static MYSQL_SYSVAR_BOOL( - use_clock_cache, - rocksdb_use_clock_cache, + use_clock_cache, rocksdb_use_clock_cache, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "Use ClockCache instead of default LRUCache for RocksDB", - nullptr, nullptr, false); + "Use ClockCache instead of default LRUCache for RocksDB", nullptr, nullptr, + false); + +static MYSQL_SYSVAR_BOOL(cache_dump, rocksdb_cache_dump, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Include RocksDB block cache content in core dump.", + nullptr, nullptr, true); + +static MYSQL_SYSVAR_DOUBLE(cache_high_pri_pool_ratio, + rocksdb_cache_high_pri_pool_ratio, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Specify the size of block cache high-pri pool", + nullptr, nullptr, /* default */ 0.0, /* min */ 0.0, + /* max */ 1.0, 0); static MYSQL_SYSVAR_BOOL( cache_index_and_filter_blocks, @@ -1221,6 +1432,14 @@ static MYSQL_SYSVAR_BOOL( "BlockBasedTableOptions::cache_index_and_filter_blocks for RocksDB", nullptr, nullptr, true); +static MYSQL_SYSVAR_BOOL( + cache_index_and_filter_with_high_priority, + *reinterpret_cast( + &rocksdb_tbl_options->cache_index_and_filter_blocks_with_high_priority), + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "cache_index_and_filter_blocks_with_high_priority for RocksDB", nullptr, + nullptr, true); + // When pin_l0_filter_and_index_blocks_in_cache is true, RocksDB will use the // LRU cache, but will always keep the filter & idndex block's handle checked // out (=won't call ShardedLRUCache::Release), plus the parsed out objects @@ -1349,10 +1568,10 @@ static MYSQL_SYSVAR_UINT( nullptr, nullptr, 0, /* min */ 0, /* max */ INT_MAX, 0); static MYSQL_SYSVAR_BOOL(force_compute_memtable_stats, - rocksdb_force_compute_memtable_stats, - PLUGIN_VAR_RQCMDARG, - "Force to always compute memtable stats", - nullptr, nullptr, TRUE); + rocksdb_force_compute_memtable_stats, + PLUGIN_VAR_RQCMDARG, + "Force to always compute memtable stats", nullptr, + nullptr, TRUE); static MYSQL_SYSVAR_UINT(force_compute_memtable_stats_cachetime, rocksdb_force_compute_memtable_stats_cachetime, @@ -1372,6 +1591,10 @@ static MYSQL_SYSVAR_STR(compact_cf, 
rocksdb_compact_cf_name, rocksdb_compact_column_family, rocksdb_compact_column_family_stub, ""); +static MYSQL_SYSVAR_STR(delete_cf, rocksdb_delete_cf_name, PLUGIN_VAR_RQCMDARG, + "Delete column family", rocksdb_delete_column_family, + rocksdb_delete_column_family_stub, ""); + static MYSQL_SYSVAR_STR(create_checkpoint, rocksdb_checkpoint_name, PLUGIN_VAR_RQCMDARG, "Checkpoint directory", rocksdb_create_checkpoint, @@ -1438,6 +1661,12 @@ static MYSQL_SYSVAR_UINT( "Maximum number of pending + ongoing number of manual compactions.", nullptr, nullptr, /* default */ 10, /* min */ 0, /* max */ UINT_MAX, 0); +static MYSQL_SYSVAR_BOOL( + rollback_on_timeout, rocksdb_rollback_on_timeout, PLUGIN_VAR_OPCMDARG, + "Whether to roll back the complete transaction or a single statement on " + "lock wait timeout (a single statement by default)", + NULL, NULL, FALSE); + static MYSQL_SYSVAR_UINT( debug_manual_compaction_delay, rocksdb_debug_manual_compaction_delay, PLUGIN_VAR_RQCMDARG, @@ -1529,7 +1758,7 @@ static MYSQL_SYSVAR_LONGLONG( rocksdb_compaction_sequential_deletes_file_size, PLUGIN_VAR_RQCMDARG, "Minimum file size required for compaction_sequential_deletes", nullptr, rocksdb_set_compaction_options, 0L, - /* min */ -1L, /* max */ LONGLONG_MAX, 0); + /* min */ -1L, /* max */ LLONG_MAX, 0); static MYSQL_SYSVAR_BOOL( compaction_sequential_deletes_count_sd, @@ -1627,6 +1856,13 @@ static MYSQL_SYSVAR_BOOL(error_on_suboptimal_collation, "collation is used", nullptr, nullptr, TRUE); +static MYSQL_SYSVAR_BOOL( + enable_insert_with_update_caching, + rocksdb_enable_insert_with_update_caching, PLUGIN_VAR_OPCMDARG, + "Whether to enable optimization where we cache the read from a failed " + "insertion attempt in INSERT ON DUPLICATE KEY UPDATE", + nullptr, nullptr, TRUE); + static const int ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE = 100; static struct st_mysql_sys_var *rocksdb_system_variables[] = { @@ -1645,6 +1881,7 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(commit_in_the_middle), MYSQL_SYSVAR(blind_delete_primary_key), MYSQL_SYSVAR(read_free_rpl_tables), + MYSQL_SYSVAR(read_free_rpl), MYSQL_SYSVAR(bulk_load_size), MYSQL_SYSVAR(merge_buf_size), MYSQL_SYSVAR(enable_bulk_load_api), @@ -1696,6 +1933,7 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(enable_thread_tracking), MYSQL_SYSVAR(perf_context_level), MYSQL_SYSVAR(wal_recovery_mode), + MYSQL_SYSVAR(stats_level), MYSQL_SYSVAR(access_hint_on_compaction_start), MYSQL_SYSVAR(new_table_reader_for_compaction_inputs), MYSQL_SYSVAR(compaction_readahead_size), @@ -1705,7 +1943,10 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(block_cache_size), MYSQL_SYSVAR(sim_cache_size), MYSQL_SYSVAR(use_clock_cache), + MYSQL_SYSVAR(cache_high_pri_pool_ratio), + MYSQL_SYSVAR(cache_dump), MYSQL_SYSVAR(cache_index_and_filter_blocks), + MYSQL_SYSVAR(cache_index_and_filter_with_high_priority), MYSQL_SYSVAR(pin_l0_filter_and_index_blocks_in_cache), MYSQL_SYSVAR(index_type), MYSQL_SYSVAR(hash_index_allow_collision), @@ -1734,6 +1975,7 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(debug_optimizer_no_zero_cardinality), MYSQL_SYSVAR(compact_cf), + MYSQL_SYSVAR(delete_cf), MYSQL_SYSVAR(signal_drop_index_thread), MYSQL_SYSVAR(pause_background_work), MYSQL_SYSVAR(enable_2pc), @@ -1777,10 +2019,13 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(debug_manual_compaction_delay), MYSQL_SYSVAR(max_manual_compactions), 
MYSQL_SYSVAR(manual_compaction_threads), + MYSQL_SYSVAR(rollback_on_timeout), + + MYSQL_SYSVAR(enable_insert_with_update_caching), nullptr}; -static rocksdb::WriteOptions -rdb_get_rocksdb_write_options(my_core::THD *const thd) { +static rocksdb::WriteOptions rdb_get_rocksdb_write_options( + my_core::THD *const thd) { rocksdb::WriteOptions opt; opt.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC); @@ -1837,19 +2082,6 @@ static int rocksdb_compact_column_family(THD *const thd, /////////////////////////////////////////////////////////////////////////////////////////// -/** - @brief - Function we use in the creation of our hash to get key. -*/ - -uchar * -Rdb_open_tables_map::get_hash_key(Rdb_table_handler *const table_handler, - size_t *const length, - my_bool not_used MY_ATTRIBUTE((__unused__))) { - *length = table_handler->m_table_name_length; - return reinterpret_cast(table_handler->m_table_name); -} - /* Drop index thread's control */ @@ -1906,7 +2138,7 @@ class Rdb_snapshot_notifier : public rocksdb::TransactionNotifier { void SnapshotCreated(const rocksdb::Snapshot *snapshot) override; -public: + public: Rdb_snapshot_notifier(const Rdb_snapshot_notifier &) = delete; Rdb_snapshot_notifier &operator=(const Rdb_snapshot_notifier &) = delete; @@ -1919,9 +2151,9 @@ public: }; /* This is the base class for transactions when interacting with rocksdb. -*/ + */ class Rdb_transaction { -protected: + protected: ulonglong m_write_count = 0; ulonglong m_insert_count = 0; ulonglong m_update_count = 0; @@ -1932,7 +2164,7 @@ protected: bool m_is_delayed_snapshot = false; bool m_is_two_phase = false; -private: + private: /* Number of write operations this transaction had when we took the last savepoint (the idea is not to take another savepoint if we haven't made @@ -1940,7 +2172,7 @@ private: */ ulonglong m_writes_at_last_savepoint; -protected: + protected: THD *m_thd = nullptr; static std::multiset s_tx_list; @@ -1963,9 +2195,9 @@ protected: // This should be used only when updating binlog information. virtual rocksdb::WriteBatchBase *get_write_batch() = 0; virtual bool commit_no_binlog() = 0; - virtual rocksdb::Iterator * - get_iterator(const rocksdb::ReadOptions &options, - rocksdb::ColumnFamilyHandle *column_family) = 0; + virtual rocksdb::Iterator *get_iterator( + const rocksdb::ReadOptions &options, + rocksdb::ColumnFamilyHandle *column_family) = 0; /* @detail @@ -1990,7 +2222,7 @@ protected: return s; } -protected: + protected: /* The following two are helper functions to be overloaded by child classes. They should provide RocksDB's savepoint semantics. 
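Background sketch (not part of the diff): the two hooks named in the comment above, do_set_savepoint() and do_rollback_to_savepoint(), are meant to map one-to-one onto RocksDB's savepoint API. A minimal illustration of that contract follows, assuming a hypothetical subclass with a rocksdb::Transaction member m_tx; the patch's concrete classes use m_rocksdb_tx and m_batch for the same purpose.

    #include <rocksdb/utilities/transaction.h>

    // Sketch only: what an Rdb_transaction child class is expected to do.
    class Tx_sketch {
      rocksdb::Transaction *m_tx;  // hypothetical member

     protected:
      // Record the current extent of the transaction's write set.
      void do_set_savepoint() { m_tx->SetSavePoint(); }

      // Discard everything written since the matching SetSavePoint().
      void do_rollback_to_savepoint() { m_tx->RollbackToSavePoint(); }
    };

Keeping both hooks virtual lets the transaction-based and write-batch-based implementations below reuse the shared savepoint bookkeeping in the base class.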
@@ -2035,8 +2267,9 @@ protected: RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex); - for (auto it : s_tx_list) + for (auto it : s_tx_list) { walker->process_tran(it); + } RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex); } @@ -2056,7 +2289,8 @@ protected: convert_error_code_to_mysql() does: force a statement rollback before returning HA_ERR_LOCK_WAIT_TIMEOUT: */ - my_core::thd_mark_transaction_to_rollback(thd, false /*just statement*/); + my_core::thd_mark_transaction_to_rollback( + thd, static_cast(rocksdb_rollback_on_timeout)); m_detailed_error.copy(timeout_message( "index", tbl_def->full_tablename().c_str(), kd.get_name().c_str())); table_handler->m_lock_wait_timeout_counter.inc(); @@ -2078,9 +2312,10 @@ protected: char user_host_buff[MAX_USER_HOST_SIZE + 1]; make_user_name(thd, user_host_buff); // NO_LINT_DEBUG - sql_print_warning("Got snapshot conflict errors: User: %s " - "Query: %s", - user_host_buff, thd->query()); + sql_print_warning( + "Got snapshot conflict errors: User: %s " + "Query: %s", + user_host_buff, thd->query()); } m_detailed_error = String(" (snapshot conflict)", system_charset_info); table_handler->m_deadlock_counter.inc(); @@ -2177,8 +2412,9 @@ protected: if (m_is_tx_failed) { rollback(); res = false; - } else + } else { res = commit(); + } return res; } @@ -2224,7 +2460,7 @@ protected: bool has_snapshot() const { return m_read_opts.snapshot != nullptr; } -private: + private: // The Rdb_sst_info structures we are currently loading. In a partitioned // table this can have more than one entry std::vector> m_curr_bulk_load; @@ -2233,7 +2469,7 @@ private: /* External merge sorts for bulk load: key ID -> merge sort instance */ std::unordered_map m_key_merge; -public: + public: int get_key_merge(GL_INDEX_ID kd_gl_id, rocksdb::ColumnFamilyHandle *cf, Rdb_index_merge **key_merge) { int res; @@ -2254,22 +2490,62 @@ public: return HA_EXIT_SUCCESS; } - int finish_bulk_load(int print_client_error = true) { - int rc = 0, rc2; + /* Finish bulk loading for all table handlers belongs to one connection */ + int finish_bulk_load(bool *is_critical_error = nullptr, + int print_client_error = true) { + Ensure_cleanup cleanup([&]() { + // Always clear everything regardless of success/failure + m_curr_bulk_load.clear(); + m_curr_bulk_load_tablename.clear(); + m_key_merge.clear(); + }); + + int rc = 0; + if (is_critical_error) { + *is_critical_error = true; + } + + // PREPARE phase: finish all on-going bulk loading Rdb_sst_info and + // collect all Rdb_sst_commit_info containing (SST files, cf) + int rc2 = 0; + std::vector sst_commit_list; + sst_commit_list.reserve(m_curr_bulk_load.size()); - std::vector>::iterator it; - for (it = m_curr_bulk_load.begin(); it != m_curr_bulk_load.end(); it++) { - rc2 = (*it)->commit(print_client_error); - if (rc2 != 0 && rc == 0) { + for (auto &sst_info : m_curr_bulk_load) { + Rdb_sst_info::Rdb_sst_commit_info commit_info; + + // Commit the list of SST files and move it to the end of + // sst_commit_list, effectively transfer the ownership over + rc2 = sst_info->finish(&commit_info, print_client_error); + if (rc2 && rc == 0) { + // Don't return yet - make sure we finish all the SST infos rc = rc2; } + + // Make sure we have work to do - we might be losing the race + if (rc2 == 0 && commit_info.has_work()) { + sst_commit_list.emplace_back(std::move(commit_info)); + DBUG_ASSERT(!commit_info.has_work()); + } + } + + if (rc) { + return rc; } - m_curr_bulk_load.clear(); - m_curr_bulk_load_tablename.clear(); - DBUG_ASSERT(m_curr_bulk_load.size() == 0); - // Flush the index_merge 
sort buffers + // MERGING Phase: Flush the index_merge sort buffers into SST files in + // Rdb_sst_info and collect all Rdb_sst_commit_info containing + // (SST files, cf) if (!m_key_merge.empty()) { + Ensure_cleanup malloc_cleanup([]() { + /* + Explicitly tell jemalloc to clean up any unused dirty pages at this + point. + See https://reviews.facebook.net/D63723 for more details. + */ + purge_all_jemalloc_arenas(); + }); + rocksdb::Slice merge_key; rocksdb::Slice merge_val; for (auto it = m_key_merge.begin(); it != m_key_merge.end(); it++) { @@ -2286,9 +2562,20 @@ public: // be missed by the compaction filter and not be marked for // removal. It is unclear how to lock the sql table from the storage // engine to prevent modifications to it while bulk load is occurring. - if (keydef == nullptr || table_name.empty()) { - rc2 = HA_ERR_ROCKSDB_BULK_LOAD; - break; + if (keydef == nullptr) { + if (is_critical_error) { + // We used to set the error but simply ignores it. This follows + // current behavior and we should revisit this later + *is_critical_error = false; + } + return HA_ERR_KEY_NOT_FOUND; + } else if (table_name.empty()) { + if (is_critical_error) { + // We used to set the error but simply ignores it. This follows + // current behavior and we should revisit this later + *is_critical_error = false; + } + return HA_ERR_NO_SUCH_TABLE; } const std::string &index_name = keydef->get_name(); Rdb_index_merge &rdb_merge = it->second; @@ -2297,38 +2584,112 @@ public: // "./database/table" std::replace(table_name.begin(), table_name.end(), '.', '/'); table_name = "./" + table_name; - Rdb_sst_info sst_info(rdb, table_name, index_name, rdb_merge.get_cf(), - *rocksdb_db_options, - THDVAR(get_thd(), trace_sst_api)); + auto sst_info = std::make_shared( + rdb, table_name, index_name, rdb_merge.get_cf(), + *rocksdb_db_options, THDVAR(get_thd(), trace_sst_api)); while ((rc2 = rdb_merge.next(&merge_key, &merge_val)) == 0) { - if ((rc2 = sst_info.put(merge_key, merge_val)) != 0) { + if ((rc2 = sst_info->put(merge_key, merge_val)) != 0) { + rc = rc2; + + // Don't return yet - make sure we finish the sst_info break; } } - // rc2 == -1 => finished ok; rc2 > 0 => error - if (rc2 > 0 || (rc2 = sst_info.commit(print_client_error)) != 0) { - if (rc == 0) { - rc = rc2; - } - break; + // -1 => no more items + if (rc2 != -1 && rc != 0) { + rc = rc2; + } + + Rdb_sst_info::Rdb_sst_commit_info commit_info; + rc2 = sst_info->finish(&commit_info, print_client_error); + if (rc2 != 0 && rc == 0) { + // Only set the error from sst_info->finish if finish failed and we + // didn't fail before. In other words, we don't have finish's + // success mask earlier failures + rc = rc2; + } + + if (rc) { + return rc; + } + + if (commit_info.has_work()) { + sst_commit_list.emplace_back(std::move(commit_info)); + DBUG_ASSERT(!commit_info.has_work()); } } - m_key_merge.clear(); + } - /* - Explicitly tell jemalloc to clean up any unused dirty pages at this - point. - See https://reviews.facebook.net/D63723 for more details. 
- */ - purge_all_jemalloc_arenas(); + // Early return in case we lost the race completely and end up with no + // work at all + if (sst_commit_list.size() == 0) { + return rc; + } + + // INGEST phase: Group all Rdb_sst_commit_info by cf (as they might + // have the same cf across different indexes) and call out to RocksDB + // to ingest all SST files in one atomic operation + rocksdb::IngestExternalFileOptions options; + options.move_files = true; + options.snapshot_consistency = false; + options.allow_global_seqno = false; + options.allow_blocking_flush = false; + + std::map + arg_map; + + // Group by column_family + for (auto &commit_info : sst_commit_list) { + if (arg_map.find(commit_info.get_cf()) == arg_map.end()) { + rocksdb::IngestExternalFileArg arg; + arg.column_family = commit_info.get_cf(), + arg.external_files = commit_info.get_committed_files(), + arg.options = options; + + arg_map.emplace(commit_info.get_cf(), arg); + } else { + auto &files = arg_map[commit_info.get_cf()].external_files; + files.insert(files.end(), commit_info.get_committed_files().begin(), + commit_info.get_committed_files().end()); + } + } + + std::vector args; + size_t file_count = 0; + for (auto &cf_files_pair : arg_map) { + args.push_back(cf_files_pair.second); + file_count += cf_files_pair.second.external_files.size(); } + + const rocksdb::Status s = rdb->IngestExternalFiles(args); + if (THDVAR(m_thd, trace_sst_api)) { + // NO_LINT_DEBUG + sql_print_information( + "SST Tracing: IngestExternalFile '%zu' files returned %s", file_count, + s.ok() ? "ok" : "not ok"); + } + + if (!s.ok()) { + if (print_client_error) { + Rdb_sst_info::report_error_msg(s, nullptr); + } + return HA_ERR_ROCKSDB_BULK_LOAD; + } + + // COMMIT phase: mark everything as completed. This avoids SST file + // deletion kicking in. Otherwise SST files would get deleted if this + // entire operation is aborted + for (auto &commit_info : sst_commit_list) { + commit_info.commit(); + } + return rc; } int start_bulk_load(ha_rocksdb *const bulk_load, - std::shared_ptr sst_info) { + std::shared_ptr sst_info) { /* If we already have an open bulk load of a table and the name doesn't match the current one, close out the currently running one. This allows @@ -2341,8 +2702,6 @@ public: bulk_load->get_table_basename() != m_curr_bulk_load_tablename) { const auto res = finish_bulk_load(); if (res != HA_EXIT_SUCCESS) { - m_curr_bulk_load.clear(); - m_curr_bulk_load_tablename.clear(); return res; } } @@ -2392,12 +2751,10 @@ public: inserts while inside a multi-statement transaction. 
*/ bool flush_batch() { - if (get_write_count() == 0) - return false; + if (get_write_count() == 0) return false; /* Commit the current transaction */ - if (commit_no_binlog()) - return true; + if (commit_no_binlog()) return true; /* Start another one */ start_tx(); @@ -2409,7 +2766,7 @@ public: std::max(m_auto_incr_map[gl_index_id], curr_id); } -#ifndef NDEBUG +#ifndef DBUG_OFF ulonglong get_auto_incr(const GL_INDEX_ID &gl_index_id) { if (m_auto_incr_map.count(gl_index_id) > 0) { return m_auto_incr_map[gl_index_id]; @@ -2420,13 +2777,14 @@ public: virtual rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family, const rocksdb::Slice &key, - const rocksdb::Slice &value) = 0; - virtual rocksdb::Status - delete_key(rocksdb::ColumnFamilyHandle *const column_family, - const rocksdb::Slice &key) = 0; - virtual rocksdb::Status - single_delete(rocksdb::ColumnFamilyHandle *const column_family, - const rocksdb::Slice &key) = 0; + const rocksdb::Slice &value, + const bool assume_tracked) = 0; + virtual rocksdb::Status delete_key( + rocksdb::ColumnFamilyHandle *const column_family, + const rocksdb::Slice &key, const bool assume_tracked) = 0; + virtual rocksdb::Status single_delete( + rocksdb::ColumnFamilyHandle *const column_family, + const rocksdb::Slice &key, const bool assume_tracked) = 0; virtual bool has_modifications() const = 0; @@ -2442,25 +2800,23 @@ public: virtual rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family, const rocksdb::Slice &key, rocksdb::PinnableSlice *const value) const = 0; - virtual rocksdb::Status - get_for_update(rocksdb::ColumnFamilyHandle *const column_family, - const rocksdb::Slice &key, rocksdb::PinnableSlice *const value, - bool exclusive) = 0; - - rocksdb::Iterator * - get_iterator(rocksdb::ColumnFamilyHandle *const column_family, - bool skip_bloom_filter, bool fill_cache, - const rocksdb::Slice &eq_cond_lower_bound, - const rocksdb::Slice &eq_cond_upper_bound, - bool read_current = false, bool create_snapshot = true) { + virtual rocksdb::Status get_for_update( + rocksdb::ColumnFamilyHandle *const column_family, + const rocksdb::Slice &key, rocksdb::PinnableSlice *const value, + bool exclusive, const bool do_validate) = 0; + + rocksdb::Iterator *get_iterator( + rocksdb::ColumnFamilyHandle *const column_family, bool skip_bloom_filter, + bool fill_cache, const rocksdb::Slice &eq_cond_lower_bound, + const rocksdb::Slice &eq_cond_upper_bound, bool read_current = false, + bool create_snapshot = true) { // Make sure we are not doing both read_current (which implies we don't // want a snapshot) and create_snapshot which makes sure we create // a snapshot DBUG_ASSERT(column_family != nullptr); DBUG_ASSERT(!read_current || !create_snapshot); - if (create_snapshot) - acquire_snapshot(true); + if (create_snapshot) acquire_snapshot(true); rocksdb::ReadOptions options = m_read_opts; @@ -2492,25 +2848,33 @@ public: entire transaction. */ do_set_savepoint(); - m_writes_at_last_savepoint= m_write_count; + m_writes_at_last_savepoint = m_write_count; } /* Called when a "top-level" statement inside a transaction completes successfully and its changes become part of the transaction's changes. */ - void make_stmt_savepoint_permanent() { - + int make_stmt_savepoint_permanent() { // Take another RocksDB savepoint only if we had changes since the last // one. This is very important for long transactions doing lots of // SELECTs. 
- if (m_writes_at_last_savepoint != m_write_count) - { + if (m_writes_at_last_savepoint != m_write_count) { + rocksdb::WriteBatchBase *batch = get_write_batch(); + rocksdb::Status status = rocksdb::Status::NotFound(); + while ((status = batch->PopSavePoint()) == rocksdb::Status::OK()) { + } + + if (status != rocksdb::Status::NotFound()) { + return HA_EXIT_FAILURE; + } + do_set_savepoint(); - m_writes_at_last_savepoint= m_write_count; + m_writes_at_last_savepoint = m_write_count; } - } + return HA_EXIT_SUCCESS; + } /* Rollback to the savepoint we've set before the last statement @@ -2526,7 +2890,7 @@ public: statement start) because setting a savepoint is cheap. */ do_set_savepoint(); - m_writes_at_last_savepoint= m_write_count; + m_writes_at_last_savepoint = m_write_count; } } @@ -2590,10 +2954,11 @@ class Rdb_transaction_impl : public Rdb_transaction { rocksdb::Transaction *m_rocksdb_tx = nullptr; rocksdb::Transaction *m_rocksdb_reuse_tx = nullptr; -public: + public: void set_lock_timeout(int timeout_sec_arg) override { - if (m_rocksdb_tx) + if (m_rocksdb_tx) { m_rocksdb_tx->SetLockTimeout(rdb_convert_sec_to_ms(m_timeout_sec)); + } } void set_sync(bool sync) override { @@ -2609,7 +2974,7 @@ public: virtual bool is_writebatch_trx() const override { return false; } -private: + private: void release_tx(void) { // We are done with the current active transaction object. Preserve it // for later reuse. @@ -2659,7 +3024,7 @@ private: goto error; } -error: + error: /* Save the transaction object to be reused */ release_tx(); @@ -2673,7 +3038,7 @@ error: return res; } -public: + public: void rollback() override { m_write_count = 0; m_insert_count = 0; @@ -2734,39 +3099,42 @@ public: m_read_opts.snapshot = nullptr; } - if (need_clear && m_rocksdb_tx != nullptr) - m_rocksdb_tx->ClearSnapshot(); + if (need_clear && m_rocksdb_tx != nullptr) m_rocksdb_tx->ClearSnapshot(); } bool has_snapshot() { return m_read_opts.snapshot != nullptr; } rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family, - const rocksdb::Slice &key, - const rocksdb::Slice &value) override { + const rocksdb::Slice &key, const rocksdb::Slice &value, + const bool assume_tracked) override { ++m_write_count; ++m_lock_count; - if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks) + if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks) { return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit); - return m_rocksdb_tx->Put(column_family, key, value); + } + return m_rocksdb_tx->Put(column_family, key, value, assume_tracked); } rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family, - const rocksdb::Slice &key) override { + const rocksdb::Slice &key, + const bool assume_tracked) override { ++m_write_count; ++m_lock_count; - if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks) + if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks) { return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit); - return m_rocksdb_tx->Delete(column_family, key); + } + return m_rocksdb_tx->Delete(column_family, key, assume_tracked); } - rocksdb::Status - single_delete(rocksdb::ColumnFamilyHandle *const column_family, - const rocksdb::Slice &key) override { + rocksdb::Status single_delete( + rocksdb::ColumnFamilyHandle *const column_family, + const rocksdb::Slice &key, const bool assume_tracked) override { ++m_write_count; ++m_lock_count; - if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks) + if (m_write_count > m_max_row_locks || 
m_lock_count > m_max_row_locks) { return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit); - return m_rocksdb_tx->SingleDelete(column_family, key); + } + return m_rocksdb_tx->SingleDelete(column_family, key, assume_tracked); } bool has_modifications() const override { @@ -2802,23 +3170,39 @@ public: return m_rocksdb_tx->Get(m_read_opts, column_family, key, value); } - rocksdb::Status - get_for_update(rocksdb::ColumnFamilyHandle *const column_family, - const rocksdb::Slice &key, rocksdb::PinnableSlice *const value, - bool exclusive) override { - if (++m_lock_count > m_max_row_locks) + rocksdb::Status get_for_update( + rocksdb::ColumnFamilyHandle *const column_family, + const rocksdb::Slice &key, rocksdb::PinnableSlice *const value, + bool exclusive, const bool do_validate) override { + if (++m_lock_count > m_max_row_locks) { return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit); + } if (value != nullptr) { value->Reset(); } - return m_rocksdb_tx->GetForUpdate(m_read_opts, column_family, key, value, - exclusive); + rocksdb::Status s; + // If snapshot is null, pass it to GetForUpdate and snapshot is + // initialized there. Snapshot validation is skipped in that case. + if (m_read_opts.snapshot == nullptr || do_validate) { + s = m_rocksdb_tx->GetForUpdate( + m_read_opts, column_family, key, value, exclusive, + m_read_opts.snapshot ? do_validate : false); + } else { + // If snapshot is set, and if skipping validation, + // call GetForUpdate without validation and set back old snapshot + auto saved_snapshot = m_read_opts.snapshot; + m_read_opts.snapshot = nullptr; + s = m_rocksdb_tx->GetForUpdate(m_read_opts, column_family, key, value, + exclusive, false); + m_read_opts.snapshot = saved_snapshot; + } + return s; } - rocksdb::Iterator * - get_iterator(const rocksdb::ReadOptions &options, - rocksdb::ColumnFamilyHandle *const column_family) override { + rocksdb::Iterator *get_iterator( + const rocksdb::ReadOptions &options, + rocksdb::ColumnFamilyHandle *const column_family) override { global_stats.queries[QUERIES_RANGE].inc(); return m_rocksdb_tx->GetIterator(options, column_family); } @@ -2861,10 +3245,9 @@ public: m_ddl_transaction = false; } - /* Implementations of do_*savepoint based on rocksdB::Transaction savepoints */ - void do_set_savepoint() override { - m_rocksdb_tx->SetSavePoint(); - } + /* Implementations of do_*savepoint based on rocksdB::Transaction savepoints + */ + void do_set_savepoint() override { m_rocksdb_tx->SetSavePoint(); } void do_rollback_to_savepoint() override { m_rocksdb_tx->RollbackToSavePoint(); @@ -2896,14 +3279,14 @@ public: const rocksdb::Snapshot *const cur_snapshot = m_rocksdb_tx->GetSnapshot(); if (org_snapshot != cur_snapshot) { - if (org_snapshot != nullptr) - m_snapshot_timestamp = 0; + if (org_snapshot != nullptr) m_snapshot_timestamp = 0; m_read_opts.snapshot = cur_snapshot; - if (cur_snapshot != nullptr) + if (cur_snapshot != nullptr) { rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp); - else + } else { m_is_delayed_snapshot = true; + } } } } @@ -2914,7 +3297,7 @@ public: m_notifier = std::make_shared(this); } - virtual ~Rdb_transaction_impl() { + virtual ~Rdb_transaction_impl() override { rollback(); // Theoretically the notifier could outlive the Rdb_transaction_impl @@ -2946,7 +3329,7 @@ class Rdb_writebatch_impl : public Rdb_transaction { m_ddl_transaction = false; } -private: + private: bool prepare(const rocksdb::TransactionName &name) override { return true; } bool commit_no_binlog() override { @@ -2970,7 +3353,7 @@ private: res = 
true; goto error; } -error: + error: reset(); m_write_count = 0; @@ -2983,15 +3366,11 @@ error: } /* Implementations of do_*savepoint based on rocksdB::WriteBatch savepoints */ - void do_set_savepoint() override { - m_batch->SetSavePoint(); - } + void do_set_savepoint() override { m_batch->SetSavePoint(); } - void do_rollback_to_savepoint() override { - m_batch->RollbackToSavePoint(); - } + void do_rollback_to_savepoint() override { m_batch->RollbackToSavePoint(); } -public: + public: bool is_writebatch_trx() const override { return true; } void set_lock_timeout(int timeout_sec_arg) override { @@ -3019,8 +3398,7 @@ public: } void acquire_snapshot(bool acquire_now) override { - if (m_read_opts.snapshot == nullptr) - snapshot_created(rdb->GetSnapshot()); + if (m_read_opts.snapshot == nullptr) snapshot_created(rdb->GetSnapshot()); } void release_snapshot() override { @@ -3031,8 +3409,8 @@ public: } rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family, - const rocksdb::Slice &key, - const rocksdb::Slice &value) override { + const rocksdb::Slice &key, const rocksdb::Slice &value, + const bool assume_tracked) override { ++m_write_count; m_batch->Put(column_family, key, value); // Note Put/Delete in write batch doesn't return any error code. We simply @@ -3041,15 +3419,16 @@ public: } rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family, - const rocksdb::Slice &key) override { + const rocksdb::Slice &key, + const bool assume_tracked) override { ++m_write_count; m_batch->Delete(column_family, key); return rocksdb::Status::OK(); } - rocksdb::Status - single_delete(rocksdb::ColumnFamilyHandle *const column_family, - const rocksdb::Slice &key) override { + rocksdb::Status single_delete( + rocksdb::ColumnFamilyHandle *const column_family, + const rocksdb::Slice &key, const bool /* assume_tracked */) override { ++m_write_count; m_batch->SingleDelete(column_family, key); return rocksdb::Status::OK(); @@ -3074,10 +3453,10 @@ public: value); } - rocksdb::Status - get_for_update(rocksdb::ColumnFamilyHandle *const column_family, - const rocksdb::Slice &key, rocksdb::PinnableSlice *const value, - bool exclusive) override { + rocksdb::Status get_for_update( + rocksdb::ColumnFamilyHandle *const column_family, + const rocksdb::Slice &key, rocksdb::PinnableSlice *const value, + bool /* exclusive */, const bool /* do_validate */) override { if (value == nullptr) { rocksdb::PinnableSlice pin_val; rocksdb::Status s = get(column_family, key, &pin_val); @@ -3088,9 +3467,9 @@ public: return get(column_family, key, value); } - rocksdb::Iterator * - get_iterator(const rocksdb::ReadOptions &options, - rocksdb::ColumnFamilyHandle *const column_family) override { + rocksdb::Iterator *get_iterator( + const rocksdb::ReadOptions &options, + rocksdb::ColumnFamilyHandle *const /* column_family */) override { const auto it = rdb->NewIterator(options); return m_batch->NewIteratorWithBase(it); } @@ -3110,8 +3489,7 @@ public: void start_stmt() override {} void rollback_stmt() override { - if (m_batch) - rollback_to_stmt_savepoint(); + if (m_batch) rollback_to_stmt_savepoint(); } explicit Rdb_writebatch_impl(THD *const thd) @@ -3120,7 +3498,7 @@ public: true); } - virtual ~Rdb_writebatch_impl() { + virtual ~Rdb_writebatch_impl() override { rollback(); delete m_batch; } @@ -3178,7 +3556,7 @@ class Rdb_perf_context_guard { } }; -} // anonymous namespace +} // anonymous namespace /* TODO: maybe, call this in external_lock() and store in ha_rocksdb.. 
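Background sketch (not part of the diff): Rdb_writebatch_impl above answers reads through a rocksdb::WriteBatchWithIndex, so writes still sitting in the uncommitted batch are overlaid on top of the committed database state. Its get_iterator() override boils down to the following pattern; overlay_iterator is an illustrative name, not a function in the patch.

    #include <rocksdb/db.h>
    #include <rocksdb/utilities/write_batch_with_index.h>

    // Sketch only: merge a base iterator over committed data with the
    // keys that exist only in the not-yet-written batch.
    rocksdb::Iterator *overlay_iterator(rocksdb::DB *db,
                                        rocksdb::WriteBatchWithIndex *batch,
                                        const rocksdb::ReadOptions &opts) {
      rocksdb::Iterator *base = db->NewIterator(opts);  // committed data
      return batch->NewIteratorWithBase(base);          // batch + base
    }

This is why the write-batch path can skip the transaction API entirely (as the rpl_skip_tx_api code below does) while still giving statements a consistent view of their own pending writes.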
@@ -3189,12 +3567,9 @@ static Rdb_transaction *get_or_create_tx(THD *const thd) { // TODO: this is called too many times.. O(#rows) if (tx == nullptr) { if ((rpl_skip_tx_api && thd->rli_slave) || - (THDVAR(thd, master_skip_tx_api) && !thd->rli_slave)) - { + (THDVAR(thd, master_skip_tx_api) && !thd->rli_slave)) { tx = new Rdb_writebatch_impl(thd); - } - else - { + } else { tx = new Rdb_transaction_impl(thd); } tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks)); @@ -3212,12 +3587,14 @@ static Rdb_transaction *get_or_create_tx(THD *const thd) { static int rocksdb_close_connection(handlerton *const hton, THD *const thd) { Rdb_transaction *&tx = get_tx_from_thd(thd); if (tx != nullptr) { - int rc = tx->finish_bulk_load(false); - if (rc != 0) { + bool is_critical_error; + int rc = tx->finish_bulk_load(&is_critical_error, false); + if (rc != 0 && is_critical_error) { // NO_LINT_DEBUG - sql_print_error("RocksDB: Error %d finalizing last SST file while " - "disconnecting", - rc); + sql_print_error( + "RocksDB: Error %d finalizing last SST file while " + "disconnecting", + rc); } delete tx; @@ -3319,9 +3696,9 @@ static int rocksdb_prepare(handlerton *const hton, THD *const thd, } DEBUG_SYNC(thd, "rocksdb.prepared"); - } - else + } else { tx->make_stmt_savepoint_permanent(); + } return HA_EXIT_SUCCESS; } @@ -3363,9 +3740,8 @@ static int rocksdb_commit_by_xid(handlerton *const hton, XID *const xid) { DBUG_RETURN(HA_EXIT_SUCCESS); } -static int -rocksdb_rollback_by_xid(handlerton *const hton MY_ATTRIBUTE((__unused__)), - XID *const xid) { +static int rocksdb_rollback_by_xid( + handlerton *const hton MY_ATTRIBUTE((__unused__)), XID *const xid) { DBUG_ENTER_FUNC(); DBUG_ASSERT(hton != nullptr); @@ -3411,6 +3787,7 @@ static void rdb_xid_from_string(const std::string &src, XID *const dst) { DBUG_ASSERT(dst->gtrid_length >= 0 && dst->gtrid_length <= MAXGTRIDSIZE); DBUG_ASSERT(dst->bqual_length >= 0 && dst->bqual_length <= MAXBQUALSIZE); + memset(dst->data, 0, XIDDATASIZE); src.copy(dst->data, (dst->gtrid_length) + (dst->bqual_length), RDB_XIDHDR_LEN); } @@ -3431,13 +3808,16 @@ static int rocksdb_recover(handlerton *const hton, XID *const xid_list, if (is_binlog_advanced(binlog_file, *binlog_pos, file_buf, pos)) { memcpy(binlog_file, file_buf, FN_REFLEN + 1); *binlog_pos = pos; - fprintf(stderr, "RocksDB: Last binlog file position %llu," - " file name %s\n", + // NO_LINT_DEBUG + fprintf(stderr, + "RocksDB: Last binlog file position %llu," + " file name %s\n", pos, file_buf); if (*gtid_buf) { global_sid_lock->rdlock(); binlog_max_gtid->parse(global_sid_map, gtid_buf); global_sid_lock->unlock(); + // NO_LINT_DEBUG fprintf(stderr, "RocksDB: Last MySQL Gtid %s\n", gtid_buf); } } @@ -3480,8 +3860,8 @@ static int rocksdb_commit(handlerton *const hton, THD *const thd, Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd)); if (tx != nullptr) { - if (commit_tx || (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | - OPTION_BEGIN))) { + if (commit_tx || (!my_core::thd_test_options( + thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { /* We get here - For a COMMIT statement that finishes a multi-statement transaction @@ -3578,7 +3958,7 @@ static std::string format_string(const char *const format, ...) 
{ char *buff = static_buff; std::unique_ptr dynamic_buff = nullptr; - len++; // Add one for null terminator + len++; // Add one for null terminator // for longer output use an allocated buffer if (static_cast(len) > sizeof(static_buff)) { @@ -3603,7 +3983,7 @@ static std::string format_string(const char *const format, ...) { } class Rdb_snapshot_status : public Rdb_tx_list_walker { -private: + private: std::string m_data; static std::string current_timestamp(void) { @@ -3637,9 +4017,8 @@ private: "=========================================\n"; } - static Rdb_deadlock_info::Rdb_dl_trx_info - get_dl_txn_info(const rocksdb::DeadlockInfo &txn, - const GL_INDEX_ID &gl_index_id) { + static Rdb_deadlock_info::Rdb_dl_trx_info get_dl_txn_info( + const rocksdb::DeadlockInfo &txn, const GL_INDEX_ID &gl_index_id) { Rdb_deadlock_info::Rdb_dl_trx_info txn_data; txn_data.trx_id = txn.m_txn_id; @@ -3666,13 +4045,12 @@ private: return txn_data; } - static Rdb_deadlock_info - get_dl_path_trx_info(const rocksdb::DeadlockPath &path_entry) { + static Rdb_deadlock_info get_dl_path_trx_info( + const rocksdb::DeadlockPath &path_entry) { Rdb_deadlock_info deadlock_info; - for (auto it = path_entry.path.begin(); it != path_entry.path.end(); - it++) { - auto txn = *it; + for (auto it = path_entry.path.begin(); it != path_entry.path.end(); it++) { + const auto &txn = *it; const GL_INDEX_ID gl_index_id = { txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast( txn.m_waiting_key.c_str()))}; @@ -3681,7 +4059,7 @@ private: DBUG_ASSERT_IFF(path_entry.limit_exceeded, path_entry.path.empty()); /* print the first txn in the path to display the full deadlock cycle */ if (!path_entry.path.empty() && !path_entry.limit_exceeded) { - auto deadlocking_txn = *(path_entry.path.end() - 1); + const auto &deadlocking_txn = *(path_entry.path.end() - 1); deadlock_info.victim_trx_id = deadlocking_txn.m_txn_id; deadlock_info.deadlock_time = path_entry.deadlock_time; } @@ -3709,7 +4087,7 @@ private: thd_security_context(thd, buffer, sizeof buffer, 0); m_data += format_string( "---SNAPSHOT, ACTIVE %lld sec\n" - "%s\n" + "%s\n" "lock count %llu, write count %llu\n" "insert count %llu, update count %llu, delete count %llu\n", curr_time - snapshot_timestamp, buffer, tx->get_lock_count(), @@ -3722,19 +4100,21 @@ private: auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); m_data += "----------LATEST DETECTED DEADLOCKS----------\n"; - for (auto path_entry : dlock_buffer) { + for (const auto &path_entry : dlock_buffer) { std::string path_data; if (path_entry.limit_exceeded) { path_data += "\n-------DEADLOCK EXCEEDED MAX DEPTH-------\n"; } else { - path_data += "\n*** DEADLOCK PATH\n" - "=========================================\n"; + path_data += + "\n*** DEADLOCK PATH\n" + "=========================================\n"; const auto dl_info = get_dl_path_trx_info(path_entry); const auto deadlock_time = dl_info.deadlock_time; for (auto it = dl_info.path.begin(); it != dl_info.path.end(); it++) { - const auto trx_info = *it; + const auto &trx_info = *it; path_data += format_string( - "TIMESTAMP: %" PRId64 "\n" + "TIMESTAMP: %" PRId64 + "\n" "TRANSACTION ID: %u\n" "COLUMN FAMILY NAME: %s\n" "WAITING KEY: %s\n" @@ -3749,9 +4129,9 @@ private: path_data += "---------------WAITING FOR---------------\n"; } } - path_data += - format_string("\n--------TRANSACTION ID: %u GOT DEADLOCK---------\n", - dl_info.victim_trx_id); + path_data += format_string( + "\n--------TRANSACTION ID: %u GOT DEADLOCK---------\n", + dl_info.victim_trx_id); } m_data += path_data; } @@ 
-3760,7 +4140,7 @@ private: std::vector get_deadlock_info() { std::vector deadlock_info; auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); - for (auto path_entry : dlock_buffer) { + for (const auto &path_entry : dlock_buffer) { if (!path_entry.limit_exceeded) { deadlock_info.push_back(get_dl_path_trx_info(path_entry)); } @@ -3775,10 +4155,10 @@ private: * out relevant information for information_schema.rocksdb_trx */ class Rdb_trx_info_aggregator : public Rdb_tx_list_walker { -private: + private: std::vector *m_trx_info; -public: + public: explicit Rdb_trx_info_aggregator(std::vector *const trx_info) : m_trx_info(trx_info) {} @@ -3908,9 +4288,10 @@ static bool rocksdb_show_status(handlerton *const hton, THD *const thd, // sure that output will look unified. DBUG_ASSERT(commit_latency_stats != nullptr); - snprintf(buf, sizeof(buf), "rocksdb.commit_latency statistics " - "Percentiles :=> 50 : %.2f 95 : %.2f " - "99 : %.2f 100 : %.2f\n", + snprintf(buf, sizeof(buf), + "rocksdb.commit_latency statistics " + "Percentiles :=> 50 : %.2f 95 : %.2f " + "99 : %.2f 100 : %.2f\n", commit_latency_stats->Percentile(50), commit_latency_stats->Percentile(95), commit_latency_stats->Percentile(99), @@ -3932,8 +4313,9 @@ static bool rocksdb_show_status(handlerton *const hton, THD *const thd, } if (rdb->GetIntProperty("rocksdb.actual-delayed-write-rate", &v)) { - snprintf(buf, sizeof(buf), "rocksdb.actual_delayed_write_rate " - "COUNT : %lu\n", + snprintf(buf, sizeof(buf), + "rocksdb.actual_delayed_write_rate " + "COUNT : %lu\n", v); str.append(buf); } @@ -4021,6 +4403,7 @@ static bool rocksdb_show_status(handlerton *const hton, THD *const thd, rocksdb::Status s = rdb->GetEnv()->GetThreadList(&thread_list); if (!s.ok()) { + // NO_LINT_DEBUG sql_print_error("RocksDB: Returned error (%s) from GetThreadList.\n", s.ToString().c_str()); res |= true; @@ -4037,36 +4420,22 @@ static bool rocksdb_show_status(handlerton *const hton, THD *const thd, "\noperation_type: " + it.GetOperationName(it.operation_type) + "\noperation_stage: " + it.GetOperationStageName(it.operation_stage) + - "\nelapsed_time_ms: " + - it.MicrosToString(it.op_elapsed_micros); + "\nelapsed_time_ms: " + it.MicrosToString(it.op_elapsed_micros); - for (auto &it_props : - it.InterpretOperationProperties(it.operation_type, - it.op_properties)) { + for (auto &it_props : it.InterpretOperationProperties( + it.operation_type, it.op_properties)) { str += "\n" + it_props.first + ": " + std::to_string(it_props.second); } str += "\nstate_type: " + it.GetStateName(it.state_type); - res |= print_stats(thd, "BG_THREADS", std::to_string(it.thread_id), - str, stat_print); + res |= print_stats(thd, "BG_THREADS", std::to_string(it.thread_id), str, + stat_print); } } /* Explicit snapshot information */ - str.clear(); - { - std::lock_guard lock(explicit_snapshot_mutex); - for (const auto &elem : explicit_snapshots) { - const auto &ss = elem.second.lock(); - DBUG_ASSERT(ss != nullptr); - const auto &info = ss->ss_info; - str += "\nSnapshot ID: " + std::to_string(info.snapshot_id) + - "\nBinlog File: " + info.binlog_file + - "\nBinlog Pos: " + std::to_string(info.binlog_pos) + - "\nGtid Executed: " + info.gtid_executed + "\n"; - } - } + str = Rdb_explicit_snapshot::dump_snapshots(); if (!str.empty()) { res |= print_stats(thd, "EXPLICIT_SNAPSHOTS", "rocksdb", str, stat_print); } @@ -4095,38 +4464,38 @@ static bool rocksdb_explicit_snapshot( snapshot_info_st *ss_info) /*!< out: Snapshot information */ { switch (ss_info->op) { - case snapshot_operation::SNAPSHOT_CREATE: { - 
if (mysql_bin_log_is_open()) { - mysql_bin_log_lock_commits(ss_info); + case snapshot_operation::SNAPSHOT_CREATE: { + if (mysql_bin_log_is_open()) { + mysql_bin_log_lock_commits(ss_info); + } + auto s = Rdb_explicit_snapshot::create(ss_info, rdb, rdb->GetSnapshot()); + if (mysql_bin_log_is_open()) { + mysql_bin_log_unlock_commits(ss_info); + } + + thd->set_explicit_snapshot(s); + return s == nullptr; } - auto s = Rdb_explicit_snapshot::create(ss_info, rdb, rdb->GetSnapshot()); - if (mysql_bin_log_is_open()) { - mysql_bin_log_unlock_commits(ss_info); + case snapshot_operation::SNAPSHOT_ATTACH: { + auto s = Rdb_explicit_snapshot::get(ss_info->snapshot_id); + if (!s) { + return true; + } + *ss_info = s->ss_info; + thd->set_explicit_snapshot(s); + return false; } - - thd->set_explicit_snapshot(s); - return s == nullptr; - } - case snapshot_operation::SNAPSHOT_ATTACH: { - auto s = Rdb_explicit_snapshot::get(ss_info->snapshot_id); - if (!s) { - return true; + case snapshot_operation::SNAPSHOT_RELEASE: { + if (!thd->get_explicit_snapshot()) { + return true; + } + *ss_info = thd->get_explicit_snapshot()->ss_info; + thd->set_explicit_snapshot(nullptr); + return false; } - *ss_info = s->ss_info; - thd->set_explicit_snapshot(s); - return false; - } - case snapshot_operation::SNAPSHOT_RELEASE: { - if (!thd->get_explicit_snapshot()) { + default: + DBUG_ASSERT(false); return true; - } - *ss_info = thd->get_explicit_snapshot()->ss_info; - thd->set_explicit_snapshot(nullptr); - return false; - } - default: - DBUG_ASSERT(false); - return true; } return true; } @@ -4247,7 +4616,7 @@ static int rocksdb_start_tx_with_shared_read_view( // case: an explicit snapshot was not assigned to this transaction if (!tx->m_explicit_snapshot) { tx->m_explicit_snapshot = - Rdb_explicit_snapshot::create(ss_info, rdb, tx->m_read_opts.snapshot); + Rdb_explicit_snapshot::create(ss_info, rdb, tx->m_read_opts.snapshot); if (!tx->m_explicit_snapshot) { my_printf_error(ER_UNKNOWN_ERROR, "Could not create snapshot", MYF(0)); error = HA_EXIT_FAILURE; @@ -4287,9 +4656,8 @@ static int rocksdb_rollback_to_savepoint(handlerton *const hton, THD *const thd, return tx->rollback_to_savepoint(savepoint); } -static bool -rocksdb_rollback_to_savepoint_can_release_mdl(handlerton *const hton, - THD *const thd) { +static bool rocksdb_rollback_to_savepoint_can_release_mdl( + handlerton *const /* hton */, THD *const /* thd */) { return true; } @@ -4336,7 +4704,7 @@ static void rocksdb_update_table_stats( /* Function needs to return void because of the interface and we've * detected an error which shouldn't happen. There's no way to let * caller know that something failed. - */ + */ SHIP_ASSERT(false); return; } @@ -4416,8 +4784,9 @@ static rocksdb::Status check_rocksdb_options_compatibility( } if (loaded_cf_descs.size() != cf_descr.size()) { - return rocksdb::Status::NotSupported("Mismatched size of column family " - "descriptors."); + return rocksdb::Status::NotSupported( + "Mismatched size of column family " + "descriptors."); } // Please see RocksDB documentation for more context about why we need to set @@ -4455,17 +4824,22 @@ static int rocksdb_init_func(void *const p) { DBUG_ENTER_FUNC(); if (rdb_check_rocksdb_corruption()) { - sql_print_error("RocksDB: There was a corruption detected in RockDB files. " - "Check error log emitted earlier for more details."); + // NO_LINT_DEBUG + sql_print_error( + "RocksDB: There was a corruption detected in RockDB files. 
" + "Check error log emitted earlier for more details."); if (rocksdb_allow_to_start_after_corruption) { + // NO_LINT_DEBUG sql_print_information( "RocksDB: Remove rocksdb_allow_to_start_after_corruption to prevent " "server operating if RocksDB corruption is detected."); } else { - sql_print_error("RocksDB: The server will exit normally and stop restart " - "attempts. Remove %s file from data directory and " - "start mysqld manually.", - rdb_corruption_marker_file_name().c_str()); + // NO_LINT_DEBUG + sql_print_error( + "RocksDB: The server will exit normally and stop restart " + "attempts. Remove %s file from data directory and " + "start mysqld manually.", + rdb_corruption_marker_file_name().c_str()); exit(0); } } @@ -4476,8 +4850,10 @@ static int rocksdb_init_func(void *const p) { init_rocksdb_psi_keys(); rocksdb_hton = (handlerton *)p; - mysql_mutex_init(rdb_psi_open_tbls_mutex_key, &rdb_open_tables.m_mutex, - MY_MUTEX_INIT_FAST); + + rdb_open_tables.init(); + Ensure_cleanup rdb_open_tables_cleanup([]() { rdb_open_tables.free(); }); + #ifdef HAVE_PSI_INTERFACE rdb_bg_thread.init(rdb_signal_bg_psi_mutex_key, rdb_signal_bg_psi_cond_key); rdb_drop_idx_thread.init(rdb_signal_drop_idx_psi_mutex_key, @@ -4504,7 +4880,6 @@ static int rocksdb_init_func(void *const p) { MY_MUTEX_INIT_FAST); mysql_mutex_init(rdb_block_cache_resize_mutex_key, &rdb_block_cache_resize_mutex, MY_MUTEX_INIT_FAST); - rdb_open_tables.init_hash(); Rdb_transaction::init_mutex(); rocksdb_hton->state = SHOW_OPTION_YES; @@ -4528,6 +4903,7 @@ static int rocksdb_init_func(void *const p) { rocksdb_rollback_to_savepoint_can_release_mdl; rocksdb_hton->update_table_stats = rocksdb_update_table_stats; rocksdb_hton->flush_logs = rocksdb_flush_wal; + rocksdb_hton->handle_single_table_select = rocksdb_handle_single_table_select; rocksdb_hton->flags = HTON_TEMPORARY_NOT_SUPPORTED | HTON_SUPPORTS_EXTENDED_KEYS | HTON_CAN_RECREATE; @@ -4535,16 +4911,23 @@ static int rocksdb_init_func(void *const p) { DBUG_ASSERT(!mysqld_embedded); if (rocksdb_db_options->max_open_files > (long)open_files_limit) { - sql_print_information("RocksDB: rocksdb_max_open_files should not be " - "greater than the open_files_limit, effective value " - "of rocksdb_max_open_files is being set to " - "open_files_limit / 2."); + // NO_LINT_DEBUG + sql_print_information( + "RocksDB: rocksdb_max_open_files should not be " + "greater than the open_files_limit, effective value " + "of rocksdb_max_open_files is being set to " + "open_files_limit / 2."); rocksdb_db_options->max_open_files = open_files_limit / 2; } else if (rocksdb_db_options->max_open_files == -2) { rocksdb_db_options->max_open_files = open_files_limit / 2; } + rdb_read_free_regex_handler.set_patterns(DEFAULT_READ_FREE_RPL_TABLES); + rocksdb_stats = rocksdb::CreateDBStatistics(); + rocksdb_stats->set_stats_level( + static_cast(rocksdb_stats_level)); + rocksdb_stats_level = rocksdb_stats->get_stats_level(); rocksdb_db_options->statistics = rocksdb_stats; if (rocksdb_rate_limiter_bytes_per_sec != 0) { @@ -4578,14 +4961,15 @@ static int rocksdb_init_func(void *const p) { rocksdb_db_options->use_direct_reads) { // allow_mmap_reads implies !use_direct_reads and RocksDB will not open if // mmap_reads and direct_reads are both on. 
(NO_LINT_DEBUG) - sql_print_error("RocksDB: Can't enable both use_direct_reads " - "and allow_mmap_reads\n"); - rdb_open_tables.free_hash(); + sql_print_error( + "RocksDB: Can't enable both use_direct_reads " + "and allow_mmap_reads\n"); DBUG_RETURN(HA_EXIT_FAILURE); } // Check whether the filesystem backing rocksdb_datadir allows O_DIRECT - if (rocksdb_db_options->use_direct_reads) { + if (rocksdb_db_options->use_direct_reads || + rocksdb_db_options->use_direct_io_for_flush_and_compaction) { rocksdb::EnvOptions soptions; rocksdb::Status check_status; rocksdb::Env *const env = rocksdb_db_options->env; @@ -4606,9 +4990,11 @@ static int rocksdb_init_func(void *const p) { } if (!check_status.ok()) { - sql_print_error("RocksDB: Unable to use direct io in rocksdb-datadir:" - "(%s)", check_status.getState()); - rdb_open_tables.free_hash(); + // NO_LINT_DEBUG + sql_print_error( + "RocksDB: Unable to use direct io in rocksdb-datadir:" + "(%s)", + check_status.getState()); DBUG_RETURN(HA_EXIT_FAILURE); } } @@ -4616,18 +5002,19 @@ static int rocksdb_init_func(void *const p) { if (rocksdb_db_options->allow_mmap_writes && rocksdb_db_options->use_direct_io_for_flush_and_compaction) { // See above comment for allow_mmap_reads. (NO_LINT_DEBUG) - sql_print_error("RocksDB: Can't enable both " - "use_direct_io_for_flush_and_compaction and " - "allow_mmap_writes\n"); - rdb_open_tables.free_hash(); + sql_print_error( + "RocksDB: Can't enable both " + "use_direct_io_for_flush_and_compaction and " + "allow_mmap_writes\n"); DBUG_RETURN(HA_EXIT_FAILURE); } if (rocksdb_db_options->allow_mmap_writes && rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) { // NO_LINT_DEBUG - sql_print_error("RocksDB: rocksdb_flush_log_at_trx_commit needs to be 0 " - "to use allow_mmap_writes"); + sql_print_error( + "RocksDB: rocksdb_flush_log_at_trx_commit needs to be 0 " + "to use allow_mmap_writes"); DBUG_RETURN(HA_EXIT_FAILURE); } @@ -4649,17 +5036,21 @@ static int rocksdb_init_func(void *const p) { Checking system errno happens to work right now. */ if (status.IsIOError() && errno == ENOENT) { + // NO_LINT_DEBUG sql_print_information("RocksDB: Got ENOENT when listing column families"); + + // NO_LINT_DEBUG sql_print_information( "RocksDB: assuming that we're creating a new database"); } else { - rdb_open_tables.free_hash(); rdb_log_status_error(status, "Error listing column families"); DBUG_RETURN(HA_EXIT_FAILURE); } - } else + } else { + // NO_LINT_DEBUG sql_print_information("RocksDB: %ld column families found", cf_names.size()); + } std::vector cf_descr; std::vector cf_handles; @@ -4668,9 +5059,33 @@ static int rocksdb_init_func(void *const p) { (rocksdb::BlockBasedTableOptions::IndexType)rocksdb_index_type; if (!rocksdb_tbl_options->no_block_cache) { - std::shared_ptr block_cache = rocksdb_use_clock_cache - ? rocksdb::NewClockCache(rocksdb_block_cache_size) - : rocksdb::NewLRUCache(rocksdb_block_cache_size); + std::shared_ptr memory_allocator; + if (!rocksdb_cache_dump) { + size_t block_size = rocksdb_tbl_options->block_size; + rocksdb::JemallocAllocatorOptions alloc_opt; + // Limit jemalloc tcache memory usage. The range + // [block_size/4, block_size] should be enough to cover most of + // block cache allocation sizes. 
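
The Ensure_cleanup guard registered at the top of rocksdb_init_func()
(rdb_open_tables.init() followed by Ensure_cleanup rdb_open_tables_cleanup)
is what lets every rdb_open_tables.free_hash() call disappear from the
error paths in these hunks: any early DBUG_RETURN before the final skip()
now releases the table map automatically. The real class is defined
elsewhere in MyRocks, so this shape is an assumption about it, not a copy:

  #include <functional>
  #include <utility>

  // Scope guard: runs the cleanup closure on destruction unless the
  // success path called skip() first.
  class Ensure_cleanup_sketch {
   public:
    explicit Ensure_cleanup_sketch(std::function<void()> fn)
        : m_fn(std::move(fn)) {}
    ~Ensure_cleanup_sketch() {
      if (!m_skip) m_fn();  // error path: release the resource
    }
    void skip() { m_skip = true; }  // success path: keep the resource

   private:
    std::function<void()> m_fn;
    bool m_skip = false;
  };
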
+ alloc_opt.limit_tcache_size = true; + alloc_opt.tcache_size_lower_bound = block_size / 4; + alloc_opt.tcache_size_upper_bound = block_size; + rocksdb::Status new_alloc_status = + rocksdb::NewJemallocNodumpAllocator(alloc_opt, &memory_allocator); + if (!new_alloc_status.ok()) { + // Fallback to use default malloc/free. + rdb_log_status_error(new_alloc_status, + "Error excluding block cache from core dump"); + memory_allocator = nullptr; + DBUG_RETURN(HA_EXIT_FAILURE); + } + } + std::shared_ptr block_cache = + rocksdb_use_clock_cache + ? rocksdb::NewClockCache(rocksdb_block_cache_size) + : rocksdb::NewLRUCache( + rocksdb_block_cache_size, -1 /*num_shard_bits*/, + false /*strict_capcity_limit*/, + rocksdb_cache_high_pri_pool_ratio, memory_allocator); if (rocksdb_sim_cache_size > 0) { // Simulated cache enabled // Wrap block cache inside a simulated cache and pass it to RocksDB @@ -4705,7 +5120,7 @@ static int rocksdb_init_func(void *const p) { if (rocksdb_persistent_cache_size_mb > 0) { std::shared_ptr pcache; - uint64_t cache_size_bytes= rocksdb_persistent_cache_size_mb * 1024 * 1024; + uint64_t cache_size_bytes = rocksdb_persistent_cache_size_mb * 1024 * 1024; status = rocksdb::NewPersistentCache( rocksdb::Env::Default(), std::string(rocksdb_persistent_cache_path), cache_size_bytes, myrocks_logger, true, &pcache); @@ -4717,6 +5132,7 @@ static int rocksdb_init_func(void *const p) { } rocksdb_tbl_options->persistent_cache = pcache; } else if (strlen(rocksdb_persistent_cache_path)) { + // NO_LINT_DEBUG sql_print_error("RocksDB: Must specify rocksdb_persistent_cache_size_mb"); DBUG_RETURN(HA_EXIT_FAILURE); } @@ -4727,7 +5143,6 @@ static int rocksdb_init_func(void *const p) { rocksdb_override_cf_options)) { // NO_LINT_DEBUG sql_print_error("RocksDB: Failed to initialize CF options map."); - rdb_open_tables.free_hash(); DBUG_RETURN(HA_EXIT_FAILURE); } @@ -4735,17 +5150,23 @@ static int rocksdb_init_func(void *const p) { If there are no column families, we're creating the new database. Create one column family named "default". 
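
Two details in the block-cache hunk above are easy to miss. First, the
tcache bounds: with limit_tcache_size set, RocksDB's jemalloc wrapper only
routes allocations whose size falls inside the configured window through
the per-thread cache (an assumption about the wrapper's behavior, not
something the hunk states), so [block_size/4, block_size] keeps the fast
path for typical block allocations while capping pinned memory. Second,
the five positional arguments to NewLRUCache, whose inline comment also
misspells strict_capacity_limit; named out, assuming RocksDB's
five-argument overload:

  #include <memory>
  #include <rocksdb/cache.h>
  #include <rocksdb/memory_allocator.h>

  std::shared_ptr<rocksdb::Cache> make_block_cache(
      size_t capacity, double high_pri_pool_ratio,
      std::shared_ptr<rocksdb::MemoryAllocator> allocator) {
    return rocksdb::NewLRUCache(capacity,
                                -1,     // num_shard_bits: let RocksDB pick
                                false,  // strict_capacity_limit
                                high_pri_pool_ratio, allocator);
  }
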
*/ - if (cf_names.size() == 0) - cf_names.push_back(DEFAULT_CF_NAME); + if (cf_names.size() == 0) cf_names.push_back(DEFAULT_CF_NAME); std::vector compaction_enabled_cf_indices; + + // NO_LINT_DEBUG sql_print_information("RocksDB: Column Families at start:"); for (size_t i = 0; i < cf_names.size(); ++i) { rocksdb::ColumnFamilyOptions opts; cf_options_map->get_cf_options(cf_names[i], &opts); + // NO_LINT_DEBUG sql_print_information(" cf=%s", cf_names[i].c_str()); + + // NO_LINT_DEBUG sql_print_information(" write_buffer_size=%ld", opts.write_buffer_size); + + // NO_LINT_DEBUG sql_print_information(" target_file_size_base=%" PRIu64, opts.target_file_size_base); @@ -4777,7 +5198,6 @@ static int rocksdb_init_func(void *const p) { if (!status.ok()) { rdb_log_status_error( status, "Compatibility check against existing database options failed"); - rdb_open_tables.free_hash(); DBUG_RETURN(HA_EXIT_FAILURE); } @@ -4786,7 +5206,6 @@ static int rocksdb_init_func(void *const p) { if (!status.ok()) { rdb_log_status_error(status, "Error opening instance"); - rdb_open_tables.free_hash(); DBUG_RETURN(HA_EXIT_FAILURE); } cf_manager.init(std::move(cf_options_map), &cf_handles); @@ -4794,21 +5213,18 @@ static int rocksdb_init_func(void *const p) { if (dict_manager.init(rdb, &cf_manager)) { // NO_LINT_DEBUG sql_print_error("RocksDB: Failed to initialize data dictionary."); - rdb_open_tables.free_hash(); DBUG_RETURN(HA_EXIT_FAILURE); } if (binlog_manager.init(&dict_manager)) { // NO_LINT_DEBUG sql_print_error("RocksDB: Failed to initialize binlog manager."); - rdb_open_tables.free_hash(); DBUG_RETURN(HA_EXIT_FAILURE); } if (ddl_manager.init(&dict_manager, &cf_manager, rocksdb_validate_tables)) { // NO_LINT_DEBUG sql_print_error("RocksDB: Failed to initialize DDL manager."); - rdb_open_tables.free_hash(); DBUG_RETURN(HA_EXIT_FAILURE); } @@ -4828,33 +5244,32 @@ static int rocksdb_init_func(void *const p) { if (!status.ok()) { rdb_log_status_error(status, "Error enabling compaction"); - rdb_open_tables.free_hash(); DBUG_RETURN(HA_EXIT_FAILURE); } - auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME -#ifdef HAVE_PSI_INTERFACE - , - rdb_background_psi_thread_key +#ifndef HAVE_PSI_INTERFACE + auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME); +#else + auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME, + rdb_background_psi_thread_key); #endif - ); if (err != 0) { + // NO_LINT_DEBUG sql_print_error("RocksDB: Couldn't start the background thread: (errno=%d)", err); - rdb_open_tables.free_hash(); DBUG_RETURN(HA_EXIT_FAILURE); } - err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME -#ifdef HAVE_PSI_INTERFACE - , - rdb_drop_idx_psi_thread_key +#ifndef HAVE_PSI_INTERFACE + err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME); +#else + err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME, + rdb_drop_idx_psi_thread_key); #endif - ); if (err != 0) { + // NO_LINT_DEBUG sql_print_error("RocksDB: Couldn't start the drop index thread: (errno=%d)", err); - rdb_open_tables.free_hash(); DBUG_RETURN(HA_EXIT_FAILURE); } @@ -4869,7 +5284,6 @@ static int rocksdb_init_func(void *const p) { sql_print_error( "RocksDB: Couldn't start the manual compaction thread: (errno=%d)", err); - rdb_open_tables.free_hash(); DBUG_RETURN(HA_EXIT_FAILURE); } @@ -4896,7 +5310,6 @@ static int rocksdb_init_func(void *const p) { if (err != 0) { // NO_LINT_DEBUG sql_print_error("RocksDB: Couldn't initialize error messages"); - rdb_open_tables.free_hash(); DBUG_RETURN(HA_EXIT_FAILURE); } @@ -4916,12 +5329,16 @@ static int 
rocksdb_init_func(void *const p) { directories.push_back(myrocks::rocksdb_wal_dir); } - io_watchdog = new Rdb_io_watchdog(directories); + io_watchdog = new Rdb_io_watchdog(std::move(directories)); io_watchdog->reset_timeout(rocksdb_io_write_timeout_secs); // NO_LINT_DEBUG - sql_print_information("MyRocks storage engine plugin has been successfully " - "initialized."); + sql_print_information( + "MyRocks storage engine plugin has been successfully " + "initialized."); + + // Skip cleaning up rdb_open_tables as we've succeeded + rdb_open_tables_cleanup.skip(); DBUG_RETURN(HA_EXIT_SUCCESS); } @@ -4978,14 +5395,13 @@ static int rocksdb_done_func(void *const p) { "RocksDB: Couldn't stop the manual compaction thread: (errno=%d)", err); } - if (rdb_open_tables.m_hash.records) { + if (rdb_open_tables.count()) { // Looks like we are getting unloaded and yet we have some open tables // left behind. error = 1; } - rdb_open_tables.free_hash(); - mysql_mutex_destroy(&rdb_open_tables.m_mutex); + rdb_open_tables.free(); mysql_mutex_destroy(&rdb_sysvars_mutex); mysql_mutex_destroy(&rdb_block_cache_resize_mutex); @@ -5051,7 +5467,7 @@ static inline void rocksdb_smart_next(bool seek_backward, } } -#ifndef NDEBUG +#ifndef DBUG_OFF // simulate that RocksDB has reported corrupted data static void dbug_change_status_to_corrupted(rocksdb::Status *status) { *status = rocksdb::Status::Corruption(); @@ -5086,40 +5502,39 @@ static inline bool is_valid(rocksdb::Iterator *scan_it) { they are needed to function. */ -Rdb_table_handler * -Rdb_open_tables_map::get_table_handler(const char *const table_name) { +Rdb_table_handler *Rdb_open_tables_map::get_table_handler( + const char *const table_name) { + DBUG_ASSERT(table_name != nullptr); + Rdb_table_handler *table_handler; - uint length; - char *tmp_name; - DBUG_ASSERT(table_name != nullptr); - length = (uint)strlen(table_name); + std::string table_name_str(table_name); // First, look up the table in the hash map. RDB_MUTEX_LOCK_CHECK(m_mutex); - if (!(table_handler = reinterpret_cast(my_hash_search( - &m_hash, reinterpret_cast(table_name), length)))) { + const auto it = m_table_map.find(table_name_str); + if (it != m_table_map.end()) { + // Found it + table_handler = it->second; + } else { + char *tmp_name; + // Since we did not find it in the hash map, attempt to create and add it // to the hash map. if (!(table_handler = reinterpret_cast(my_multi_malloc( MYF(MY_WME | MY_ZEROFILL), &table_handler, sizeof(*table_handler), - &tmp_name, length + 1, NullS)))) { + &tmp_name, table_name_str.length() + 1, NullS)))) { // Allocating a new Rdb_table_handler and a new table name failed. RDB_MUTEX_UNLOCK_CHECK(m_mutex); return nullptr; } table_handler->m_ref_count = 0; - table_handler->m_table_name_length = length; + table_handler->m_table_name_length = table_name_str.length(); table_handler->m_table_name = tmp_name; strmov(table_handler->m_table_name, table_name); - if (my_hash_insert(&m_hash, reinterpret_cast(table_handler))) { - // Inserting into the hash map failed. 
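
The my_hash to std::unordered_map migration in get_table_handler() below
(and its counterpart in release_table_handler further down) follows a
standard intern-table pattern: one shared, reference-counted entry per
table name, created under the mutex on first open and torn down on last
release. A simplified sketch with hypothetical types — the real code
allocates Rdb_table_handler with my_multi_malloc, uses the RDB_MUTEX_*
wrappers, and also initializes thr_lock and perf counters:

  #include <mutex>
  #include <string>
  #include <unordered_map>

  struct HandlerSketch {
    int ref_count = 0;
    std::string name;
  };

  class TableMapSketch {
   public:
    HandlerSketch *acquire(const std::string &name) {
      std::lock_guard<std::mutex> guard(m_mutex);
      auto it = m_map.find(name);
      HandlerSketch *h;
      if (it != m_map.end()) {
        h = it->second;           // reuse the entry all handlers share
      } else {
        h = new HandlerSketch();  // first opener creates it
        h->name = name;
        m_map.emplace(name, h);
      }
      ++h->ref_count;
      return h;
    }

    void release(HandlerSketch *h) {
      std::lock_guard<std::mutex> guard(m_mutex);
      if (--h->ref_count == 0) {  // last user tears the entry down
        m_map.erase(h->name);
        delete h;
      }
    }

   private:
    std::mutex m_mutex;
    std::unordered_map<std::string, HandlerSketch *> m_map;
  };
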
- RDB_MUTEX_UNLOCK_CHECK(m_mutex); - my_free(table_handler); - return nullptr; - } + m_table_map.emplace(table_name_str, table_handler); thr_lock_init(&table_handler->m_thr_lock); table_handler->m_io_perf_read.init(); @@ -5138,18 +5553,15 @@ std::vector rdb_get_open_table_names(void) { } std::vector Rdb_open_tables_map::get_table_names(void) const { - ulong i; const Rdb_table_handler *table_handler; std::vector names; RDB_MUTEX_LOCK_CHECK(m_mutex); - for (i = 0; (table_handler = reinterpret_cast( - my_hash_const_element(&m_hash, i))); - i++) { + for (const auto &kv : m_table_map) { + table_handler = kv.second; DBUG_ASSERT(table_handler != nullptr); names.push_back(table_handler->m_table_name); } - DBUG_ASSERT(i == m_hash.records); RDB_MUTEX_UNLOCK_CHECK(m_mutex); return names; @@ -5162,44 +5574,44 @@ std::vector Rdb_open_tables_map::get_table_names(void) const { static ulonglong rdb_get_int_col_max_value(const Field *field) { ulonglong max_value = 0; switch (field->key_type()) { - case HA_KEYTYPE_BINARY: - max_value = 0xFFULL; - break; - case HA_KEYTYPE_INT8: - max_value = 0x7FULL; - break; - case HA_KEYTYPE_USHORT_INT: - max_value = 0xFFFFULL; - break; - case HA_KEYTYPE_SHORT_INT: - max_value = 0x7FFFULL; - break; - case HA_KEYTYPE_UINT24: - max_value = 0xFFFFFFULL; - break; - case HA_KEYTYPE_INT24: - max_value = 0x7FFFFFULL; - break; - case HA_KEYTYPE_ULONG_INT: - max_value = 0xFFFFFFFFULL; - break; - case HA_KEYTYPE_LONG_INT: - max_value = 0x7FFFFFFFULL; - break; - case HA_KEYTYPE_ULONGLONG: - max_value = 0xFFFFFFFFFFFFFFFFULL; - break; - case HA_KEYTYPE_LONGLONG: - max_value = 0x7FFFFFFFFFFFFFFFULL; - break; - case HA_KEYTYPE_FLOAT: - max_value = 0x1000000ULL; - break; - case HA_KEYTYPE_DOUBLE: - max_value = 0x20000000000000ULL; - break; - default: - abort(); + case HA_KEYTYPE_BINARY: + max_value = 0xFFULL; + break; + case HA_KEYTYPE_INT8: + max_value = 0x7FULL; + break; + case HA_KEYTYPE_USHORT_INT: + max_value = 0xFFFFULL; + break; + case HA_KEYTYPE_SHORT_INT: + max_value = 0x7FFFULL; + break; + case HA_KEYTYPE_UINT24: + max_value = 0xFFFFFFULL; + break; + case HA_KEYTYPE_INT24: + max_value = 0x7FFFFFULL; + break; + case HA_KEYTYPE_ULONG_INT: + max_value = 0xFFFFFFFFULL; + break; + case HA_KEYTYPE_LONG_INT: + max_value = 0x7FFFFFFFULL; + break; + case HA_KEYTYPE_ULONGLONG: + max_value = 0xFFFFFFFFFFFFFFFFULL; + break; + case HA_KEYTYPE_LONGLONG: + max_value = 0x7FFFFFFFFFFFFFFFULL; + break; + case HA_KEYTYPE_FLOAT: + max_value = 0x1000000ULL; + break; + case HA_KEYTYPE_DOUBLE: + max_value = 0x20000000000000ULL; + break; + default: + abort(); } return max_value; @@ -5208,7 +5620,7 @@ static ulonglong rdb_get_int_col_max_value(const Field *field) { void ha_rocksdb::load_auto_incr_value() { ulonglong auto_incr = 0; bool validate_last = false, use_datadic = true; -#ifndef NDEBUG +#ifndef DBUG_OFF DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", use_datadic = false;); validate_last = true; #endif @@ -5255,7 +5667,7 @@ ulonglong ha_rocksdb::load_auto_incr_value_from_index() { // appropriately and non-index-only lookup will not read the value) const bool save_keyread_only = m_keyread_only; m_keyread_only = true; - m_key_requested = true; + m_converter->set_is_key_requested(true); if (!index_last(table->record[0])) { Field *field = @@ -5267,7 +5679,7 @@ ulonglong ha_rocksdb::load_auto_incr_value_from_index() { if (last_val != max_val) { last_val++; } -#ifndef NDEBUG +#ifndef DBUG_OFF ulonglong dd_val; if (last_val <= max_val) { const auto &gl_index_id = m_tbl_def->get_autoincr_gl_index_id(); @@ 
-5392,8 +5804,9 @@ int ha_rocksdb::read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id) { // Get hidden primary key from old key slice Rdb_string_reader reader(&rowkey_slice); - if ((!reader.read(Rdb_key_def::INDEX_NUMBER_SIZE))) + if ((!reader.read(Rdb_key_def::INDEX_NUMBER_SIZE))) { return HA_ERR_ROCKSDB_CORRUPT_DATA; + } const int length = Field_longlong::PACK_LENGTH; const uchar *from = reinterpret_cast(reader.read(length)); @@ -5422,8 +5835,8 @@ void Rdb_open_tables_map::release_table_handler( if (!--table_handler->m_ref_count) { // Last rereference was released. Tear down the hash entry. const auto ret MY_ATTRIBUTE((__unused__)) = - my_hash_delete(&m_hash, reinterpret_cast(table_handler)); - DBUG_ASSERT(!ret); // the hash entry must actually be found and deleted + m_table_map.erase(std::string(table_handler->m_table_name)); + DBUG_ASSERT(ret == 1); // the hash entry must actually be found and deleted my_core::thr_lock_delete(&table_handler->m_thr_lock); my_free(table_handler); } @@ -5439,19 +5852,34 @@ static handler *rocksdb_create_handler(my_core::handlerton *const hton, ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton, my_core::TABLE_SHARE *const table_arg) - : handler(hton, table_arg), m_table_handler(nullptr), m_scan_it(nullptr), - m_scan_it_skips_bloom(false), m_scan_it_snapshot(nullptr), - m_scan_it_lower_bound(nullptr), m_scan_it_upper_bound(nullptr), - m_tbl_def(nullptr), m_pk_descr(nullptr), m_key_descr_arr(nullptr), - m_pk_can_be_decoded(false), m_maybe_unpack_info(false), - m_pk_tuple(nullptr), m_pk_packed_tuple(nullptr), - m_sk_packed_tuple(nullptr), m_end_key_packed_tuple(nullptr), - m_sk_match_prefix(nullptr), m_sk_match_prefix_buf(nullptr), - m_sk_packed_tuple_old(nullptr), m_dup_sk_packed_tuple(nullptr), - m_dup_sk_packed_tuple_old(nullptr), m_pack_buffer(nullptr), - m_lock_rows(RDB_LOCK_NONE), m_keyread_only(FALSE), m_encoder_arr(nullptr), - m_row_checksums_checked(0), m_in_rpl_delete_rows(false), - m_in_rpl_update_rows(false), m_force_skip_unique_check(false) {} + : handler(hton, table_arg), + m_table_handler(nullptr), + m_scan_it(nullptr), + m_scan_it_skips_bloom(false), + m_scan_it_snapshot(nullptr), + m_scan_it_lower_bound(nullptr), + m_scan_it_upper_bound(nullptr), + m_tbl_def(nullptr), + m_pk_descr(nullptr), + m_key_descr_arr(nullptr), + m_pk_can_be_decoded(false), + m_pk_tuple(nullptr), + m_pk_packed_tuple(nullptr), + m_sk_packed_tuple(nullptr), + m_end_key_packed_tuple(nullptr), + m_sk_match_prefix(nullptr), + m_sk_match_prefix_buf(nullptr), + m_sk_packed_tuple_old(nullptr), + m_dup_sk_packed_tuple(nullptr), + m_dup_sk_packed_tuple_old(nullptr), + m_pack_buffer(nullptr), + m_lock_rows(RDB_LOCK_NONE), + m_keyread_only(false), + m_insert_with_update(false), + m_dup_pk_found(false), + m_in_rpl_delete_rows(false), + m_in_rpl_update_rows(false), + m_force_skip_unique_check(false) {} static const char *ha_rocksdb_exts[] = {NullS}; @@ -5477,9 +5905,9 @@ bool ha_rocksdb::init_with_fields() { if (pk != MAX_KEY) { const uint key_parts = table_share->key_info[pk].user_defined_key_parts; check_keyread_allowed(pk /*PK*/, key_parts - 1, true); - } else + } else { m_pk_can_be_decoded = false; - + } cached_table_flags = table_flags(); DBUG_RETURN(false); /* Ok */ @@ -5536,298 +5964,52 @@ bool ha_rocksdb::should_hide_ttl_rec(const Rdb_key_def &kd, RDB_MAX_HEXDUMP_LEN); const GL_INDEX_ID gl_index_id = kd.get_gl_index_id(); // NO_LINT_DEBUG - sql_print_error("Decoding ttl from PK value failed, " - "for index (%u,%u), val: %s", - gl_index_id.cf_id, 
gl_index_id.index_id, buf.c_str()); + sql_print_error( + "Decoding ttl from PK value failed, " + "for index (%u,%u), val: %s", + gl_index_id.cf_id, gl_index_id.index_id, buf.c_str()); DBUG_ASSERT(0); return false; } /* Hide record if it has expired before the current snapshot time. */ uint64 read_filter_ts = 0; -#ifndef NDEBUG +#ifndef DBUG_OFF read_filter_ts += rdb_dbug_set_ttl_read_filter_ts(); #endif bool is_hide_ttl = ts + kd.m_ttl_duration + read_filter_ts <= static_cast(curr_ts); if (is_hide_ttl) { update_row_stats(ROWS_FILTERED); + + /* increment examined row count when rows are skipped */ + THD *thd = ha_thd(); + thd->inc_examined_row_count(1); + DEBUG_SYNC(thd, "rocksdb.ttl_rows_examined"); } return is_hide_ttl; } -void ha_rocksdb::rocksdb_skip_expired_records(const Rdb_key_def &kd, - rocksdb::Iterator *const iter, - bool seek_backward) { +int ha_rocksdb::rocksdb_skip_expired_records(const Rdb_key_def &kd, + rocksdb::Iterator *const iter, + bool seek_backward) { if (kd.has_ttl()) { + THD *thd = ha_thd(); while (iter->Valid() && should_hide_ttl_rec( kd, iter->value(), get_or_create_tx(table->in_use)->m_snapshot_timestamp)) { - rocksdb_smart_next(seek_backward, iter); - } - } -} - -/** - Convert record from table->record[0] form into a form that can be written - into rocksdb. - - @param pk_packed_slice Packed PK tuple. We need it in order to compute - and store its CRC. - @param packed_rec OUT Data slice with record data. -*/ - -int ha_rocksdb::convert_record_to_storage_format( - const struct update_row_info &row_info, rocksdb::Slice *const packed_rec) { - DBUG_ASSERT_IMP(m_maybe_unpack_info, row_info.new_pk_unpack_info); - DBUG_ASSERT(m_pk_descr != nullptr); - - const rocksdb::Slice &pk_packed_slice = row_info.new_pk_slice; - Rdb_string_writer *const pk_unpack_info = row_info.new_pk_unpack_info; - bool has_ttl = m_pk_descr->has_ttl(); - bool has_ttl_column = !m_pk_descr->m_ttl_column.empty(); - bool ttl_in_pk = has_ttl_column && (row_info.ttl_pk_offset != UINT_MAX); - - m_storage_record.length(0); - - if (has_ttl) { - /* If it's a TTL record, reserve space for 8 byte TTL value in front. */ - m_storage_record.fill(ROCKSDB_SIZEOF_TTL_RECORD + m_null_bytes_in_rec, 0); - m_ttl_bytes_updated = false; - - /* - If the TTL is contained within the key, we use the offset to find the - TTL value and place it in the beginning of the value record. - */ - if (ttl_in_pk) { - Rdb_string_reader reader(&pk_packed_slice); - const char *ts; - if (!reader.read(row_info.ttl_pk_offset) || - !(ts = reader.read(ROCKSDB_SIZEOF_TTL_RECORD))) { - std::string buf; - buf = rdb_hexdump(pk_packed_slice.data(), pk_packed_slice.size(), - RDB_MAX_HEXDUMP_LEN); - const GL_INDEX_ID gl_index_id = m_pk_descr->get_gl_index_id(); - // NO_LINT_DEBUG - sql_print_error("Decoding ttl from PK failed during insert, " - "for index (%u,%u), key: %s", - gl_index_id.cf_id, gl_index_id.index_id, buf.c_str()); - return HA_EXIT_FAILURE; - } - - char *const data = const_cast(m_storage_record.ptr()); - memcpy(data, ts, ROCKSDB_SIZEOF_TTL_RECORD); -#ifndef NDEBUG - // Adjust for test case if needed - rdb_netbuf_store_uint64( - reinterpret_cast(data), - rdb_netbuf_to_uint64(reinterpret_cast(data)) + - rdb_dbug_set_ttl_rec_ts()); -#endif - // Also store in m_ttl_bytes to propagate to update_sk - memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD); - } else if (!has_ttl_column) { - /* - For implicitly generated TTL records we need to copy over the old - TTL value from the old record in the event of an update. 
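
The visibility rule in should_hide_ttl_rec() above reduces to a single
comparison; restated as a sketch (assumption: ts is the record's stored
timestamp in seconds and curr_ts is the transaction's snapshot timestamp,
both as in the hunk):

  #include <cstdint>

  // A record is hidden once its lifetime has fully elapsed as of the
  // snapshot. read_filter_ts is zero outside debug builds; tests use it
  // to shift the filter time without waiting for real expiry.
  static bool should_hide(uint64_t ts, uint64_t ttl_duration,
                          uint64_t read_filter_ts, int64_t curr_ts) {
    return ts + ttl_duration + read_filter_ts <=
           static_cast<uint64_t>(curr_ts);
  }

The same hunk also makes expired-record skipping observable and
interruptible: each hidden record now bumps the examined-row count, and
rocksdb_skip_expired_records() returns HA_ERR_QUERY_INTERRUPTED for a
killed THD instead of silently scanning to the end of the expired range.
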
It was stored - in m_ttl_bytes. - - Otherwise, generate a timestamp using the current time. - */ - if (!row_info.old_pk_slice.empty()) { - char *const data = const_cast(m_storage_record.ptr()); - memcpy(data, m_ttl_bytes, sizeof(uint64)); - } else { - uint64 ts = static_cast(std::time(nullptr)); -#ifndef NDEBUG - ts += rdb_dbug_set_ttl_rec_ts(); -#endif - char *const data = const_cast(m_storage_record.ptr()); - rdb_netbuf_store_uint64(reinterpret_cast(data), ts); - // Also store in m_ttl_bytes to propagate to update_sk - memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD); - } - } - } else { - /* All NULL bits are initially 0 */ - m_storage_record.fill(m_null_bytes_in_rec, 0); - } - - // If a primary key may have non-empty unpack_info for certain values, - // (m_maybe_unpack_info=TRUE), we write the unpack_info block. The block - // itself was prepared in Rdb_key_def::pack_record. - if (m_maybe_unpack_info) { - m_storage_record.append(reinterpret_cast(pk_unpack_info->ptr()), - pk_unpack_info->get_current_pos()); - } - - for (uint i = 0; i < table->s->fields; i++) { - /* Don't pack decodable PK key parts */ - if (m_encoder_arr[i].m_storage_type != Rdb_field_encoder::STORE_ALL) { - continue; - } - - Field *const field = table->field[i]; - if (m_encoder_arr[i].maybe_null()) { - char *data = const_cast(m_storage_record.ptr()); - if (has_ttl) { - data += ROCKSDB_SIZEOF_TTL_RECORD; - } - - if (field->is_null()) { - data[m_encoder_arr[i].m_null_offset] |= m_encoder_arr[i].m_null_mask; - /* Don't write anything for NULL values */ - continue; - } - } - - if (m_encoder_arr[i].m_field_type == MYSQL_TYPE_BLOB) { - my_core::Field_blob *blob = (my_core::Field_blob *)field; - /* Get the number of bytes needed to store length*/ - const uint length_bytes = blob->pack_length() - portable_sizeof_char_ptr; - - /* Store the length of the value */ - m_storage_record.append(reinterpret_cast(blob->ptr), - length_bytes); - - /* Store the blob value itself */ - char *data_ptr; - memcpy(&data_ptr, blob->ptr + length_bytes, sizeof(uchar **)); - m_storage_record.append(data_ptr, blob->get_length()); - } else if (m_encoder_arr[i].m_field_type == MYSQL_TYPE_VARCHAR) { - Field_varstring *const field_var = (Field_varstring *)field; - uint data_len; - /* field_var->length_bytes is 1 or 2 */ - if (field_var->length_bytes == 1) { - data_len = field_var->ptr[0]; - } else { - DBUG_ASSERT(field_var->length_bytes == 2); - data_len = uint2korr(field_var->ptr); - } - m_storage_record.append(reinterpret_cast(field_var->ptr), - field_var->length_bytes + data_len); - } else { - /* Copy the field data */ - const uint len = field->pack_length_in_rec(); - m_storage_record.append(reinterpret_cast(field->ptr), len); - - /* - Check if this is the TTL field within the table, if so store the TTL - in the front of the record as well here. - */ - if (has_ttl && has_ttl_column && - i == m_pk_descr->get_ttl_field_offset()) { - DBUG_ASSERT(len == ROCKSDB_SIZEOF_TTL_RECORD); - DBUG_ASSERT(field->real_type() == MYSQL_TYPE_LONGLONG); - DBUG_ASSERT(m_pk_descr->get_ttl_field_offset() != UINT_MAX); - - char *const data = const_cast(m_storage_record.ptr()); - uint64 ts = uint8korr(field->ptr); -#ifndef NDEBUG - ts += rdb_dbug_set_ttl_rec_ts(); -#endif - rdb_netbuf_store_uint64(reinterpret_cast(data), ts); - - // If this is an update and the timestamp has been updated, take note - // so we can avoid updating SKs unnecessarily. 
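      // (Taken together with the copy below: m_ttl_bytes always holds the
      // packed TTL of the row being written, so update_sk can embed the
      // same timestamp into secondary keys, and m_ttl_bytes_updated
      // records whether an update actually changed the TTL, letting
      // otherwise-unchanged secondary keys be skipped.)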
- if (!row_info.old_pk_slice.empty()) { - m_ttl_bytes_updated = - memcmp(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD); - } - // Store timestamp in m_ttl_bytes to propagate to update_sk - memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD); + DEBUG_SYNC(thd, "rocksdb.check_flags_ser"); + if (thd && thd->killed) { + return HA_ERR_QUERY_INTERRUPTED; } + rocksdb_smart_next(seek_backward, iter); } } - - if (should_store_row_debug_checksums()) { - const uint32_t key_crc32 = my_core::crc32( - 0, rdb_slice_to_uchar_ptr(&pk_packed_slice), pk_packed_slice.size()); - const uint32_t val_crc32 = - my_core::crc32(0, rdb_mysql_str_to_uchar_str(&m_storage_record), - m_storage_record.length()); - uchar key_crc_buf[RDB_CHECKSUM_SIZE]; - uchar val_crc_buf[RDB_CHECKSUM_SIZE]; - rdb_netbuf_store_uint32(key_crc_buf, key_crc32); - rdb_netbuf_store_uint32(val_crc_buf, val_crc32); - m_storage_record.append((const char *)&RDB_CHECKSUM_DATA_TAG, 1); - m_storage_record.append((const char *)key_crc_buf, RDB_CHECKSUM_SIZE); - m_storage_record.append((const char *)val_crc_buf, RDB_CHECKSUM_SIZE); - } - - *packed_rec = - rocksdb::Slice(m_storage_record.ptr(), m_storage_record.length()); - return HA_EXIT_SUCCESS; } -/* - @brief - Setup which fields will be unpacked when reading rows - - @detail - Three special cases when we still unpack all fields: - - When this table is being updated (m_lock_rows==RDB_LOCK_WRITE). - - When @@rocksdb_verify_row_debug_checksums is ON (In this mode, we need to - read all fields to find whether there is a row checksum at the end. We could - skip the fields instead of decoding them, but currently we do decoding.) - - On index merge as bitmap is cleared during that operation - - @seealso - ha_rocksdb::setup_field_converters() - ha_rocksdb::convert_record_from_storage_format() -*/ -void ha_rocksdb::setup_read_decoders() { - m_decoders_vect.clear(); - m_key_requested = false; - - int last_useful = 0; - int skip_size = 0; - - for (uint i = 0; i < table->s->fields; i++) { - // bitmap is cleared on index merge, but it still needs to decode columns - const bool field_requested = - m_lock_rows == RDB_LOCK_WRITE || m_verify_row_debug_checksums || - bitmap_is_clear_all(table->read_set) || - bitmap_is_set(table->read_set, table->field[i]->field_index); - - // We only need the decoder if the whole record is stored. - if (m_encoder_arr[i].m_storage_type != Rdb_field_encoder::STORE_ALL) { - // the field potentially needs unpacking - if (field_requested) { - // the field is in the read set - m_key_requested = true; - } - continue; - } - - if (field_requested) { - // We will need to decode this field - m_decoders_vect.push_back({&m_encoder_arr[i], true, skip_size}); - last_useful = m_decoders_vect.size(); - skip_size = 0; - } else { - if (m_encoder_arr[i].uses_variable_len_encoding() || - m_encoder_arr[i].maybe_null()) { - // For variable-length field, we need to read the data and skip it - m_decoders_vect.push_back({&m_encoder_arr[i], false, skip_size}); - skip_size = 0; - } else { - // Fixed-width field can be skipped without looking at it. - // Add appropriate skip_size to the next field. - skip_size += m_encoder_arr[i].m_pack_length_in_rec; - } - } - } - - // It could be that the last few elements are varchars that just do - // skipping. Remove them. 
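
The packing and decoding bodies move wholesale into Rdb_converter, but the
on-disk primary-key value format they implement is unchanged; it is worth
keeping visible next to this diff, reconstructed from the removed code:

  // Value layout, in order:
  //   [ttl]          8 bytes, only when the key definition has TTL
  //   [null bitmap]  m_null_bytes_in_rec bytes, one bit per nullable field
  //   [unpack_info]  only when m_maybe_unpack_info: a tag byte, a 2-byte
  //                  total length, then the unpack data (as read back by
  //                  the decoder)
  //   then every field not fully covered by the key (STORE_ALL), in field
  //   order; NULL fields contribute no bytes at all:
  //     BLOB:    length prefix (pack_length - pointer size) + payload
  //     VARCHAR: length prefix (1 or 2 bytes) + payload
  //     other:   pack_length_in_rec bytes copied verbatim
  //   [checksums]    only when row debug checksums are enabled:
  //                  1-byte tag + 4-byte key CRC32 + 4-byte value CRC32
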
- m_decoders_vect.erase(m_decoders_vect.begin() + last_useful, - m_decoders_vect.end()); -} - -#ifndef NDEBUG +#ifndef DBUG_OFF void dbug_append_garbage_at_end(rocksdb::PinnableSlice *on_disk_rec) { std::string str(on_disk_rec->data(), on_disk_rec->size()); on_disk_rec->Reset(); @@ -5852,17 +6034,6 @@ void dbug_modify_rec_varchar12(rocksdb::PinnableSlice *on_disk_rec) { on_disk_rec->PinSelf(rocksdb::Slice(res)); } -void dbug_modify_key_varchar8(String &on_disk_rec) { - std::string res; - // The key starts with index number - res.append(on_disk_rec.ptr(), Rdb_key_def::INDEX_NUMBER_SIZE); - - // Then, a mem-comparable form of a varchar(8) value. - res.append("ABCDE\0\0\0\xFC", 9); - on_disk_rec.length(0); - on_disk_rec.append(res.data(), res.size()); -} - void dbug_create_err_inplace_alter() { my_printf_error(ER_UNKNOWN_ERROR, "Intentional failure in inplace alter occurred.", MYF(0)); @@ -5871,7 +6042,6 @@ void dbug_create_err_inplace_alter() { int ha_rocksdb::convert_record_from_storage_format( const rocksdb::Slice *const key, uchar *const buf) { - DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read1", dbug_append_garbage_at_end(&m_retrieved_record);); DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read2", @@ -5882,92 +6052,6 @@ int ha_rocksdb::convert_record_from_storage_format( return convert_record_from_storage_format(key, &m_retrieved_record, buf); } -int ha_rocksdb::convert_blob_from_storage_format( - my_core::Field_blob *const blob, - Rdb_string_reader *const reader, - bool decode) -{ - /* Get the number of bytes needed to store length*/ - const uint length_bytes = blob->pack_length() - portable_sizeof_char_ptr; - - const char *data_len_str; - if (!(data_len_str = reader->read(length_bytes))) { - return HA_ERR_ROCKSDB_CORRUPT_DATA; - } - - memcpy(blob->ptr, data_len_str, length_bytes); - - const uint32 data_len = blob->get_length( - reinterpret_cast(data_len_str), length_bytes, - table->s->db_low_byte_first); - const char *blob_ptr; - if (!(blob_ptr = reader->read(data_len))) { - return HA_ERR_ROCKSDB_CORRUPT_DATA; - } - - if (decode) { - // set 8-byte pointer to 0, like innodb does (relevant for 32-bit - // platforms) - memset(blob->ptr + length_bytes, 0, 8); - memcpy(blob->ptr + length_bytes, &blob_ptr, sizeof(uchar **)); - } - - return HA_EXIT_SUCCESS; -} - -int ha_rocksdb::convert_varchar_from_storage_format( - my_core::Field_varstring *const field_var, - Rdb_string_reader *const reader, - bool decode) -{ - const char *data_len_str; - if (!(data_len_str = reader->read(field_var->length_bytes))) - return HA_ERR_ROCKSDB_CORRUPT_DATA; - - uint data_len; - /* field_var->length_bytes is 1 or 2 */ - if (field_var->length_bytes == 1) { - data_len = (uchar)data_len_str[0]; - } else { - DBUG_ASSERT(field_var->length_bytes == 2); - data_len = uint2korr(data_len_str); - } - - if (data_len > field_var->field_length) { - /* The data on disk is longer than table DDL allows? 
*/ - return HA_ERR_ROCKSDB_CORRUPT_DATA; - } - - if (!reader->read(data_len)) { - return HA_ERR_ROCKSDB_CORRUPT_DATA; - } - - if (decode) { - memcpy(field_var->ptr, data_len_str, field_var->length_bytes + data_len); - } - - return HA_EXIT_SUCCESS; -} - -int ha_rocksdb::convert_field_from_storage_format( - my_core::Field *const field, - Rdb_string_reader *const reader, - bool decode, - uint len) -{ - const char *data_bytes; - if (len > 0) { - if ((data_bytes = reader->read(len)) == nullptr) { - return HA_ERR_ROCKSDB_CORRUPT_DATA; - } - - if (decode) - memcpy(field->ptr, data_bytes, len); - } - - return HA_EXIT_SUCCESS; -} - /* @brief Unpack the record in this->m_retrieved_record and this->m_last_rowkey from @@ -5984,8 +6068,8 @@ int ha_rocksdb::convert_field_from_storage_format( m_retrieved_record). @seealso - ha_rocksdb::setup_read_decoders() Sets up data structures which tell which - columns to decode. + rdb_converter::setup_read_decoders() Sets up data structures which tell + which columns to decode. @return 0 OK @@ -5995,241 +6079,7 @@ int ha_rocksdb::convert_field_from_storage_format( int ha_rocksdb::convert_record_from_storage_format( const rocksdb::Slice *const key, const rocksdb::Slice *const value, uchar *const buf) { - Rdb_string_reader reader(value); - - /* - Decode PK fields from the key - */ - DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_read1", - dbug_modify_key_varchar8(m_last_rowkey);); - - const rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(), - m_last_rowkey.length()); - const char *unpack_info = nullptr; - uint16 unpack_info_len = 0; - rocksdb::Slice unpack_slice; - - /* If it's a TTL record, skip the 8 byte TTL value */ - const char *ttl_bytes; - if (m_pk_descr->has_ttl()) { - if ((ttl_bytes = reader.read(ROCKSDB_SIZEOF_TTL_RECORD))) { - memcpy(m_ttl_bytes, ttl_bytes, ROCKSDB_SIZEOF_TTL_RECORD); - } else { - return HA_ERR_ROCKSDB_CORRUPT_DATA; - } - } - - /* Other fields are decoded from the value */ - const char *null_bytes = nullptr; - if (m_null_bytes_in_rec && !(null_bytes = reader.read(m_null_bytes_in_rec))) { - return HA_ERR_ROCKSDB_CORRUPT_DATA; - } - - if (m_maybe_unpack_info) { - unpack_info = reader.get_current_ptr(); - if (!unpack_info || !Rdb_key_def::is_unpack_data_tag(unpack_info[0]) || - !reader.read(Rdb_key_def::get_unpack_header_size(unpack_info[0]))) { - return HA_ERR_ROCKSDB_CORRUPT_DATA; - } - - unpack_info_len = - rdb_netbuf_to_uint16(reinterpret_cast(unpack_info + 1)); - unpack_slice = rocksdb::Slice(unpack_info, unpack_info_len); - - reader.read(unpack_info_len - - Rdb_key_def::get_unpack_header_size(unpack_info[0])); - } - - int err = HA_EXIT_SUCCESS; - if (m_key_requested) { - err = m_pk_descr->unpack_record(table, buf, &rowkey_slice, - unpack_info ? &unpack_slice : nullptr, - false /* verify_checksum */); - } - - if (err != HA_EXIT_SUCCESS) { - return err; - } - - for (auto it = m_decoders_vect.begin(); it != m_decoders_vect.end(); it++) { - const Rdb_field_encoder *const field_dec = it->m_field_enc; - const bool decode = it->m_decode; - const bool isNull = - field_dec->maybe_null() && - ((null_bytes[field_dec->m_null_offset] & field_dec->m_null_mask) != 0); - - Field *const field = table->field[field_dec->m_field_index]; - - /* Skip the bytes we need to skip */ - if (it->m_skip && !reader.read(it->m_skip)) { - return HA_ERR_ROCKSDB_CORRUPT_DATA; - } - - uint field_offset = field->ptr - table->record[0]; - uint null_offset = field->null_offset(); - bool maybe_null = field->real_maybe_null(); - field->move_field(buf + field_offset, - maybe_null ? 
buf + null_offset : nullptr, - field->null_bit); - // WARNING! - Don't return before restoring field->ptr and field->null_ptr! - - if (isNull) { - if (decode) { - /* This sets the NULL-bit of this record */ - field->set_null(); - /* - Besides that, set the field value to default value. CHECKSUM TABLE - depends on this. - */ - memcpy(field->ptr, table->s->default_values + field_offset, - field->pack_length()); - } - } else { - if (decode) { - field->set_notnull(); - } - - if (field_dec->m_field_type == MYSQL_TYPE_BLOB) { - err = convert_blob_from_storage_format( - (my_core::Field_blob *) field, &reader, decode); - } else if (field_dec->m_field_type == MYSQL_TYPE_VARCHAR) { - err = convert_varchar_from_storage_format( - (my_core::Field_varstring *) field, &reader, decode); - } else { - err = convert_field_from_storage_format( - field, &reader, decode, field_dec->m_pack_length_in_rec); - } - } - - // Restore field->ptr and field->null_ptr - field->move_field(table->record[0] + field_offset, - maybe_null ? table->record[0] + null_offset : nullptr, - field->null_bit); - - if (err != HA_EXIT_SUCCESS) { - return err; - } - } - - if (m_verify_row_debug_checksums) { - if (reader.remaining_bytes() == RDB_CHECKSUM_CHUNK_SIZE && - reader.read(1)[0] == RDB_CHECKSUM_DATA_TAG) { - uint32_t stored_key_chksum = - rdb_netbuf_to_uint32((const uchar *)reader.read(RDB_CHECKSUM_SIZE)); - uint32_t stored_val_chksum = - rdb_netbuf_to_uint32((const uchar *)reader.read(RDB_CHECKSUM_SIZE)); - - const uint32_t computed_key_chksum = - my_core::crc32(0, rdb_slice_to_uchar_ptr(key), key->size()); - const uint32_t computed_val_chksum = - my_core::crc32(0, rdb_slice_to_uchar_ptr(value), - value->size() - RDB_CHECKSUM_CHUNK_SIZE); - - DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_checksum1", - stored_key_chksum++;); - - if (stored_key_chksum != computed_key_chksum) { - m_pk_descr->report_checksum_mismatch(true, key->data(), key->size()); - return HA_ERR_ROCKSDB_CHECKSUM_MISMATCH; - } - - DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_checksum2", - stored_val_chksum++;); - if (stored_val_chksum != computed_val_chksum) { - m_pk_descr->report_checksum_mismatch(false, value->data(), - value->size()); - return HA_ERR_ROCKSDB_CHECKSUM_MISMATCH; - } - - m_row_checksums_checked++; - } - if (reader.remaining_bytes()) - return HA_ERR_ROCKSDB_CORRUPT_DATA; - } - - return HA_EXIT_SUCCESS; -} - -void ha_rocksdb::get_storage_type(Rdb_field_encoder *const encoder, - const uint &kp) { - // STORE_SOME uses unpack_info. - if (m_pk_descr->has_unpack_info(kp)) { - DBUG_ASSERT(m_pk_descr->can_unpack(kp)); - encoder->m_storage_type = Rdb_field_encoder::STORE_SOME; - m_maybe_unpack_info = true; - } else if (m_pk_descr->can_unpack(kp)) { - encoder->m_storage_type = Rdb_field_encoder::STORE_NONE; - } -} - -/* - Setup data needed to convert table->record[] to and from record storage - format. - - @seealso - ha_rocksdb::convert_record_to_storage_format, - ha_rocksdb::convert_record_from_storage_format -*/ - -void ha_rocksdb::setup_field_converters() { - uint i; - uint null_bytes = 0; - uchar cur_null_mask = 0x1; - - DBUG_ASSERT(m_encoder_arr == nullptr); - m_encoder_arr = static_cast( - my_malloc(table->s->fields * sizeof(Rdb_field_encoder), MYF(0))); - if (m_encoder_arr == nullptr) { - return; - } - - for (i = 0; i < table->s->fields; i++) { - Field *const field = table->field[i]; - m_encoder_arr[i].m_storage_type = Rdb_field_encoder::STORE_ALL; - - /* - Check if this field is - - a part of primary key, and - - it can be decoded back from its key image. 
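
The null-bitmap bookkeeping this removed function sets up just below
(cur_null_mask, m_null_offset, m_null_bytes_in_rec) packs one bit per
nullable field, eight to a byte, and moves to Rdb_converter with the rest
of the encoder setup. The walk is worth restating compactly, with
illustrative types:

  #include <cstdint>
  #include <vector>

  // For each nullable field, record (byte offset, bit mask) into the
  // null bitmap, growing the bitmap one byte at a time.
  struct NullBit {
    uint32_t offset;
    uint8_t mask;
  };

  static uint32_t assign_null_bits(const std::vector<bool> &nullable,
                                   std::vector<NullBit> *out) {
    uint32_t null_bytes = 0;
    uint8_t cur_mask = 0x1;
    for (bool is_nullable : nullable) {
      if (!is_nullable) continue;
      out->push_back({null_bytes, cur_mask});
      if (cur_mask == 0x80) {  // byte full: start the next one
        cur_mask = 0x1;
        ++null_bytes;
      } else {
        cur_mask <<= 1;
      }
    }
    if (cur_mask != 0x1) ++null_bytes;  // count the unfinished last byte
    return null_bytes;
  }
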
- If both hold, we don't need to store this field in the value part of - RocksDB's key-value pair. - - If hidden pk exists, we skip this check since the field will never be - part of the hidden pk. - */ - if (!has_hidden_pk(table)) { - KEY *const pk_info = &table->key_info[table->s->primary_key]; - for (uint kp = 0; kp < pk_info->user_defined_key_parts; kp++) { - /* key_part->fieldnr is counted from 1 */ - if (field->field_index + 1 == pk_info->key_part[kp].fieldnr) { - get_storage_type(&m_encoder_arr[i], kp); - break; - } - } - } - - m_encoder_arr[i].m_field_type = field->real_type(); - m_encoder_arr[i].m_field_index = i; - m_encoder_arr[i].m_pack_length_in_rec = field->pack_length_in_rec(); - - if (field->real_maybe_null()) { - m_encoder_arr[i].m_null_mask = cur_null_mask; - m_encoder_arr[i].m_null_offset = null_bytes; - if (cur_null_mask == 0x80) { - cur_null_mask = 0x1; - null_bytes++; - } else - cur_null_mask = cur_null_mask << 1; - } else { - m_encoder_arr[i].m_null_mask = 0; - } - } - - /* Count the last, unfinished NULL-bits byte */ - if (cur_null_mask != 0x1) - null_bytes++; - - m_null_bytes_in_rec = null_bytes; + return m_converter->decode(m_pk_descr, buf, key, value); } int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg, @@ -6266,8 +6116,8 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg, /* Sometimes, we may use m_sk_packed_tuple for storing packed PK */ max_packed_sk_len = pack_key_len; for (uint i = 0; i < table_arg->s->keys; i++) { - if (i == table_arg->s->primary_key) /* Primary key was processed above */ - continue; + /* Primary key was processed above */ + if (i == table_arg->s->primary_key) continue; // TODO: move this into get_table_handler() ?? kd_arr[i]->setup(table_arg, tbl_def_arg); @@ -6355,24 +6205,9 @@ void ha_rocksdb::free_key_buffers() { m_scan_it_upper_bound = nullptr; } -void ha_rocksdb::set_use_read_free_rpl(const char *const whitelist) { - DBUG_ASSERT(whitelist != nullptr); - -#if defined(HAVE_PSI_INTERFACE) - Regex_list_handler regex_handler(key_rwlock_read_free_rpl_tables); -#else - Regex_list_handler regex_handler; -#endif - - if (!regex_handler.set_patterns(whitelist)) { - warn_about_bad_patterns(®ex_handler, "read_free_rpl_tables"); - } - - m_use_read_free_rpl = regex_handler.matches(m_tbl_def->base_tablename()); -} - void ha_rocksdb::set_skip_unique_check_tables(const char *const whitelist) { - DBUG_ASSERT(whitelist != nullptr); + const char *const wl = + whitelist ? whitelist : DEFAULT_SKIP_UNIQUE_CHECK_TABLES; #if defined(HAVE_PSI_INTERFACE) Regex_list_handler regex_handler(key_rwlock_skip_unique_check_tables); @@ -6380,7 +6215,7 @@ void ha_rocksdb::set_skip_unique_check_tables(const char *const whitelist) { Regex_list_handler regex_handler; #endif - if (!regex_handler.set_patterns(whitelist)) { + if (!regex_handler.set_patterns(wl)) { warn_about_bad_patterns(®ex_handler, "skip_unique_check_tables"); } @@ -6427,8 +6262,8 @@ int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked) { "dictionary"); DBUG_RETURN(HA_ERR_ROCKSDB_INVALID_TABLE); } - m_lock_rows = RDB_LOCK_NONE; + m_lock_rows = RDB_LOCK_NONE; m_key_descr_arr = m_tbl_def->m_key_descr_arr; /* @@ -6457,7 +6292,15 @@ int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked) { */ init_with_fields(); - setup_field_converters(); + /* Initialize decoder */ + m_converter = std::make_shared(ha_thd(), m_tbl_def, table); + + /* + Update m_ttl_bytes address to same as Rdb_converter's m_ttl_bytes. 
+ Remove this code after moving convert_record_to_storage_format() into + Rdb_converter class. + */ + m_ttl_bytes = m_converter->get_ttl_bytes_buffer(); info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST); @@ -6465,11 +6308,10 @@ int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked) { The following load_XXX code calls row decode functions, and they do that without having done ::external_lock() or index_init()/rnd_init(). (Note: this also means we're doing a read when there was no - setup_field_converters() call) + rdb_converter::setup_field_encoders() call) Initialize the necessary variables for them: */ - m_verify_row_debug_checksums = false; /* Load auto_increment value only once on first use. */ if (table->found_next_number_field && m_tbl_def->m_auto_incr_val == 0) { @@ -6486,9 +6328,6 @@ int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked) { /* Index block size in MyRocks: used by MySQL in query optimization */ stats.block_size = rocksdb_tbl_options->block_size; - /* Determine at open whether we can use Read Free Replication or not */ - set_use_read_free_rpl(THDVAR(ha_thd(), read_free_rpl_tables)); - /* Determine at open whether we should skip unique checks for this table */ set_skip_unique_check_tables(THDVAR(ha_thd(), skip_unique_check_tables)); @@ -6500,19 +6339,15 @@ int ha_rocksdb::close(void) { m_pk_descr = nullptr; m_key_descr_arr = nullptr; - + m_converter = nullptr; free_key_buffers(); - my_free(m_encoder_arr); - m_encoder_arr = nullptr; - if (m_table_handler != nullptr) { rdb_open_tables.release_table_handler(m_table_handler); m_table_handler = nullptr; } // These are needed to suppress valgrind errors in rocksdb.partition - m_storage_record.free(); m_last_rowkey.free(); m_sk_tails.free(); m_sk_tails_old.free(); @@ -6597,61 +6432,72 @@ int ha_rocksdb::rdb_error_to_mysql(const rocksdb::Status &s, int err; switch (s.code()) { - case rocksdb::Status::Code::kOk: - err = HA_EXIT_SUCCESS; - break; - case rocksdb::Status::Code::kNotFound: - err = HA_ERR_ROCKSDB_STATUS_NOT_FOUND; - break; - case rocksdb::Status::Code::kCorruption: - err = HA_ERR_ROCKSDB_STATUS_CORRUPTION; - break; - case rocksdb::Status::Code::kNotSupported: - err = HA_ERR_ROCKSDB_STATUS_NOT_SUPPORTED; - break; - case rocksdb::Status::Code::kInvalidArgument: - err = HA_ERR_ROCKSDB_STATUS_INVALID_ARGUMENT; - break; - case rocksdb::Status::Code::kIOError: - err = (s.IsNoSpace()) ? HA_ERR_ROCKSDB_STATUS_NO_SPACE - : HA_ERR_ROCKSDB_STATUS_IO_ERROR; - break; - case rocksdb::Status::Code::kMergeInProgress: - err = HA_ERR_ROCKSDB_STATUS_MERGE_IN_PROGRESS; - break; - case rocksdb::Status::Code::kIncomplete: - err = HA_ERR_ROCKSDB_STATUS_INCOMPLETE; - break; - case rocksdb::Status::Code::kShutdownInProgress: - err = HA_ERR_ROCKSDB_STATUS_SHUTDOWN_IN_PROGRESS; - break; - case rocksdb::Status::Code::kTimedOut: - err = HA_ERR_ROCKSDB_STATUS_TIMED_OUT; - break; - case rocksdb::Status::Code::kAborted: - err = (s.IsLockLimit()) ? HA_ERR_ROCKSDB_STATUS_LOCK_LIMIT - : HA_ERR_ROCKSDB_STATUS_ABORTED; - break; - case rocksdb::Status::Code::kBusy: - err = (s.IsDeadlock()) ? 
HA_ERR_ROCKSDB_STATUS_DEADLOCK - : HA_ERR_ROCKSDB_STATUS_BUSY; - break; - case rocksdb::Status::Code::kExpired: - err = HA_ERR_ROCKSDB_STATUS_EXPIRED; - break; - case rocksdb::Status::Code::kTryAgain: - err = HA_ERR_ROCKSDB_STATUS_TRY_AGAIN; - break; - default: - DBUG_ASSERT(0); - return -1; + case rocksdb::Status::Code::kOk: + err = HA_EXIT_SUCCESS; + break; + case rocksdb::Status::Code::kNotFound: + err = HA_ERR_ROCKSDB_STATUS_NOT_FOUND; + break; + case rocksdb::Status::Code::kCorruption: + err = HA_ERR_ROCKSDB_STATUS_CORRUPTION; + break; + case rocksdb::Status::Code::kNotSupported: + err = HA_ERR_ROCKSDB_STATUS_NOT_SUPPORTED; + break; + case rocksdb::Status::Code::kInvalidArgument: + err = HA_ERR_ROCKSDB_STATUS_INVALID_ARGUMENT; + break; + case rocksdb::Status::Code::kIOError: + err = (s.IsNoSpace()) ? HA_ERR_ROCKSDB_STATUS_NO_SPACE + : HA_ERR_ROCKSDB_STATUS_IO_ERROR; + break; + case rocksdb::Status::Code::kMergeInProgress: + err = HA_ERR_ROCKSDB_STATUS_MERGE_IN_PROGRESS; + break; + case rocksdb::Status::Code::kIncomplete: + err = HA_ERR_ROCKSDB_STATUS_INCOMPLETE; + break; + case rocksdb::Status::Code::kShutdownInProgress: + err = HA_ERR_ROCKSDB_STATUS_SHUTDOWN_IN_PROGRESS; + break; + case rocksdb::Status::Code::kTimedOut: + err = HA_ERR_ROCKSDB_STATUS_TIMED_OUT; + break; + case rocksdb::Status::Code::kAborted: + err = (s.IsLockLimit()) ? HA_ERR_ROCKSDB_STATUS_LOCK_LIMIT + : HA_ERR_ROCKSDB_STATUS_ABORTED; + break; + case rocksdb::Status::Code::kBusy: + err = (s.IsDeadlock()) ? HA_ERR_ROCKSDB_STATUS_DEADLOCK + : HA_ERR_ROCKSDB_STATUS_BUSY; + break; + case rocksdb::Status::Code::kExpired: + err = HA_ERR_ROCKSDB_STATUS_EXPIRED; + break; + case rocksdb::Status::Code::kTryAgain: + err = HA_ERR_ROCKSDB_STATUS_TRY_AGAIN; + break; + default: + DBUG_ASSERT(0); + return -1; + } + + std::string errMsg; + if (s.IsLockLimit()) { + errMsg = + "Operation aborted: Failed to acquire lock due to " + "rocksdb_max_row_locks limit"; + } else { + errMsg = s.ToString(); } if (opt_msg) { - std::string concatenated_error = s.ToString() + " (" + std::string(opt_msg) + ")"; - my_error(ER_GET_ERRMSG, MYF(0), s.code(), concatenated_error.c_str(), rocksdb_hton_name); + std::string concatenated_error = errMsg + " (" + std::string(opt_msg) + ")"; + my_error(ER_GET_ERRMSG, MYF(0), s.code(), concatenated_error.c_str(), + rocksdb_hton_name); } else { - my_error(ER_GET_ERRMSG, MYF(0), s.code(), s.ToString().c_str(), rocksdb_hton_name); + my_error(ER_GET_ERRMSG, MYF(0), s.code(), errMsg.c_str(), + rocksdb_hton_name); } return err; @@ -6661,8 +6507,8 @@ int ha_rocksdb::rdb_error_to_mysql(const rocksdb::Status &s, static const std::set RDB_INDEX_COLLATIONS = { &my_charset_bin, &my_charset_utf8_bin, &my_charset_latin1_bin}; -static bool -rdb_is_index_collation_supported(const my_core::Field *const field) { +static bool rdb_is_index_collation_supported( + const my_core::Field *const field) { const my_core::enum_field_types type = field->real_type(); /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */ if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING || @@ -6698,8 +6544,6 @@ int ha_rocksdb::create_key_defs( DBUG_ASSERT(table_arg->s != nullptr); - uint i; - /* These need to be one greater than MAX_INDEXES since the user can create MAX_INDEXES secondary keys and no primary key which would cause us @@ -6716,6 +6560,36 @@ int ha_rocksdb::create_key_defs( DBUG_RETURN(HA_EXIT_FAILURE); } + uint64 ttl_duration = 0; + std::string ttl_column; + uint ttl_field_offset; + + uint err; + if ((err = 
Rdb_key_def::extract_ttl_duration(table_arg, tbl_def_arg, + &ttl_duration))) { + DBUG_RETURN(err); + } + + if ((err = Rdb_key_def::extract_ttl_col(table_arg, tbl_def_arg, &ttl_column, + &ttl_field_offset))) { + DBUG_RETURN(err); + } + + /* We don't currently support TTL on tables with hidden primary keys. */ + if (ttl_duration > 0 && has_hidden_pk(table_arg)) { + my_error(ER_RDB_TTL_UNSUPPORTED, MYF(0)); + DBUG_RETURN(HA_EXIT_FAILURE); + } + + /* + If TTL duration is not specified but TTL column was specified, throw an + error because TTL column requires duration. + */ + if (ttl_duration == 0 && !ttl_column.empty()) { + my_error(ER_RDB_TTL_COL_FORMAT, MYF(0), ttl_column.c_str()); + DBUG_RETURN(HA_EXIT_FAILURE); + } + if (!old_tbl_def_arg) { /* old_tbl_def doesn't exist. this means we are in the process of creating @@ -6724,9 +6598,9 @@ int ha_rocksdb::create_key_defs( Get the index numbers (this will update the next_index_number) and create Rdb_key_def structures. */ - for (i = 0; i < tbl_def_arg->m_key_count; i++) { - if (create_key_def(table_arg, i, tbl_def_arg, &m_key_descr_arr[i], - cfs[i])) { + for (uint i = 0; i < tbl_def_arg->m_key_count; i++) { + if (create_key_def(table_arg, i, tbl_def_arg, &m_key_descr_arr[i], cfs[i], + ttl_duration, ttl_column)) { DBUG_RETURN(HA_EXIT_FAILURE); } } @@ -6737,7 +6611,8 @@ int ha_rocksdb::create_key_defs( generate the necessary new key definitions if any. */ if (create_inplace_key_defs(table_arg, tbl_def_arg, old_table_arg, - old_tbl_def_arg, cfs)) { + old_tbl_def_arg, cfs, ttl_duration, + ttl_column)) { DBUG_RETURN(HA_EXIT_FAILURE); } } @@ -6822,8 +6697,8 @@ int ha_rocksdb::create_cfs( // Generate the name for the column family to use. bool per_part_match_found = false; - std::string cf_name = generate_cf_name(i, table_arg, tbl_def_arg, - &per_part_match_found); + std::string cf_name = + generate_cf_name(i, table_arg, tbl_def_arg, &per_part_match_found); // Prevent create from using the system column family. if (cf_name == DEFAULT_SYSTEM_CF_NAME) { @@ -6868,7 +6743,8 @@ int ha_rocksdb::create_cfs( int ha_rocksdb::create_inplace_key_defs( const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg, const TABLE *const old_table_arg, const Rdb_tbl_def *const old_tbl_def_arg, - const std::array &cfs) const { + const std::array &cfs, + uint64 ttl_duration, const std::string &ttl_column) const { DBUG_ENTER_FUNC(); std::shared_ptr *const old_key_descr = @@ -6894,10 +6770,11 @@ int ha_rocksdb::create_inplace_key_defs( struct Rdb_index_info index_info; if (!dict_manager.get_index_info(gl_index_id, &index_info)) { // NO_LINT_DEBUG - sql_print_error("RocksDB: Could not get index information " - "for Index Number (%u,%u), table %s", - gl_index_id.cf_id, gl_index_id.index_id, - old_tbl_def_arg->full_tablename().c_str()); + sql_print_error( + "RocksDB: Could not get index information " + "for Index Number (%u,%u), table %s", + gl_index_id.cf_id, gl_index_id.index_id, + old_tbl_def_arg->full_tablename().c_str()); DBUG_RETURN(HA_EXIT_FAILURE); } @@ -6921,7 +6798,7 @@ int ha_rocksdb::create_inplace_key_defs( dict_manager.get_stats(gl_index_id), index_info.m_index_flags, ttl_rec_offset, index_info.m_ttl_duration); } else if (create_key_def(table_arg, i, tbl_def_arg, &new_key_descr[i], - cfs[i])) { + cfs[i], ttl_duration, ttl_column)) { DBUG_RETURN(HA_EXIT_FAILURE); } @@ -7071,44 +6948,16 @@ int ha_rocksdb::compare_key_parts(const KEY *const old_key, 0 - Ok other - error, either given table ddl is not supported by rocksdb or OOM. 
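
  [Editor's note, not part of the original change] The TTL checks hoisted
  into create_key_defs() earlier in this hunk reduce to two rules: a TTL
  duration is rejected on tables with a hidden PK, and a TTL column without
  a duration is an error. A stand-alone sketch of that predicate, where
  validate_ttl_spec() is a hypothetical name (the patch itself calls
  Rdb_key_def::extract_ttl_duration()/extract_ttl_col() and reports failures
  via my_error()):

    static bool validate_ttl_spec(uint64 ttl_duration,
                                  const std::string &ttl_column,
                                  bool has_hidden_pk) {
      // ER_RDB_TTL_UNSUPPORTED: no TTL on tables with hidden primary keys
      if (ttl_duration > 0 && has_hidden_pk) return false;
      // ER_RDB_TTL_COL_FORMAT: a TTL column requires a TTL duration
      if (ttl_duration == 0 && !ttl_column.empty()) return false;
      return true;
    }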
*/ -int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint &i, +int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint i, const Rdb_tbl_def *const tbl_def_arg, std::shared_ptr *const new_key_def, - const struct key_def_cf_info &cf_info) const { + const struct key_def_cf_info &cf_info, + uint64 ttl_duration, + const std::string &ttl_column) const { DBUG_ENTER_FUNC(); DBUG_ASSERT(*new_key_def == nullptr); - uint64 ttl_duration = 0; - std::string ttl_column; - uint ttl_field_offset; - - uint err; - if ((err = Rdb_key_def::extract_ttl_duration(table_arg, tbl_def_arg, - &ttl_duration))) { - DBUG_RETURN(err); - } - - if ((err = Rdb_key_def::extract_ttl_col(table_arg, tbl_def_arg, &ttl_column, - &ttl_field_offset))) { - DBUG_RETURN(err); - } - - /* We don't currently support TTL on tables with hidden primary keys. */ - if (ttl_duration > 0 && is_hidden_pk(i, table_arg, tbl_def_arg)) { - my_error(ER_RDB_TTL_UNSUPPORTED, MYF(0)); - DBUG_RETURN(HA_EXIT_FAILURE); - } - - /* - If TTL duration is not specified but TTL column was specified, throw an - error because TTL column requires duration. - */ - if (ttl_duration == 0 && !ttl_column.empty()) { - my_error(ER_RDB_TTL_COL_FORMAT, MYF(0), ttl_column.c_str()); - DBUG_RETURN(HA_EXIT_FAILURE); - } - const uint index_id = ddl_manager.get_and_update_next_number(&dict_manager); const uint16_t index_dict_version = Rdb_key_def::INDEX_INFO_VERSION_LATEST; uchar index_type; @@ -7157,20 +7006,21 @@ int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint &i, if (!ttl_column.empty()) { (*new_key_def)->m_ttl_column = ttl_column; } - + // initialize key_def + (*new_key_def)->setup(table_arg, tbl_def_arg); DBUG_RETURN(HA_EXIT_SUCCESS); } int rdb_normalize_tablename(const std::string &tablename, std::string *const strbuf) { if (tablename.size() < 2 || tablename[0] != '.' || tablename[1] != '/') { - DBUG_ASSERT(0); // We were not passed table name? + DBUG_ASSERT(0); // We were not passed table name? return HA_ERR_ROCKSDB_INVALID_TABLE; } size_t pos = tablename.find_first_of('/', 2); if (pos == std::string::npos) { - DBUG_ASSERT(0); // We were not passed table name? + DBUG_ASSERT(0); // We were not passed table name? 
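    // [Editor's note, illustration only] Inputs here normally look like
    // "./db/table", or "./db/table#P#p0" for a single partition;
    // rdb_split_normalized_tablename() below then splits the normalized
    // name on RDB_PARTITION_STR, e.g. "test.t14#P#p0" -> db "test",
    // table "t14", partition "p0".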
return HA_ERR_ROCKSDB_INVALID_TABLE; } @@ -7262,31 +7112,122 @@ int rdb_split_normalized_tablename(const std::string &fullname, *db = fullname.substr(0, dotpos); } - dotpos++; - - const size_t partpos = - fullname.find(RDB_PARTITION_STR, dotpos, strlen(RDB_PARTITION_STR)); + dotpos++; + + const size_t partpos = + fullname.find(RDB_PARTITION_STR, dotpos, strlen(RDB_PARTITION_STR)); + + if (partpos != std::string::npos) { + DBUG_ASSERT(partpos >= dotpos); + + if (table != nullptr) { + *table = fullname.substr(dotpos, partpos - dotpos); + } + + if (partition != nullptr) { + *partition = fullname.substr(partpos + strlen(RDB_PARTITION_STR)); + } + } else if (table != nullptr) { + *table = fullname.substr(dotpos); + } + + return HA_EXIT_SUCCESS; +} + +/* + Create a table's Rdb_tbl_def and its Rdb_key_defs and store table information + into MyRocks Data Dictionary + The method is called during create table/partition, truncate table/partition + + @param table_name IN table's name formated as + 'dbname.tablename' + @param table_arg IN sql table + @param auto_increment_value IN specified table's auto increment value + + @return + HA_EXIT_SUCCESS OK + other HA_ERR error code (can be SE-specific) +*/ +int ha_rocksdb::create_table(const std::string &table_name, + const TABLE *table_arg, + ulonglong auto_increment_value) { + DBUG_ENTER_FUNC(); + + int err; + + const std::unique_ptr wb = dict_manager.begin(); + rocksdb::WriteBatch *const batch = wb.get(); + + /* Create table/key descriptions and put them into the data dictionary */ + m_tbl_def = new Rdb_tbl_def(table_name); + + uint n_keys = table_arg->s->keys; + + /* + If no primary key found, create a hidden PK and place it inside table + definition + */ + if (has_hidden_pk(table_arg)) { + n_keys += 1; + // reset hidden pk id + // the starting valid value for hidden pk is 1 + m_tbl_def->m_hidden_pk_val = 1; + } + + m_key_descr_arr = new std::shared_ptr[n_keys]; + m_tbl_def->m_key_count = n_keys; + m_tbl_def->m_key_descr_arr = m_key_descr_arr; + + err = create_key_defs(table_arg, m_tbl_def); + if (err != HA_EXIT_SUCCESS) { + goto error; + } + + m_pk_descr = m_key_descr_arr[pk_index(table_arg, m_tbl_def)]; + + if (auto_increment_value) { + bool autoinc_upgrade_test = false; + m_tbl_def->m_auto_incr_val = auto_increment_value; + DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", autoinc_upgrade_test = true;); + if (!autoinc_upgrade_test) { + auto s = dict_manager.put_auto_incr_val( + batch, m_tbl_def->get_autoincr_gl_index_id(), + m_tbl_def->m_auto_incr_val); + if (!s.ok()) { + goto error; + } + } + } + + dict_manager.lock(); + err = ddl_manager.put_and_write(m_tbl_def, batch); + if (err != HA_EXIT_SUCCESS) { + dict_manager.unlock(); + goto error; + } + + err = dict_manager.commit(batch); + if (err != HA_EXIT_SUCCESS) { + dict_manager.unlock(); + goto error; + } - if (partpos != std::string::npos) { - DBUG_ASSERT(partpos >= dotpos); + dict_manager.unlock(); - if (table != nullptr) { - *table = fullname.substr(dotpos, partpos - dotpos); - } + DBUG_RETURN(HA_EXIT_SUCCESS); - if (partition != nullptr) { - *partition = fullname.substr(partpos + strlen(RDB_PARTITION_STR)); - } - } else if (table != nullptr) { - *table = fullname.substr(dotpos); - } +error: + /* Delete what we have allocated so far */ + delete m_tbl_def; + m_tbl_def = nullptr; + m_key_descr_arr = nullptr; - return HA_EXIT_SUCCESS; + DBUG_RETURN(err); } /** @brief - create() is called to create a database. The variable name will have the name + create() is called to create a table. 
The variable name will have the name of the table. @details @@ -7327,108 +7268,41 @@ int ha_rocksdb::create(const char *const name, TABLE *const table_arg, DBUG_RETURN(HA_ERR_ROCKSDB_TABLE_INDEX_DIRECTORY_NOT_SUPPORTED); } - std::string str; int err; - - THD *const thd = my_core::thd_get_current_thd(); - - if (get_table_if_exists(name)) { - if (thd->lex->sql_command == SQLCOM_TRUNCATE) { - err = delete_table(name); - if (err != HA_EXIT_SUCCESS) { - DBUG_RETURN(err); - } - } else { - err = rdb_normalize_tablename(name, &str); - if (err != HA_EXIT_SUCCESS) { - DBUG_RETURN(err); - } - my_error(ER_METADATA_INCONSISTENCY, MYF(0), str.c_str(), name); - DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA); - } - } - /* Construct dbname.tablename ourselves, because parititioning passes strings like "./test/t14#P#p0" for individual partitions, while table_arg->s->table_name has none of that. */ + std::string str; err = rdb_normalize_tablename(name, &str); if (err != HA_EXIT_SUCCESS) { DBUG_RETURN(err); } + // FOREIGN KEY isn't supported yet + THD *const thd = my_core::thd_get_current_thd(); if (contains_foreign_key(thd)) { my_error(ER_NOT_SUPPORTED_YET, MYF(0), "FOREIGN KEY for the RocksDB storage engine"); DBUG_RETURN(HA_ERR_UNSUPPORTED); } - const std::unique_ptr wb = dict_manager.begin(); - rocksdb::WriteBatch *const batch = wb.get(); - - /* Create table/key descriptions and put them into the data dictionary */ - m_tbl_def = new Rdb_tbl_def(str); - - uint n_keys = table_arg->s->keys; - - /* - If no primary key found, create a hidden PK and place it inside table - definition - */ - if (has_hidden_pk(table_arg)) { - n_keys += 1; - } - - m_key_descr_arr = new std::shared_ptr[n_keys]; - m_tbl_def->m_key_count = n_keys; - m_tbl_def->m_key_descr_arr = m_key_descr_arr; - - err = create_key_defs(table_arg, m_tbl_def); - if (err != HA_EXIT_SUCCESS) { - goto error; - } - - m_pk_descr = m_key_descr_arr[pk_index(table_arg, m_tbl_def)]; - - if (create_info->auto_increment_value) { - bool autoinc_upgrade_test = false; - m_tbl_def->m_auto_incr_val = create_info->auto_increment_value; - DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", autoinc_upgrade_test = true;); - if (!autoinc_upgrade_test) { - auto s = dict_manager.put_auto_incr_val( - batch, m_tbl_def->get_autoincr_gl_index_id(), - m_tbl_def->m_auto_incr_val); - if (!s.ok()) { - goto error; + // Check whether Data Dictionary contain information + Rdb_tbl_def *tbl = ddl_manager.find(str); + if (tbl != nullptr) { + if (thd->lex->sql_command == SQLCOM_TRUNCATE) { + err = delete_table(tbl); + if (err != HA_EXIT_SUCCESS) { + DBUG_RETURN(err); } + } else { + my_error(ER_METADATA_INCONSISTENCY, MYF(0), str.c_str(), name); + DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA); } } - dict_manager.lock(); - err = ddl_manager.put_and_write(m_tbl_def, batch); - if (err != HA_EXIT_SUCCESS) { - dict_manager.unlock(); - goto error; - } - - err = dict_manager.commit(batch); - if (err != HA_EXIT_SUCCESS) { - dict_manager.unlock(); - goto error; - } - - dict_manager.unlock(); - - DBUG_RETURN(HA_EXIT_SUCCESS); - -error: - /* Delete what we have allocated so far */ - delete m_tbl_def; - m_tbl_def = nullptr; - m_key_descr_arr = nullptr; - - DBUG_RETURN(err); + DBUG_RETURN(create_table(str, table_arg, create_info->auto_increment_value)); } /** @@ -7474,9 +7348,10 @@ bool ha_rocksdb::check_keyread_allowed(uint inx, uint part, int ha_rocksdb::read_key_exact(const Rdb_key_def &kd, rocksdb::Iterator *const iter, - const bool &full_key_match, + const bool /* unused */, const rocksdb::Slice &key_slice, const 
int64_t ttl_filter_ts) { + THD *thd = ha_thd(); /* We are looking for the first record such that index_tuple= lookup_tuple. @@ -7485,6 +7360,9 @@ int ha_rocksdb::read_key_exact(const Rdb_key_def &kd, rocksdb_smart_seek(kd.m_is_reverse_cf, iter, key_slice); while (iter->Valid() && kd.value_matches_prefix(iter->key(), key_slice)) { + if (thd && thd->killed) { + return HA_ERR_QUERY_INTERRUPTED; + } /* If TTL is enabled we need to check if the given key has already expired from the POV of the current transaction. If it has, try going to the next @@ -7506,9 +7384,10 @@ int ha_rocksdb::read_key_exact(const Rdb_key_def &kd, } int ha_rocksdb::read_before_key(const Rdb_key_def &kd, - const bool &full_key_match, + const bool full_key_match, const rocksdb::Slice &key_slice, const int64_t ttl_filter_ts) { + THD *thd = ha_thd(); /* We are looking for record with the biggest t.key such that t.key < lookup_tuple. @@ -7516,6 +7395,9 @@ int ha_rocksdb::read_before_key(const Rdb_key_def &kd, rocksdb_smart_seek(!kd.m_is_reverse_cf, m_scan_it, key_slice); while (is_valid(m_scan_it)) { + if (thd && thd->killed) { + return HA_ERR_QUERY_INTERRUPTED; + } /* We are using full key and we've hit an exact match, or... @@ -7540,6 +7422,7 @@ int ha_rocksdb::read_before_key(const Rdb_key_def &kd, int ha_rocksdb::read_after_key(const Rdb_key_def &kd, const rocksdb::Slice &key_slice, const int64_t ttl_filter_ts) { + THD *thd = ha_thd(); /* We are looking for the first record such that @@ -7557,6 +7440,9 @@ int ha_rocksdb::read_after_key(const Rdb_key_def &kd, */ while (is_valid(m_scan_it) && kd.has_ttl() && should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts)) { + if (thd && thd->killed) { + return HA_ERR_QUERY_INTERRUPTED; + } rocksdb_smart_next(kd.m_is_reverse_cf, m_scan_it); } @@ -7565,7 +7451,7 @@ int ha_rocksdb::read_after_key(const Rdb_key_def &kd, int ha_rocksdb::position_to_correct_key( const Rdb_key_def &kd, const enum ha_rkey_function &find_flag, - const bool &full_key_match, const uchar *const key, + const bool full_key_match, const uchar *const key, const key_part_map &keypart_map, const rocksdb::Slice &key_slice, bool *const move_forward, const int64_t ttl_filter_ts) { int rc = 0; @@ -7573,65 +7459,66 @@ int ha_rocksdb::position_to_correct_key( *move_forward = true; switch (find_flag) { - case HA_READ_KEY_EXACT: - rc = - read_key_exact(kd, m_scan_it, full_key_match, key_slice, ttl_filter_ts); - break; - case HA_READ_BEFORE_KEY: - *move_forward = false; - rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts); - if (rc == 0 && !kd.covers_key(m_scan_it->key())) { - /* The record we've got is not from this index */ - rc = HA_ERR_KEY_NOT_FOUND; - } - break; - case HA_READ_AFTER_KEY: - case HA_READ_KEY_OR_NEXT: - rc = read_after_key(kd, key_slice, ttl_filter_ts); - if (rc == 0 && !kd.covers_key(m_scan_it->key())) { - /* The record we've got is not from this index */ - rc = HA_ERR_KEY_NOT_FOUND; - } - break; - case HA_READ_KEY_OR_PREV: - case HA_READ_PREFIX: - /* This flag is not used by the SQL layer, so we don't support it yet. */ - rc = HA_ERR_UNSUPPORTED; - break; - case HA_READ_PREFIX_LAST: - case HA_READ_PREFIX_LAST_OR_PREV: - *move_forward = false; - /* - Find the last record with the specified index prefix lookup. - - HA_READ_PREFIX_LAST requires that the record has the - prefix=lookup (if there are no such records, - HA_ERR_KEY_NOT_FOUND should be returned). - - HA_READ_PREFIX_LAST_OR_PREV has no such requirement. 
If there are no - records with prefix=lookup, we should return the last record - before that. - */ - rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts); - if (rc == 0) { - const rocksdb::Slice &rkey = m_scan_it->key(); - if (!kd.covers_key(rkey)) { + case HA_READ_KEY_EXACT: + rc = read_key_exact(kd, m_scan_it, full_key_match, key_slice, + ttl_filter_ts); + break; + case HA_READ_BEFORE_KEY: + *move_forward = false; + rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts); + if (rc == 0 && !kd.covers_key(m_scan_it->key())) { + /* The record we've got is not from this index */ + rc = HA_ERR_KEY_NOT_FOUND; + } + break; + case HA_READ_AFTER_KEY: + case HA_READ_KEY_OR_NEXT: + rc = read_after_key(kd, key_slice, ttl_filter_ts); + if (rc == 0 && !kd.covers_key(m_scan_it->key())) { /* The record we've got is not from this index */ rc = HA_ERR_KEY_NOT_FOUND; - } else if (find_flag == HA_READ_PREFIX_LAST) { - uint size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple, - key, keypart_map); - rocksdb::Slice lookup_tuple(reinterpret_cast(m_sk_packed_tuple), - size); - - // We need to compare the key we've got with the original search prefix. - if (!kd.value_matches_prefix(rkey, lookup_tuple)) { + } + break; + case HA_READ_KEY_OR_PREV: + case HA_READ_PREFIX: + /* This flag is not used by the SQL layer, so we don't support it yet. */ + rc = HA_ERR_UNSUPPORTED; + break; + case HA_READ_PREFIX_LAST: + case HA_READ_PREFIX_LAST_OR_PREV: + *move_forward = false; + /* + Find the last record with the specified index prefix lookup. + - HA_READ_PREFIX_LAST requires that the record has the + prefix=lookup (if there are no such records, + HA_ERR_KEY_NOT_FOUND should be returned). + - HA_READ_PREFIX_LAST_OR_PREV has no such requirement. If there are no + records with prefix=lookup, we should return the last record + before that. + */ + rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts); + if (rc == 0) { + const rocksdb::Slice &rkey = m_scan_it->key(); + if (!kd.covers_key(rkey)) { + /* The record we've got is not from this index */ rc = HA_ERR_KEY_NOT_FOUND; + } else if (find_flag == HA_READ_PREFIX_LAST) { + uint size = kd.pack_index_tuple(table, m_pack_buffer, + m_sk_packed_tuple, key, keypart_map); + rocksdb::Slice lookup_tuple( + reinterpret_cast(m_sk_packed_tuple), size); + + // We need to compare the key we've got with the original search + // prefix. 
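      // [Editor's note] i.e. a starts_with test over memcmp-ordered key
      // encodings; rocksdb::Slice::starts_with() has the same semantics.
      // A plausible stand-alone sketch of such a prefix check (the real
      // logic lives in Rdb_key_def::value_matches_prefix()):
      //   bool has_prefix(const rocksdb::Slice &v, const rocksdb::Slice &p) {
      //     return v.size() >= p.size() &&
      //            memcmp(v.data(), p.data(), p.size()) == 0;
      //   }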
+ if (!kd.value_matches_prefix(rkey, lookup_tuple)) { + rc = HA_ERR_KEY_NOT_FOUND; + } } } - } - break; - default: - DBUG_ASSERT(0); - break; + break; + default: + DBUG_ASSERT(0); + break; } return rc; @@ -7640,11 +7527,10 @@ int ha_rocksdb::position_to_correct_key( int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd, const enum ha_rkey_function &find_flag, const rocksdb::Slice &slice, - const int &bytes_changed_by_succ, + const int bytes_changed_by_succ, const key_range *const end_key, uint *const end_key_packed_size) { - if (find_flag == HA_READ_KEY_EXACT) - return slice.size(); + if (find_flag == HA_READ_KEY_EXACT) return slice.size(); if (find_flag == HA_READ_PREFIX_LAST) { /* @@ -7714,8 +7600,18 @@ int ha_rocksdb::read_row_from_secondary_key(uchar *const buf, const rocksdb::Slice &rkey = m_scan_it->key(); const rocksdb::Slice &value = m_scan_it->value(); - bool covered_lookup = - m_keyread_only || kd.covers_lookup(table, &value, &m_lookup_bitmap); +#ifndef DBUG_OFF + bool save_keyread_only = m_keyread_only; +#endif + DBUG_EXECUTE_IF("dbug.rocksdb.HA_EXTRA_KEYREAD", { m_keyread_only = true; }); + + bool covered_lookup = (m_keyread_only && kd.can_cover_lookup()) || + kd.covers_lookup(&value, &m_lookup_bitmap); + +#ifndef DBUG_OFF + m_keyread_only = save_keyread_only; +#endif + if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) { pk_size = kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple); @@ -7723,12 +7619,11 @@ int ha_rocksdb::read_row_from_secondary_key(uchar *const buf, rc = HA_ERR_ROCKSDB_CORRUPT_DATA; } else { rc = kd.unpack_record(table, buf, &rkey, &value, - m_verify_row_debug_checksums); + m_converter->get_verify_row_debug_checksums()); global_stats.covered_secondary_key_lookups.inc(); } } else { - if (kd.m_is_reverse_cf) - move_forward = !move_forward; + if (kd.m_is_reverse_cf) move_forward = !move_forward; rc = find_icp_matching_index_rec(move_forward, buf); if (!rc) { @@ -7775,11 +7670,12 @@ int ha_rocksdb::read_row_from_secondary_key(uchar *const buf, ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const { DBUG_ENTER_FUNC(); - ulong base_flags = HA_READ_NEXT | // doesn't seem to be used + ulong base_flags = HA_READ_NEXT | // doesn't seem to be used HA_READ_ORDER | HA_READ_RANGE | HA_READ_PREV; - if (check_keyread_allowed(inx, part, all_parts)) + if (check_keyread_allowed(inx, part, all_parts)) { base_flags |= HA_KEYREAD_ONLY; + } if (inx == table_share->primary_key) { /* @@ -7835,13 +7731,15 @@ int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) { rocksdb::Slice value = m_scan_it->value(); bool covered_lookup = - m_keyread_only || m_key_descr_arr[keyno]->covers_lookup( - table, &value, &m_lookup_bitmap); + (m_keyread_only && m_key_descr_arr[keyno]->can_cover_lookup()) || + m_key_descr_arr[keyno]->covers_lookup(&value, &m_lookup_bitmap); if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) { rc = m_key_descr_arr[keyno]->unpack_record( - table, buf, &key, &value, m_verify_row_debug_checksums); + table, buf, &key, &value, + m_converter->get_verify_row_debug_checksums()); global_stats.covered_secondary_key_lookups.inc(); } else { + DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete_sk"); rc = get_row_by_rowid(buf, m_pk_packed_tuple, size); } @@ -7879,9 +7777,10 @@ int ha_rocksdb::read_range_first(const key_range *const start_key, range_key_part = table->key_info[active_index].key_part; - if (!start_key) // Read first record + if (!start_key) { + // Read first record result = ha_index_first(table->record[0]); - else { + 
} else { if (is_using_prohibited_gap_locks( table, is_using_full_unique_key(active_index, start_key->keypart_map, @@ -7895,8 +7794,9 @@ int ha_rocksdb::read_range_first(const key_range *const start_key, start_key->keypart_map, start_key->flag, end_key); }) } - if (result) + if (result) { DBUG_RETURN((result == HA_ERR_KEY_NOT_FOUND) ? HA_ERR_END_OF_FILE : result); + } if (compare_key(end_range) <= 0) { DBUG_RETURN(HA_EXIT_SUCCESS); @@ -7946,15 +7846,24 @@ int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key, const key_range *end_key) { DBUG_ENTER_FUNC(); + DBUG_EXECUTE_IF("myrocks_busy_loop_on_row_read", int debug_i = 0; + while (1) { debug_i++; }); + int rc = 0; + THD *thd = ha_thd(); + DEBUG_SYNC(thd, "rocksdb.check_flags_rmi"); + if (thd && thd->killed) { + rc = HA_ERR_QUERY_INTERRUPTED; + DBUG_RETURN(rc); + } + ha_statistic_increment(&SSV::ha_read_key_count); const Rdb_key_def &kd = *m_key_descr_arr[active_index]; const uint actual_key_parts = kd.get_key_parts(); bool using_full_key = is_using_full_key(keypart_map, actual_key_parts); - if (!end_key) - end_key = end_range; + if (!end_key) end_key = end_range; /* By default, we don't need the retrieved records to match the prefix */ m_sk_match_prefix = nullptr; @@ -7970,8 +7879,7 @@ int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key, m_pk_packed_tuple, key, keypart_map); bool skip_lookup = is_blind_delete_enabled(); - rc = get_row_by_rowid(buf, m_pk_packed_tuple, size, - skip_lookup, false); + rc = get_row_by_rowid(buf, m_pk_packed_tuple, size, skip_lookup, false); if (!rc && !skip_lookup) { stats.rows_read++; @@ -7994,8 +7902,9 @@ int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key, packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple, key, tmp_map); if (table->key_info[active_index].user_defined_key_parts != - kd.get_key_parts()) + kd.get_key_parts()) { using_full_key = false; + } } else { packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple, key, keypart_map); @@ -8037,14 +7946,20 @@ int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key, bool use_all_keys = false; if (find_flag == HA_READ_KEY_EXACT && - my_count_bits(keypart_map) == kd.get_key_parts()) + my_count_bits(keypart_map) == kd.get_key_parts()) { use_all_keys = true; + } Rdb_transaction *const tx = get_or_create_tx(table->in_use); const bool is_new_snapshot = !tx->has_snapshot(); // Loop as long as we get a deadlock error AND we end up creating the // snapshot here (i.e. it did not exist prior to this) for (;;) { + DEBUG_SYNC(thd, "rocksdb.check_flags_rmi_scan"); + if (thd && thd->killed) { + rc = HA_ERR_QUERY_INTERRUPTED; + break; + } /* This will open the iterator and position it at a record that's equal or greater than the lookup tuple. @@ -8061,9 +7976,7 @@ int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key, slice, &move_forward, tx->m_snapshot_timestamp); if (rc) { - /* This status is returned on any error */ - table->status = STATUS_NOT_FOUND; - DBUG_RETURN(rc); + break; } m_skip_scan_it_next_call = false; @@ -8073,13 +7986,15 @@ int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key, then we have all the rows we need. For a secondary key we now need to lookup the primary key. 
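
      [Editor's note] Compressed view of the two paths taken just below
      (names as in this patch): read_row_from_primary_key() decodes the row
      straight from the PK key/value pair, while read_row_from_secondary_key()
      either unpacks a covered lookup directly from the SK entry or extracts
      the PK with get_primary_key_tuple() and re-reads the row via
      get_row_by_rowid().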
*/ - if (active_index == table->s->primary_key) + if (active_index == table->s->primary_key) { rc = read_row_from_primary_key(buf); - else + } else { rc = read_row_from_secondary_key(buf, kd, move_forward); + } - if (rc != HA_ERR_ROCKSDB_STATUS_BUSY || !is_new_snapshot) + if (!should_recreate_snapshot(rc, is_new_snapshot)) { break; /* Exit the loop */ + } // release the snapshot and iterator so they will be regenerated tx->release_snapshot(); @@ -8087,7 +8002,10 @@ int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key, } if (rc) { - /* the only possible error condition is record-not-found */ + /* + This status is returned on any error + the only possible error condition is record-not-found + */ table->status = STATUS_NOT_FOUND; } else { table->status = 0; @@ -8119,13 +8037,21 @@ int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key, other - error code */ -int ha_rocksdb::find_icp_matching_index_rec(const bool &move_forward, +int ha_rocksdb::find_icp_matching_index_rec(const bool move_forward, uchar *const buf) { if (pushed_idx_cond && pushed_idx_cond_keyno == active_index) { const Rdb_key_def &kd = *m_key_descr_arr[active_index]; + THD *thd = ha_thd(); while (1) { - rocksdb_skip_expired_records(kd, m_scan_it, !move_forward); + int rc = rocksdb_skip_expired_records(kd, m_scan_it, !move_forward); + if (rc != HA_EXIT_SUCCESS) { + return rc; + } + + if (thd && thd->killed) { + return HA_ERR_QUERY_INTERRUPTED; + } if (!is_valid(m_scan_it)) { table->status = STATUS_NOT_FOUND; @@ -8149,7 +8075,7 @@ int ha_rocksdb::find_icp_matching_index_rec(const bool &move_forward, const rocksdb::Slice value = m_scan_it->value(); int err = kd.unpack_record(table, buf, &rkey, &value, - m_verify_row_debug_checksums); + m_converter->get_verify_row_debug_checksums()); if (err != HA_EXIT_SUCCESS) { return err; } @@ -8168,14 +8094,6 @@ int ha_rocksdb::find_icp_matching_index_rec(const bool &move_forward, * row. */ break; } - /* - TODO: should we have this here, or RockDB handles this internally? 
- if (my_core::thd_killed(current_thd)) - { - rc= HA_ERR_INTERNAL_ERROR; // doesn't matter - break; - } - */ } } return HA_EXIT_SUCCESS; @@ -8213,14 +8131,14 @@ int ha_rocksdb::check(THD *const thd, HA_CHECK_OPT *const check_opt) { // Rdb_transaction *tx= get_or_create_tx(table->in_use); // tx->snapshot= nullptr; - const bool save_verify_row_debug_checksums = m_verify_row_debug_checksums; - m_verify_row_debug_checksums = true; - + bool save_verify_row_debug_checksums = + m_converter->get_verify_row_debug_checksums(); + m_converter->set_verify_row_debug_checksums(true); /* For each secondary index, check that we can get a PK value from it */ // NO_LINT_DEBUG sql_print_information("CHECKTABLE %s: Checking table %s", table_name, table_name); - ha_rows UNINIT_VAR(row_checksums_at_start); // set/used iff first_index==true + ha_rows UNINIT_VAR(row_checksums_at_start); // set/used iff first_index==true ha_rows row_checksums = ha_rows(-1); bool first_index = true; @@ -8230,20 +8148,21 @@ int ha_rocksdb::check(THD *const thd, HA_CHECK_OPT *const check_opt) { ha_index_init(keyno, true); ha_rows rows = 0; ha_rows checksums = 0; - if (first_index) - row_checksums_at_start = m_row_checksums_checked; + if (first_index) { + row_checksums_at_start = m_converter->get_row_checksums_checked(); + } int res; // NO_LINT_DEBUG sql_print_information("CHECKTABLE %s: Checking index %s", table_name, table->key_info[keyno].name); while (1) { - if (!rows) + if (!rows) { res = index_first(table->record[0]); - else + } else { res = index_next(table->record[0]); + } - if (res == HA_ERR_END_OF_FILE) - break; + if (res == HA_ERR_END_OF_FILE) break; if (res) { // error // NO_LINT_DEBUG @@ -8264,16 +8183,18 @@ int ha_rocksdb::check(THD *const thd, HA_CHECK_OPT *const check_opt) { if ((res = get_row_by_rowid(table->record[0], rowkey_copy.ptr(), rowkey_copy.length()))) { // NO_LINT_DEBUG - sql_print_error("CHECKTABLE %s: .. row %lld: " - "failed to fetch row by rowid", - table_name, rows); + sql_print_error( + "CHECKTABLE %s: .. row %lld: " + "failed to fetch row by rowid", + table_name, rows); goto error; } longlong hidden_pk_id = 0; if (has_hidden_pk(table) && - read_hidden_pk_id_from_rowkey(&hidden_pk_id)) + read_hidden_pk_id_from_rowkey(&hidden_pk_id)) { goto error; + } /* Check if we get the same PK value */ uint packed_size = m_pk_descr->pack_record( @@ -8294,9 +8215,10 @@ int ha_rocksdb::check(THD *const thd, HA_CHECK_OPT *const check_opt) { if (packed_size != sec_key_copy.length() || memcmp(m_sk_packed_tuple, sec_key_copy.ptr(), packed_size)) { // NO_LINT_DEBUG - sql_print_error("CHECKTABLE %s: .. row %lld: " - "secondary index value mismatch", - table_name, rows); + sql_print_error( + "CHECKTABLE %s: .. row %lld: " + "secondary index value mismatch", + table_name, rows); goto print_and_error; } rows++; @@ -8323,12 +8245,14 @@ int ha_rocksdb::check(THD *const thd, HA_CHECK_OPT *const check_opt) { } } // NO_LINT_DEBUG - sql_print_information("CHECKTABLE %s: ... %lld index entries checked " - "(%lld had checksums)", - table_name, rows, checksums); + sql_print_information( + "CHECKTABLE %s: ... 
%lld index entries checked " + "(%lld had checksums)", + table_name, rows, checksums); if (first_index) { - row_checksums = m_row_checksums_checked - row_checksums_at_start; + row_checksums = + m_converter->get_row_checksums_checked() - row_checksums_at_start; first_index = false; } ha_index_end(); @@ -8341,16 +8265,16 @@ int ha_rocksdb::check(THD *const thd, HA_CHECK_OPT *const check_opt) { } extra(HA_EXTRA_NO_KEYREAD); - m_verify_row_debug_checksums = save_verify_row_debug_checksums; + m_converter->set_verify_row_debug_checksums(save_verify_row_debug_checksums); /* - TODO: we should check also for PK records that are missing in the secondary - indexes. + TODO(sergiy): we should check also for PK records that are missing in + the secondary indexes. For that, need to walk through the PK and check that every PK record has a proper counterpart in each secondary index. */ DBUG_RETURN(HA_ADMIN_OK); error: - m_verify_row_debug_checksums = save_verify_row_debug_checksums; + m_converter->set_verify_row_debug_checksums(save_verify_row_debug_checksums); ha_index_or_rnd_end(); extra(HA_EXTRA_NO_KEYREAD); @@ -8360,10 +8284,11 @@ error: static void dbug_dump_str(FILE *const out, const char *const str, int len) { fprintf(out, "\""); for (int i = 0; i < len; i++) { - if (str[i] > 32) + if (str[i] > 32) { fprintf(out, "%c", str[i]); - else + } else { fprintf(out, "\\%d", str[i]); + } } fprintf(out, "\""); } @@ -8376,8 +8301,7 @@ static void dbug_dump_str(FILE *const out, const char *const str, int len) { void dbug_dump_database(rocksdb::DB *const db) { FILE *const out = fopen("/tmp/rocksdb.dump", "wt"); - if (!out) - return; + if (!out) return; rocksdb::Iterator *it = db->NewIterator(rocksdb::ReadOptions()); for (it->SeekToFirst(); it->Valid(); it->Next()) { @@ -8399,30 +8323,23 @@ rocksdb::Status ha_rocksdb::get_for_update( DBUG_ASSERT(m_lock_rows != RDB_LOCK_NONE); const bool exclusive = m_lock_rows != RDB_LOCK_READ; - rocksdb::Status s = tx->get_for_update(column_family, key, value, exclusive); - - // If we have a lock conflict and we are running in READ COMMITTTED mode - // release and reacquire the snapshot and then retry the get_for_update(). 
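
  [Editor's note] Rationale, as far as it can be inferred from the patch:
  under READ COMMITTED each statement may take a fresh snapshot, so there is
  no snapshot consistency to defend and the new code above passes
  do_validate == false to skip conflict validation, instead of the
  release/reacquire/retry dance on kBusy that the removed block below
  performed. At REPEATABLE READ and stricter levels do_validate stays true
  and a conflicting write still fails the statement.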
- if (s.IsBusy() && !s.IsDeadlock() && - my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED) { - tx->release_snapshot(); - tx->acquire_snapshot(false); - - s = tx->get_for_update(column_family, key, value, exclusive); - } + const bool do_validate = + my_core::thd_tx_isolation(ha_thd()) > ISO_READ_COMMITTED; + rocksdb::Status s = + tx->get_for_update(column_family, key, value, exclusive, do_validate); +#ifndef DBUG_OFF + ++rocksdb_num_get_for_update_calls; +#endif return s; } -bool ha_rocksdb::is_blind_delete_enabled() -{ +bool ha_rocksdb::is_blind_delete_enabled() { THD *thd = ha_thd(); return (THDVAR(thd, blind_delete_primary_key) && thd->lex->sql_command == SQLCOM_DELETE && - thd->lex->table_count == 1 && - table->s->keys == 1 && - !has_hidden_pk(table) && - !thd->rli_slave); + thd->lex->table_count == 1 && table->s->keys == 1 && + !has_hidden_pk(table) && !thd->rli_slave); } /* @@ -8450,8 +8367,9 @@ int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid, DEBUG_SYNC(ha_thd(), "rocksdb.get_row_by_rowid"); DBUG_EXECUTE_IF("dbug.rocksdb.get_row_by_rowid", { THD *thd = ha_thd(); - const char act[] = "now signal Reached " - "wait_for signal.rocksdb.get_row_by_rowid_let_running"; + const char act[] = + "now signal Reached " + "wait_for signal.rocksdb.get_row_by_rowid_let_running"; DBUG_ASSERT(opt_debug_sync_timeout > 0); DBUG_ASSERT(!debug_sync_set_action(thd, STRING_WITH_LEN(act))); };); @@ -8460,8 +8378,7 @@ int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid, rocksdb::Status s; /* Pretend row found without looking up */ - if (skip_lookup) - { + if (skip_lookup) { stats.rows_deleted_blind++; update_row_stats(ROWS_DELETED_BLIND); m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin); @@ -8472,6 +8389,17 @@ int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid, if (m_lock_rows == RDB_LOCK_NONE) { tx->acquire_snapshot(true); s = tx->get(m_pk_descr->get_cf(), key_slice, &m_retrieved_record); + } else if (m_insert_with_update && m_dup_pk_found) { + DBUG_ASSERT(m_pk_descr->get_keyno() == m_dupp_errkey); + DBUG_ASSERT(m_dup_pk_retrieved_record.length() == + m_retrieved_record.size()); + DBUG_ASSERT(memcmp(m_dup_pk_retrieved_record.ptr(), + m_retrieved_record.data(), + m_retrieved_record.size()) == 0); + + // do nothing - we already have the result in m_retrieved_record and + // already taken the lock + s = rocksdb::Status::OK(); } else { s = get_for_update(tx, m_pk_descr->get_cf(), key_slice, &m_retrieved_record); @@ -8527,8 +8455,7 @@ int ha_rocksdb::index_next(uchar *const buf) { } int rc = index_next_with_direction(buf, moves_forward); - if (rc == HA_ERR_KEY_NOT_FOUND) - rc = HA_ERR_END_OF_FILE; + if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE; DBUG_RETURN(rc); } @@ -8548,8 +8475,7 @@ int ha_rocksdb::index_prev(uchar *const buf) { } int rc = index_next_with_direction(buf, moves_forward); - if (rc == HA_ERR_KEY_NOT_FOUND) - rc = HA_ERR_END_OF_FILE; + if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE; DBUG_RETURN(rc); } @@ -8562,19 +8488,33 @@ int ha_rocksdb::index_next_with_direction(uchar *const buf, bool move_forward) { if (active_index == pk_index(table, m_tbl_def)) { rc = rnd_next_with_direction(buf, move_forward); } else { - if (m_skip_scan_it_next_call) { - m_skip_scan_it_next_call = false; - } else { - if (move_forward) - m_scan_it->Next(); /* this call cannot fail */ - else - m_scan_it->Prev(); + THD *thd = ha_thd(); + for (;;) { + DEBUG_SYNC(thd, "rocksdb.check_flags_inwd"); + if (thd && 
thd->killed) { + rc = HA_ERR_QUERY_INTERRUPTED; + break; + } + if (m_skip_scan_it_next_call) { + m_skip_scan_it_next_call = false; + } else { + if (move_forward) { + m_scan_it->Next(); /* this call cannot fail */ + } else { + m_scan_it->Prev(); + } + } + rc = rocksdb_skip_expired_records(*m_key_descr_arr[active_index], + m_scan_it, !move_forward); + if (rc != HA_EXIT_SUCCESS) { + break; + } + rc = find_icp_matching_index_rec(move_forward, buf); + if (!rc) rc = secondary_index_read(active_index, buf); + if (!should_skip_invalidated_record(rc)) { + break; + } } - rocksdb_skip_expired_records(*m_key_descr_arr[active_index], m_scan_it, - !move_forward); - rc = find_icp_matching_index_rec(move_forward, buf); - if (!rc) - rc = secondary_index_read(active_index, buf); } DBUG_RETURN(rc); @@ -8593,8 +8533,7 @@ int ha_rocksdb::index_first(uchar *const buf) { int rc = m_key_descr_arr[active_index]->m_is_reverse_cf ? index_last_intern(buf) : index_first_intern(buf); - if (rc == HA_ERR_KEY_NOT_FOUND) - rc = HA_ERR_END_OF_FILE; + if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE; DBUG_RETURN(rc); } @@ -8612,8 +8551,7 @@ int ha_rocksdb::index_last(uchar *const buf) { int rc = m_key_descr_arr[active_index]->m_is_reverse_cf ? index_first_intern(buf) : index_last_intern(buf); - if (rc == HA_ERR_KEY_NOT_FOUND) - rc = HA_ERR_END_OF_FILE; + if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE; DBUG_RETURN(rc); } @@ -8681,8 +8619,9 @@ int ha_rocksdb::index_first_intern(uchar *const buf) { m_skip_scan_it_next_call = true; rc = index_next_with_direction(buf, true); - if (rc != HA_ERR_ROCKSDB_STATUS_BUSY || !is_new_snapshot) - break; // exit the loop + if (!should_recreate_snapshot(rc, is_new_snapshot)) { + break; /* exit the loop */ + } // release the snapshot and iterator so they will be regenerated tx->release_snapshot(); @@ -8754,11 +8693,7 @@ int ha_rocksdb::index_last_intern(uchar *const buf) { DBUG_ASSERT(key != nullptr); const Rdb_key_def &kd = *m_key_descr_arr[active_index]; - if (kd.m_is_reverse_cf) { - kd.get_infimum_key(key, &key_size); - } else { - kd.get_supremum_key(key, &key_size); - } + int key_end_matching_bytes = kd.get_last_key(key, &key_size); rocksdb::Slice index_key((const char *)key, key_size); @@ -8769,7 +8704,7 @@ int ha_rocksdb::index_last_intern(uchar *const buf) { // Loop as long as we get a deadlock error AND we end up creating the // snapshot here (i.e. it did not exist prior to this) for (;;) { - setup_scan_iterator(kd, &index_key, false, Rdb_key_def::INDEX_NUMBER_SIZE); + setup_scan_iterator(kd, &index_key, false, key_end_matching_bytes); m_scan_it->SeekForPrev(index_key); m_skip_scan_it_next_call = false; @@ -8778,12 +8713,12 @@ int ha_rocksdb::index_last_intern(uchar *const buf) { rc = rnd_next_with_direction(buf, false); } else { rc = find_icp_matching_index_rec(false /*move_forward*/, buf); - if (!rc) - rc = secondary_index_read(active_index, buf); + if (!rc) rc = secondary_index_read(active_index, buf); } - if (rc != HA_ERR_ROCKSDB_STATUS_BUSY || !is_new_snapshot) + if (!should_recreate_snapshot(rc, is_new_snapshot)) { break; /* exit the loop */ + } // release the snapshot and iterator so they will be regenerated tx->release_snapshot(); @@ -8820,7 +8755,7 @@ void ha_rocksdb::unlock_row() { - If the index is PRIMARY KEY, and if all of the columns of the table are covered by the PRIMARY KEY, SingleDelete can be used. 
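
  [Editor's note] The underlying constraint is RocksDB's SingleDelete()
  contract: it is only correct for a key that has been Put() at most once
  since the previous delete. Secondary-key entries qualify because an index
  update is always delete-old + insert-new, and a PK that covers every
  column never has its value overwritten in place; any other PK must use
  the regular Delete() tombstone instead.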
*/ -bool ha_rocksdb::can_use_single_delete(const uint &index) const { +bool ha_rocksdb::can_use_single_delete(const uint index) const { return (index != pk_index(table, m_tbl_def) || (!has_hidden_pk(table) && table->key_info[index].actual_key_parts == table->s->fields)); @@ -8836,12 +8771,14 @@ bool ha_rocksdb::skip_unique_check() const { 3) the user set unique_checks option to 0, and the table does not have any indexes. If the table has secondary keys, then those might becomes inconsisted/corrupted + 4) We're using read-free replication */ return THDVAR(table->in_use, bulk_load) || (m_force_skip_unique_check && m_skip_unique_check) || (my_core::thd_test_options(table->in_use, OPTION_RELAXED_UNIQUE_CHECKS) && - m_tbl_def->m_key_count == 1); + m_tbl_def->m_key_count == 1) || + use_read_free_rpl(); } void ha_rocksdb::set_force_skip_unique_check(bool skip) { @@ -8939,10 +8876,9 @@ const char *ha_rocksdb::get_key_comment(const uint index, return table_arg->key_info[index].comment.str; } -const std::string ha_rocksdb::generate_cf_name(const uint index, - const TABLE *const table_arg, - const Rdb_tbl_def *const tbl_def_arg, - bool *per_part_match_found) { +const std::string ha_rocksdb::generate_cf_name( + const uint index, const TABLE *const table_arg, + const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found) { DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(tbl_def_arg != nullptr); DBUG_ASSERT(per_part_match_found != nullptr); @@ -8987,6 +8923,9 @@ const std::string ha_rocksdb::get_table_comment(const TABLE *const table_arg) { } /** + Write a new row + + @param[in] buf new row data to write @return HA_EXIT_SUCCESS OK other HA_ERR error code (can be SE-specific) @@ -9010,6 +8949,11 @@ int ha_rocksdb::write_row(uchar *const buf) { } } + // clear cache at beginning of write for INSERT ON DUPLICATE + // we may get multiple write->fail->read->update if there are multiple + // values from INSERT + m_dup_pk_found = false; + const int rv = update_write_row(nullptr, buf, skip_unique_check()); if (rv == 0) { @@ -9038,6 +8982,15 @@ void ha_rocksdb::set_last_rowkey(const uchar *const old_data) { } } +/** + Collect update data for primary key + + @param[in, out] row_info hold all data for update row, such as + new row data/old row data + @return + HA_EXIT_SUCCESS OK + other HA_ERR error code (can be SE-specific) +*/ int ha_rocksdb::get_pk_for_update(struct update_row_info *const row_info) { int size; @@ -9050,10 +9003,9 @@ int ha_rocksdb::get_pk_for_update(struct update_row_info *const row_info) { row_info->new_pk_unpack_info = &m_pk_unpack_info; - size = - m_pk_descr->pack_record(table, m_pack_buffer, row_info->new_data, - m_pk_packed_tuple, row_info->new_pk_unpack_info, - false, 0, 0, nullptr, &row_info->ttl_pk_offset); + size = m_pk_descr->pack_record( + table, m_pack_buffer, row_info->new_data, m_pk_packed_tuple, + row_info->new_pk_unpack_info, false, 0, 0, nullptr); } else if (row_info->old_data == nullptr) { row_info->hidden_pk_id = update_hidden_pk_val(); size = @@ -9077,30 +9029,28 @@ int ha_rocksdb::get_pk_for_update(struct update_row_info *const row_info) { return HA_EXIT_SUCCESS; } -int ha_rocksdb::check_and_lock_unique_pk(const uint &key_id, +/** + Check the specified primary key value is unique and also lock the row + + @param[in] key_id key index + @param[in] row_info hold all data for update row, such as old row + data and new row data + @param[out] found whether the primary key exists before. 
+ @param[out] pk_changed whether primary key is changed + @return + HA_EXIT_SUCCESS OK + other HA_ERR error code (can be SE-specific) +*/ +int ha_rocksdb::check_and_lock_unique_pk(const uint key_id, const struct update_row_info &row_info, - bool *const found, - bool *const pk_changed) { + bool *const found) { DBUG_ASSERT(found != nullptr); - DBUG_ASSERT(pk_changed != nullptr); - - *pk_changed = false; - /* - For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs - always require locking. - */ - if (row_info.old_pk_slice.size() > 0) { - /* - If the keys are the same, then no lock is needed - */ - if (!row_info.new_pk_slice.compare(row_info.old_pk_slice)) { - *found = false; - return HA_EXIT_SUCCESS; - } + DBUG_ASSERT(row_info.old_pk_slice.size() == 0 || + row_info.new_pk_slice.compare(row_info.old_pk_slice) != 0); - *pk_changed = true; - } + /* Ignore PK violations if this is a optimized 'replace into' */ + const bool ignore_pk_unique_check = ha_thd()->lex->blind_replace_into; /* Perform a read to determine if a duplicate entry exists. For primary @@ -9123,17 +9073,56 @@ int ha_rocksdb::check_and_lock_unique_pk(const uint &key_id, */ const rocksdb::Status s = get_for_update(row_info.tx, m_pk_descr->get_cf(), row_info.new_pk_slice, - &m_retrieved_record); + ignore_pk_unique_check ? nullptr : &m_retrieved_record); if (!s.ok() && !s.IsNotFound()) { return row_info.tx->set_status_error( table->in_use, s, *m_key_descr_arr[key_id], m_tbl_def, m_table_handler); } - *found = !s.IsNotFound(); + bool key_found = ignore_pk_unique_check ? false : !s.IsNotFound(); + + /* + If the pk key has ttl, we may need to pretend the row wasn't + found if it is already expired. + */ + if (key_found && m_pk_descr->has_ttl() && + should_hide_ttl_rec(*m_pk_descr, m_retrieved_record, + (row_info.tx->m_snapshot_timestamp + ? row_info.tx->m_snapshot_timestamp + : static_cast(std::time(nullptr))))) { + key_found = false; + } + + if (key_found && row_info.old_data == nullptr && m_insert_with_update) { + // In INSERT ON DUPLICATE KEY UPDATE ... case, if the insert failed + // due to a duplicate key, remember the last key and skip the check + // next time + m_dup_pk_found = true; + +#ifndef DBUG_OFF + // save it for sanity checking later + m_dup_pk_retrieved_record.copy(m_retrieved_record.data(), + m_retrieved_record.size(), &my_charset_bin); +#endif + } + + *found = key_found; + return HA_EXIT_SUCCESS; } -int ha_rocksdb::check_and_lock_sk(const uint &key_id, +/** + Check the specified secondary key value is unique and also lock the row + + @param[in] key_id key index + @param[in] row_info hold all data for update row, such as old row + data and new row data + @param[out] found whether specified key value exists before. 
+ @return + HA_EXIT_SUCCESS OK + other HA_ERR error code (can be SE-specific) +*/ +int ha_rocksdb::check_and_lock_sk(const uint key_id, const struct update_row_info &row_info, bool *const found) { DBUG_ASSERT(found != nullptr); @@ -9263,8 +9252,18 @@ int ha_rocksdb::check_and_lock_sk(const uint &key_id, return HA_EXIT_SUCCESS; } +/** + Enumerate all keys to check their uniquess and also lock it + + @param[in] row_info hold all data for update row, such as old row + data and new row data + @param[out] pk_changed whether primary key is changed + @return + HA_EXIT_SUCCESS OK + other HA_ERR error code (can be SE-specific) +*/ int ha_rocksdb::check_uniqueness_and_lock( - const struct update_row_info &row_info, bool *const pk_changed) { + const struct update_row_info &row_info, bool pk_changed) { /* Go through each index and determine if the index has uniqueness requirements. If it does, then try to obtain a row lock on the new values. @@ -9276,7 +9275,12 @@ int ha_rocksdb::check_uniqueness_and_lock( int rc; if (is_pk(key_id, table, m_tbl_def)) { - rc = check_and_lock_unique_pk(key_id, row_info, &found, pk_changed); + if (row_info.old_pk_slice.size() > 0 && !pk_changed) { + found = false; + rc = HA_EXIT_SUCCESS; + } else { + rc = check_and_lock_unique_pk(key_id, row_info, &found); + } } else { rc = check_and_lock_sk(key_id, row_info, &found); } @@ -9285,23 +9289,11 @@ int ha_rocksdb::check_uniqueness_and_lock( return rc; } - /* - If the pk key has ttl, we may need to pretend the row wasn't - found if it is already expired. The pk record is read into - m_retrieved_record by check_and_lock_unique_pk(). - */ - if (is_pk(key_id, table, m_tbl_def) && found && m_pk_descr->has_ttl() && - should_hide_ttl_rec(*m_pk_descr, m_retrieved_record, - (row_info.tx->m_snapshot_timestamp - ? row_info.tx->m_snapshot_timestamp - : static_cast(std::time(nullptr))))) { - found = false; - } - if (found) { /* There is a row with this key already, so error out. */ errkey = key_id; m_dupp_errkey = errkey; + return HA_ERR_FOUND_DUPP_KEY; } } @@ -9309,19 +9301,31 @@ int ha_rocksdb::check_uniqueness_and_lock( return HA_EXIT_SUCCESS; } +/** + Check whether secondary key value is duplicate or not + + @param[in] table_arg the table currently working on + @param[in key_def the key_def is being checked + @param[in] key secondary key storage data + @param[out] sk_info hold secondary key memcmp datas(new/old) + @return + HA_EXIT_SUCCESS OK + other HA_ERR error code (can be SE-specific) +*/ + int ha_rocksdb::check_duplicate_sk(const TABLE *table_arg, - const Rdb_key_def &index, + const Rdb_key_def &key_def, const rocksdb::Slice *key, struct unique_sk_buf_info *sk_info) { uint n_null_fields = 0; - const rocksdb::Comparator *index_comp = index.get_cf()->GetComparator(); + const rocksdb::Comparator *index_comp = key_def.get_cf()->GetComparator(); /* Get proper SK buffer. 
*/ uchar *sk_buf = sk_info->swap_and_get_sk_buf(); /* Get memcmp form of sk without extended pk tail */ uint sk_memcmp_size = - index.get_memcmp_sk_parts(table_arg, *key, sk_buf, &n_null_fields); + key_def.get_memcmp_sk_parts(table_arg, *key, sk_buf, &n_null_fields); sk_info->sk_memcmp_key = rocksdb::Slice(reinterpret_cast(sk_buf), sk_memcmp_size); @@ -9341,12 +9345,17 @@ int ha_rocksdb::bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd, const rocksdb::Slice &value, bool sort) { DBUG_ENTER_FUNC(); int res; + THD *thd = ha_thd(); + if (thd && thd->killed) { + DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED); + } + rocksdb::ColumnFamilyHandle *cf = kd.get_cf(); // In the case of unsorted inserts, m_sst_info allocated here is not // used to store the keys. It is still used to indicate when tables // are switched. - if (m_sst_info == nullptr || m_sst_info->is_committed()) { + if (m_sst_info == nullptr || m_sst_info->is_done()) { m_sst_info.reset(new Rdb_sst_info(rdb, m_table_handler->m_table_name, kd.get_name(), cf, *rocksdb_db_options, THDVAR(ha_thd(), trace_sst_api))); @@ -9379,17 +9388,61 @@ int ha_rocksdb::finalize_bulk_load(bool print_client_error) { /* Skip if there are no possible ongoing bulk loads */ if (m_sst_info) { - res = m_sst_info->commit(print_client_error); + if (m_sst_info->is_done()) { + m_sst_info.reset(); + DBUG_RETURN(res); + } + + Rdb_sst_info::Rdb_sst_commit_info commit_info; + + // Wrap up the current work in m_sst_info and get ready to commit + // This transfer the responsibility of commit over to commit_info + res = m_sst_info->finish(&commit_info, print_client_error); + if (res == 0) { + // Make sure we have work to do - under race condition we could lose + // to another thread and end up with no work + if (commit_info.has_work()) { + rocksdb::IngestExternalFileOptions opts; + opts.move_files = true; + opts.snapshot_consistency = false; + opts.allow_global_seqno = false; + opts.allow_blocking_flush = false; + + const rocksdb::Status s = rdb->IngestExternalFile( + commit_info.get_cf(), commit_info.get_committed_files(), opts); + if (!s.ok()) { + if (print_client_error) { + Rdb_sst_info::report_error_msg(s, nullptr); + } + res = HA_ERR_ROCKSDB_BULK_LOAD; + } else { + // Mark the list of SST files as committed, otherwise they'll get + // cleaned up when commit_info destructs + commit_info.commit(); + } + } + } m_sst_info.reset(); } DBUG_RETURN(res); } -int ha_rocksdb::update_pk(const Rdb_key_def &kd, - const struct update_row_info &row_info, - const bool &pk_changed) { - const uint key_id = kd.get_keyno(); - const bool hidden_pk = is_hidden_pk(key_id, table, m_tbl_def); +/** + Update an existing primary key record or write a new primary key record + + @param[in] kd the primary key is being update/write + @param[in] update_row_info hold all row data, such as old row data and + new row data + @param[in] pk_changed whether primary key is changed + @return + HA_EXIT_SUCCESS OK + Other HA_ERR error code (can be SE-specific) + */ +int ha_rocksdb::update_write_pk(const Rdb_key_def &kd, + const struct update_row_info &row_info, + bool pk_changed) { + uint key_id = kd.get_keyno(); + bool hidden_pk = is_hidden_pk(key_id, table, m_tbl_def); ulonglong bytes_written = 0; /* @@ -9417,7 +9470,10 @@ int ha_rocksdb::update_pk(const Rdb_key_def &kd, int rc = HA_EXIT_SUCCESS; rocksdb::Slice value_slice; /* Prepare the new record to be written into RocksDB */ - if ((rc = convert_record_to_storage_format(row_info, &value_slice))) { + if ((rc = m_converter->encode_value_slice( + 
m_pk_descr, row_info.new_pk_slice, row_info.new_pk_unpack_info, + !row_info.old_pk_slice.empty(), should_store_row_debug_checksums(), + m_ttl_bytes, &m_ttl_bytes_updated, &value_slice))) { return rc; } @@ -9437,7 +9493,9 @@ int ha_rocksdb::update_pk(const Rdb_key_def &kd, row_info.tx->get_indexed_write_batch()->Put(cf, row_info.new_pk_slice, value_slice); } else { - const auto s = row_info.tx->put(cf, row_info.new_pk_slice, value_slice); + const bool assume_tracked = can_assume_tracked(ha_thd()); + const auto s = row_info.tx->put(cf, row_info.new_pk_slice, value_slice, + assume_tracked); if (!s.ok()) { if (s.IsBusy()) { errkey = table->s->primary_key; @@ -9457,9 +9515,22 @@ int ha_rocksdb::update_pk(const Rdb_key_def &kd, return rc; } -int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, - const struct update_row_info &row_info, - const bool bulk_load_sk) { +/** + Update an existing secondary key record or write a new secondary key record + + @param[in] table_arg Table we're working on + @param[in] kd The secondary key being updated/written + @param[in] row_info data structure containing old row data and new row data + @param[in] bulk_load_sk whether bulk load is supported. Currently it is only + supported for writes + @return + HA_EXIT_SUCCESS OK + Other HA_ERR error code (can be SE-specific) + */ +int ha_rocksdb::update_write_sk(const TABLE *const table_arg, + const Rdb_key_def &kd, + const struct update_row_info &row_info, + const bool bulk_load_sk) { int new_packed_size; int old_packed_size; int rc = HA_EXIT_SUCCESS; @@ -9481,19 +9552,18 @@ int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, return HA_EXIT_SUCCESS; } - const bool store_row_debug_checksums = should_store_row_debug_checksums(); - + bool store_row_debug_checksums = should_store_row_debug_checksums(); new_packed_size = kd.pack_record(table_arg, m_pack_buffer, row_info.new_data, m_sk_packed_tuple, &m_sk_tails, store_row_debug_checksums, - row_info.hidden_pk_id, 0, nullptr, nullptr, m_ttl_bytes); + row_info.hidden_pk_id, 0, nullptr, m_ttl_bytes); if (row_info.old_data != nullptr) { // The old value old_packed_size = kd.pack_record( table_arg, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old, &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id, 0, - nullptr, nullptr, m_ttl_bytes); + nullptr, m_ttl_bytes); /* Check if we are going to write the same value. This can happen when @@ -9553,13 +9623,22 @@ int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, return rc; } -int ha_rocksdb::update_indexes(const struct update_row_info &row_info, - const bool &pk_changed) { +/** + Update existing indexes (PK/SKs) or write new indexes (PK/SKs) + + @param[in] row_info holds all row data, such as the old and new keys + @param[in] pk_changed whether the primary key has changed + @return + HA_EXIT_SUCCESS OK + Other HA_ERR error code (can be SE-specific) + */ +int ha_rocksdb::update_write_indexes(const struct update_row_info &row_info, + const bool pk_changed) { int rc; bool bulk_load_sk; // The PK must be updated first to pull out the TTL value.
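A note on the ordering constraint stated above: the TTL timestamp lives with the primary record, so the PK write has to run first to refresh the cached TTL bytes that the secondary key records then embed. What follows is a standalone sketch of that dependency only; the names are illustrative stand-ins, not the MyRocks members.

#include <array>
#include <cstdint>
#include <vector>

// Model: writing the PK yields the row's TTL timestamp; each SK record
// must embed those same bytes. Writing SKs first would bake in a stale
// timestamp, hence the PK-first ordering.
struct TtlBytes { std::array<uint8_t, 8> ts{}; };

static TtlBytes write_pk_model(uint64_t now) {
  TtlBytes t;
  for (int i = 7; i >= 0; --i) { t.ts[i] = now & 0xff; now >>= 8; }
  return t;
}

static std::vector<uint8_t> write_sk_model(const TtlBytes &ttl,
                                           std::vector<uint8_t> key) {
  key.insert(key.end(), ttl.ts.begin(), ttl.ts.end());  // embed TTL bytes
  return key;
}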
- rc = update_pk(*m_pk_descr, row_info, pk_changed); + rc = update_write_pk(*m_pk_descr, row_info, pk_changed); if (rc != HA_EXIT_SUCCESS) { return rc; } @@ -9574,7 +9653,8 @@ int ha_rocksdb::update_indexes(const struct update_row_info &row_info, continue; } - rc = update_sk(table, *m_key_descr_arr[key_id], row_info, bulk_load_sk); + rc = update_write_sk(table, *m_key_descr_arr[key_id], row_info, + bulk_load_sk); if (rc != HA_EXIT_SUCCESS) { return rc; } @@ -9583,11 +9663,26 @@ int ha_rocksdb::update_indexes(const struct update_row_info &row_info, return HA_EXIT_SUCCESS; } +/** + Update an existing row or write a new row + + @param[in] old_data nullptr for write, non-null for update + @param[in] new_data non-null for write/update + @param[in] skip_unique_check whether to skip the uniqueness check + @return + HA_EXIT_SUCCESS OK + Other HA_ERR error code (can be SE-specific) + */ int ha_rocksdb::update_write_row(const uchar *const old_data, const uchar *const new_data, const bool skip_unique_check) { DBUG_ENTER_FUNC(); + THD *thd = ha_thd(); + if (thd && thd->killed) { + DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED); + } + bool pk_changed = false; struct update_row_info row_info; @@ -9595,7 +9690,6 @@ int ha_rocksdb::update_write_row(const uchar *const old_data, row_info.new_data = new_data; row_info.skip_unique_check = skip_unique_check; row_info.new_pk_unpack_info = nullptr; - set_last_rowkey(old_data); row_info.tx = get_or_create_tx(table->in_use); @@ -9616,12 +9710,20 @@ int ha_rocksdb::update_write_row(const uchar *const old_data, DBUG_RETURN(rc); } + /* + For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs + always require locking. + */ + if (row_info.old_pk_slice.size() > 0) { + pk_changed = row_info.new_pk_slice.compare(row_info.old_pk_slice) != 0; + } + if (!skip_unique_check) { /* Check to see if we are going to have failures because of unique keys. Also lock the appropriate key values. */ - rc = check_uniqueness_and_lock(row_info, &pk_changed); + rc = check_uniqueness_and_lock(row_info, pk_changed); if (rc != HA_EXIT_SUCCESS) { DBUG_RETURN(rc); } @@ -9635,7 +9737,7 @@ int ha_rocksdb::update_write_row(const uchar *const old_data, here because updates to the transaction will be made and those updates cannot be easily removed without rolling back the entire transaction. */ - rc = update_indexes(row_info, pk_changed); + rc = update_write_indexes(row_info, pk_changed); if (rc != HA_EXIT_SUCCESS) { DBUG_RETURN(rc); } @@ -9671,18 +9773,29 @@ void ha_rocksdb::setup_iterator_bounds( const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, size_t bound_len, uchar *const lower_bound, uchar *const upper_bound, rocksdb::Slice *lower_bound_slice, rocksdb::Slice *upper_bound_slice) { - uint min_len = std::min(eq_cond.size(), bound_len); - memcpy(upper_bound, eq_cond.data(), min_len); - kd.successor(upper_bound, min_len); - memcpy(lower_bound, eq_cond.data(), min_len); - kd.predecessor(lower_bound, min_len); + // If eq_cond is shorter than Rdb_key_def::INDEX_NUMBER_SIZE, we should be + // able to get better bounds just by using index id directly.
+ if (eq_cond.size() <= Rdb_key_def::INDEX_NUMBER_SIZE) { + DBUG_ASSERT(bound_len == Rdb_key_def::INDEX_NUMBER_SIZE); + uint size; + kd.get_infimum_key(lower_bound, &size); + DBUG_ASSERT(size == Rdb_key_def::INDEX_NUMBER_SIZE); + kd.get_supremum_key(upper_bound, &size); + DBUG_ASSERT(size == Rdb_key_def::INDEX_NUMBER_SIZE); + } else { + DBUG_ASSERT(bound_len <= eq_cond.size()); + memcpy(upper_bound, eq_cond.data(), bound_len); + kd.successor(upper_bound, bound_len); + memcpy(lower_bound, eq_cond.data(), bound_len); + kd.predecessor(lower_bound, bound_len); + } if (kd.m_is_reverse_cf) { - *upper_bound_slice = rocksdb::Slice((const char *)lower_bound, min_len); - *lower_bound_slice = rocksdb::Slice((const char *)upper_bound, min_len); + *upper_bound_slice = rocksdb::Slice((const char *)lower_bound, bound_len); + *lower_bound_slice = rocksdb::Slice((const char *)upper_bound, bound_len); } else { - *upper_bound_slice = rocksdb::Slice((const char *)upper_bound, min_len); - *lower_bound_slice = rocksdb::Slice((const char *)lower_bound, min_len); + *upper_bound_slice = rocksdb::Slice((const char *)upper_bound, bound_len); + *lower_bound_slice = rocksdb::Slice((const char *)lower_bound, bound_len); } } @@ -9701,8 +9814,17 @@ void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd, bool skip_bloom = true; const rocksdb::Slice eq_cond(slice->data(), eq_cond_len); + // The size of m_scan_it_lower_bound (and upper) is technically + // max_packed_sk_len as calculated in ha_rocksdb::alloc_key_buffers. Rather + // than recalculating that number, we pass in the max of eq_cond_len and + // Rdb_key_def::INDEX_NUMBER_SIZE which is guaranteed to be smaller than + // max_packed_sk_len, hence ensuring no buffer overrun. + // + // See ha_rocksdb::setup_iterator_bounds on how the bound_len parameter is + // used. 
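For the successor/predecessor arithmetic used by the bounds above, here is a self-contained sketch of the byte-wise carry logic. It is a simplified model of the idea, not the Rdb_key_def implementation, which works in place on preallocated buffers.

#include <cstdint>
#include <vector>

// successor(): smallest key strictly above every key starting with this
// prefix; increment from the last byte, carrying through 0xff.
static void successor(std::vector<uint8_t> &k) {
  for (auto it = k.rbegin(); it != k.rend(); ++it) {
    if (*it != 0xff) { ++*it; return; }
    *it = 0x00;  // carry into the next byte
  }
}

// predecessor(): mirror image, borrowing through 0x00.
static void predecessor(std::vector<uint8_t> &k) {
  for (auto it = k.rbegin(); it != k.rend(); ++it) {
    if (*it != 0x00) { --*it; return; }
    *it = 0xff;  // borrow from the next byte
  }
}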
if (check_bloom_and_set_bounds( - ha_thd(), kd, eq_cond, use_all_keys, eq_cond_len, + ha_thd(), kd, eq_cond, use_all_keys, + std::max(eq_cond_len, (uint)Rdb_key_def::INDEX_NUMBER_SIZE), m_scan_it_lower_bound, m_scan_it_upper_bound, &m_scan_it_lower_bound_slice, &m_scan_it_upper_bound_slice)) { skip_bloom = false; @@ -9740,7 +9862,8 @@ void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd, m_scan_it_snapshot = rdb->GetSnapshot(); auto read_opts = rocksdb::ReadOptions(); - read_opts.total_order_seek = true; // TODO: set based on WHERE conditions + // TODO(mung): set based on WHERE conditions + read_opts.total_order_seek = true; read_opts.snapshot = m_scan_it_snapshot; m_scan_it = rdb->NewIterator(read_opts, kd.get_cf()); } else { @@ -9765,12 +9888,12 @@ void ha_rocksdb::release_scan_iterator() { void ha_rocksdb::setup_iterator_for_rnd_scan() { uint key_size; - int key_start_matching_bytes = m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size); + int key_start_matching_bytes = + m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size); rocksdb::Slice table_key((const char *)m_pk_packed_tuple, key_size); - setup_scan_iterator(*m_pk_descr, &table_key, false, - key_start_matching_bytes); + setup_scan_iterator(*m_pk_descr, &table_key, false, key_start_matching_bytes); m_scan_it->Seek(table_key); m_skip_scan_it_next_call = true; } @@ -9783,8 +9906,16 @@ void ha_rocksdb::setup_iterator_for_rnd_scan() { int ha_rocksdb::rnd_init(bool scan) { DBUG_ENTER_FUNC(); + THD *thd = ha_thd(); + if (thd && thd->killed) { + DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED); + } + Rdb_transaction *const tx = get_or_create_tx(table->in_use); - setup_read_decoders(); + + // when this table is being updated, decode all fields + m_converter->setup_field_decoders(table->read_set, + m_lock_rows == RDB_LOCK_WRITE); if (scan) { m_rnd_scan_is_new_snapshot = !tx->has_snapshot(); @@ -9813,9 +9944,9 @@ int ha_rocksdb::rnd_next(uchar *const buf) { ha_statistic_increment(&SSV::ha_read_rnd_next_count); for (;;) { rc = rnd_next_with_direction(buf, true); - if (rc != HA_ERR_ROCKSDB_STATUS_BUSY || !m_rnd_scan_is_new_snapshot) + if (!should_recreate_snapshot(rc, m_rnd_scan_is_new_snapshot)) { break; /* exit the loop */ - + } // release the snapshot and iterator and then regenerate them Rdb_transaction *tx = get_or_create_tx(table->in_use); tx->release_snapshot(); @@ -9825,8 +9956,7 @@ int ha_rocksdb::rnd_next(uchar *const buf) { m_rnd_scan_is_new_snapshot = false; - if (rc == HA_ERR_KEY_NOT_FOUND) - rc = HA_ERR_END_OF_FILE; + if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE; DBUG_RETURN(rc); } @@ -9838,6 +9968,7 @@ int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) { DBUG_ENTER_FUNC(); int rc; + THD *thd = ha_thd(); table->status = STATUS_NOT_FOUND; stats.rows_requested++; @@ -9855,13 +9986,20 @@ int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) { } for (;;) { + DEBUG_SYNC(thd, "rocksdb.check_flags_rnwd"); + if (thd && thd->killed) { + rc = HA_ERR_QUERY_INTERRUPTED; + break; + } + if (m_skip_scan_it_next_call) { m_skip_scan_it_next_call = false; } else { - if (move_forward) + if (move_forward) { m_scan_it->Next(); /* this call cannot fail */ - else + } else { m_scan_it->Prev(); /* this call cannot fail */ + } } if (!is_valid(m_scan_it)) { @@ -9895,11 +10033,7 @@ int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) { const rocksdb::Status s = get_for_update(tx, m_pk_descr->get_cf(), key, &m_retrieved_record); if (s.IsNotFound() && - 
my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED) { - // This occurs if we accessed a row, tried to lock it, failed, - // released and reacquired the snapshot (because of READ COMMITTED - // mode) and the row was deleted by someone else in the meantime. - // If so, we just want to move on to the next row. + should_skip_invalidated_record(HA_ERR_KEY_NOT_FOUND)) { continue; } @@ -9957,10 +10091,17 @@ int ha_rocksdb::rnd_end() { int ha_rocksdb::index_init(uint idx, bool sorted) { DBUG_ENTER_FUNC(); + THD *thd = ha_thd(); + if (thd && thd->killed) { + DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED); + } + Rdb_transaction *const tx = get_or_create_tx(table->in_use); DBUG_ASSERT(tx != nullptr); - setup_read_decoders(); + // when this table is being updated, decode all fields + m_converter->setup_field_decoders(table->read_set, + m_lock_rows == RDB_LOCK_WRITE); if (!m_keyread_only) { m_key_descr_arr[idx]->get_lookup_bitmap(table, &m_lookup_bitmap); @@ -10003,10 +10144,20 @@ int ha_rocksdb::truncate() { DBUG_ASSERT(m_tbl_def != nullptr); - m_tbl_def->m_auto_incr_val = 1; - m_tbl_def->m_hidden_pk_val = 1; + // Save table name to use later + std::string table_name = m_tbl_def->full_tablename(); + + // Delete current table + int err = delete_table(m_tbl_def); + if (err != HA_EXIT_SUCCESS) { + DBUG_RETURN(err); + } - DBUG_RETURN(remove_rows(m_tbl_def)); + // Reset auto_increment_value to 1 if auto-increment feature is enabled + // By default, the starting valid value for auto_increment_value is 1 + DBUG_RETURN(create_table( + table_name, table, + table->found_next_number_field ? 1 : 0 /* auto_increment_value */)); } /* @@ -10083,9 +10234,11 @@ rocksdb::Status ha_rocksdb::delete_or_singledelete( uint index, Rdb_transaction *const tx, rocksdb::ColumnFamilyHandle *const column_family, const rocksdb::Slice &key) { - if (can_use_single_delete(index)) - return tx->single_delete(column_family, key); - return tx->delete_key(column_family, key); + const bool assume_tracked = can_assume_tracked(ha_thd()); + if (can_use_single_delete(index)) { + return tx->single_delete(column_family, key, assume_tracked); + } + return tx->delete_key(column_family, key, assume_tracked); } void ha_rocksdb::update_stats(void) { @@ -10116,8 +10269,9 @@ void ha_rocksdb::update_stats(void) { int ha_rocksdb::info(uint flag) { DBUG_ENTER_FUNC(); - if (!table) + if (!table) { DBUG_RETURN(HA_EXIT_FAILURE); + } if (flag & HA_STATUS_VARIABLE) { /* @@ -10145,10 +10299,8 @@ int ha_rocksdb::info(uint flag) { // if number of records is hardcoded, we do not want to force computation // of memtable cardinalities - if (stats.records == 0 || - (rocksdb_force_compute_memtable_stats && - rocksdb_debug_optimizer_n_rows == 0)) - { + if (stats.records == 0 || (rocksdb_force_compute_memtable_stats && + rocksdb_debug_optimizer_n_rows == 0)) { // First, compute SST files stats uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]; auto r = get_range(pk_index(table, m_tbl_def), buf); @@ -10159,8 +10311,8 @@ int ha_rocksdb::info(uint flag) { if (stats.records == 0) { rdb->GetApproximateSizes(m_pk_descr->get_cf(), &r, 1, &sz, include_flags); - stats.records+= sz/ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE; - stats.data_file_length+= sz; + stats.records += sz / ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE; + stats.data_file_length += sz; } // Second, compute memtable stats. 
This call is expensive, so cache @@ -10198,14 +10350,17 @@ int ha_rocksdb::info(uint flag) { stats.records += m_table_handler->m_mtcache_count; stats.data_file_length += m_table_handler->m_mtcache_size; } + } - if (rocksdb_debug_optimizer_n_rows > 0) - stats.records = rocksdb_debug_optimizer_n_rows; + if (rocksdb_debug_optimizer_n_rows > 0) { + stats.records = rocksdb_debug_optimizer_n_rows; } - if (stats.records != 0) + if (stats.records != 0) { stats.mean_rec_length = stats.data_file_length / stats.records; + } } + if (flag & HA_STATUS_CONST) { ref_length = m_pk_descr->max_storage_fmt_length(); @@ -10231,8 +10386,7 @@ int ha_rocksdb::info(uint flag) { } else { x = 0; } - if (x > stats.records) - x = stats.records; + if (x > stats.records) x = stats.records; if ((x == 0 && rocksdb_debug_optimizer_no_zero_cardinality) || rocksdb_debug_optimizer_n_rows > 0) { // Fake cardinality implementation. For example, (idx1, idx2, idx3) @@ -10253,7 +10407,7 @@ int ha_rocksdb::info(uint flag) { uniqueness violation. */ errkey = m_dupp_errkey; - dup_ref = m_pk_tuple; // TODO(?): this should store packed PK. + dup_ref = m_pk_tuple; // TODO(?): this should store packed PK. } if (flag & HA_STATUS_AUTO) { @@ -10267,8 +10421,9 @@ void ha_rocksdb::position(const uchar *const record) { DBUG_ENTER_FUNC(); longlong hidden_pk_id = 0; - if (has_hidden_pk(table) && read_hidden_pk_id_from_rowkey(&hidden_pk_id)) - DBUG_ASSERT(false); // should never reach here + if (has_hidden_pk(table) && read_hidden_pk_id_from_rowkey(&hidden_pk_id)) { + DBUG_ASSERT(false); // should never reach here + } /* Get packed primary key value from the record. @@ -10290,8 +10445,9 @@ void ha_rocksdb::position(const uchar *const record) { It could be that mem-comparable form of PK occupies less than ref_length bytes. Fill the remainder with zeros. 
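The padding invariant described here is small enough to state directly. A model of what position() does with the ref buffer, under the assumption (from the surrounding code) that ref always holds ref_length bytes:

#include <cstddef>
#include <cstring>

// When the mem-comparable PK is shorter than ref_length, the tail is
// zero-filled so that two refs for the same row compare equal
// byte-for-byte.
static void store_ref_model(unsigned char *ref, size_t ref_length,
                            const unsigned char *packed, size_t packed_size) {
  memcpy(ref, packed, packed_size);
  if (ref_length > packed_size)
    memset(ref + packed_size, 0, ref_length - packed_size);
}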
*/ - if (ref_length > packed_size) + if (ref_length > packed_size) { memset(ref + packed_size, 0, ref_length - packed_size); + } DBUG_VOID_RETURN; } @@ -10346,8 +10502,7 @@ void ha_rocksdb::calc_updated_indexes() { /* Walk over all key parts, including the "extended key" suffix */ const uint key_parts = kd.get_key_parts(); for (uint kp = 0; kp < key_parts; kp++) { - if (has_hidden_pk(table) && kp + 1 == key_parts) - break; + if (has_hidden_pk(table) && kp + 1 == key_parts) break; Field *const field = kd.get_table_field_for_part_no(table, kp); if (bitmap_is_set(table->write_set, field->field_index)) { @@ -10360,6 +10515,9 @@ void ha_rocksdb::calc_updated_indexes() { } /** + Update an existing row + @param[in] old_data nullptr for write, non-null for update + @param[in] new_data non-null for write/update @return HA_EXIT_SUCCESS OK other HA_ERR error code (can be SE-specific) @@ -10378,7 +10536,7 @@ int ha_rocksdb::update_row(const uchar *const old_data, uchar *const new_data) { DBUG_ASSERT(new_data == table->record[0]); ha_statistic_increment(&SSV::ha_update_count); - const int rv = update_write_row(old_data, new_data, false); + const int rv = update_write_row(old_data, new_data, skip_unique_check()); if (rv == 0) { stats.rows_updated++; @@ -10468,7 +10626,8 @@ THR_LOCK_DATA **ha_rocksdb::store_lock(THD *const thd, THR_LOCK_DATA **to, void ha_rocksdb::read_thd_vars(THD *const thd) { m_store_row_debug_checksums = THDVAR(thd, store_row_debug_checksums); - m_verify_row_debug_checksums = THDVAR(thd, verify_row_debug_checksums); + m_converter->set_verify_row_debug_checksums( + THDVAR(thd, verify_row_debug_checksums)); m_checksums_pct = THDVAR(thd, checksums_pct); } @@ -10500,7 +10659,8 @@ int ha_rocksdb::external_lock(THD *const thd, int lock_type) { tx->io_perf_end_and_record(&m_io_perf); tx->m_n_mysql_tables_in_use--; if (tx->m_n_mysql_tables_in_use == 0 && - !my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + !my_core::thd_test_options(thd, + OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { /* Do like InnoDB: when we get here, it's time to commit a single-statement transaction. @@ -10630,9 +10790,8 @@ rocksdb::Range get_range(const Rdb_key_def &kd, } } -rocksdb::Range -ha_rocksdb::get_range(const int &i, - uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) const { +rocksdb::Range ha_rocksdb::get_range( + const int i, uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) const { return myrocks::get_range(*m_key_descr_arr[i], buf); } @@ -10643,11 +10802,10 @@ ha_rocksdb::get_range(const int &i, but in drop_index_thread's case, it means index is marked as removed, so no further seek will happen for the index id. 
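The emptiness probe described here reduces to: seek to the 4-byte big-endian index id and test whether the first visible key still carries that prefix. A standalone model, with std::map standing in for the column family iterator (byte-order nuances of char comparison aside):

#include <cstdint>
#include <map>
#include <string>

static bool index_is_empty_model(const std::map<std::string, std::string> &cf,
                                 uint32_t index_id) {
  std::string prefix;
  for (int shift = 24; shift >= 0; shift -= 8)
    prefix.push_back(static_cast<char>((index_id >> shift) & 0xff));
  auto it = cf.lower_bound(prefix);  // ~ iterator->Seek(key_buf)
  // Removed if nothing is at or past the prefix, or the first key
  // found no longer belongs to this index id.
  return it == cf.end() || it->first.compare(0, 4, prefix) != 0;
}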
*/ -static bool is_myrocks_index_empty( - rocksdb::ColumnFamilyHandle *cfh, const bool is_reverse_cf, - const rocksdb::ReadOptions &read_opts, - const uint index_id) -{ +static bool is_myrocks_index_empty(rocksdb::ColumnFamilyHandle *cfh, + const bool is_reverse_cf, + const rocksdb::ReadOptions &read_opts, + const uint index_id) { bool index_removed = false; uchar key_buf[Rdb_key_def::INDEX_NUMBER_SIZE] = {0}; rdb_netbuf_store_uint32(key_buf, index_id); @@ -10658,8 +10816,7 @@ static bool is_myrocks_index_empty( if (!it->Valid()) { index_removed = true; } else { - if (memcmp(it->key().data(), key_buf, - Rdb_key_def::INDEX_NUMBER_SIZE)) { + if (memcmp(it->key().data(), key_buf, Rdb_key_def::INDEX_NUMBER_SIZE)) { // Key does not have same prefix index_removed = true; } @@ -10687,8 +10844,8 @@ void Rdb_drop_index_thread::run() { timespec ts; clock_gettime(CLOCK_REALTIME, &ts); ts.tv_sec += dict_manager.is_drop_index_empty() - ? 24 * 60 * 60 // no filtering - : 60; // filtering + ? 24 * 60 * 60 // no filtering + : 60; // filtering const auto ret MY_ATTRIBUTE((__unused__)) = mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts); @@ -10704,26 +10861,23 @@ void Rdb_drop_index_thread::run() { if (!indices.empty()) { std::unordered_set finished; rocksdb::ReadOptions read_opts; - read_opts.total_order_seek = true; // disable bloom filter + read_opts.total_order_seek = true; // disable bloom filter for (const auto d : indices) { uint32 cf_flags = 0; if (!dict_manager.get_cf_flags(d.cf_id, &cf_flags)) { - sql_print_error("RocksDB: Failed to get column family flags " - "from cf id %u. MyRocks data dictionary may " - "get corrupted.", - d.cf_id); + // NO_LINT_DEBUG + sql_print_error( + "RocksDB: Failed to get column family flags " - "from cf id %u. MyRocks data dictionary may " + "get corrupted.", + d.cf_id); abort(); } rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(d.cf_id); DBUG_ASSERT(cfh); const bool is_reverse_cf = cf_flags & Rdb_key_def::REVERSE_CF_FLAG; - if (is_myrocks_index_empty(cfh, is_reverse_cf, read_opts, d.index_id)) - { - finished.insert(d); - continue; - } uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]; rocksdb::Range range = get_range(d.index_id, buf, is_reverse_cf ? 1 : 0, is_reverse_cf ? 0 : 1); @@ -10743,8 +10897,7 @@ void Rdb_drop_index_thread::run() { } rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD); } - if (is_myrocks_index_empty(cfh, is_reverse_cf, read_opts, d.index_id)) - { + if (is_myrocks_index_empty(cfh, is_reverse_cf, read_opts, d.index_id)) { finished.insert(d); } } @@ -10771,28 +10924,24 @@ Rdb_tbl_def *ha_rocksdb::get_table_if_exists(const char *const tablename) { } /* - Note: the following function is called when the table is not open. That is, - this->table==nullptr, pk_key_descr==nullptr, etc. + Overloaded delete_table(): it deletes the table metadata from the data + dictionary immediately and deletes the real data in a background thread (async) - tablename points to line in form "./dbname/tablename".
+ @param tbl IN MyRocks table definition @return HA_EXIT_SUCCESS OK other HA_ERR error code (can be SE-specific) */ - -int ha_rocksdb::delete_table(const char *const tablename) { +int ha_rocksdb::delete_table(Rdb_tbl_def *const tbl) { DBUG_ENTER_FUNC(); - DBUG_ASSERT(tablename != nullptr); + DBUG_ASSERT(tbl != nullptr); + DBUG_ASSERT(m_tbl_def == nullptr || m_tbl_def == tbl); const std::unique_ptr wb = dict_manager.begin(); rocksdb::WriteBatch *const batch = wb.get(); - /* Find the table in the hash */ - Rdb_tbl_def *const tbl = get_table_if_exists(tablename); - if (!tbl) - DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); dict_manager.add_drop_table(tbl->m_key_descr_arr, tbl->m_key_count, batch); /* @@ -10807,10 +10956,36 @@ int ha_rocksdb::delete_table(const char *const tablename) { } rdb_drop_idx_thread.signal(); - + // avoid dangling pointer + m_tbl_def = nullptr; DBUG_RETURN(HA_EXIT_SUCCESS); } +/* + Note: the following function is called when the table is not open. That is, + this->table==nullptr, pk_key_descr==nullptr, etc. + + tablename points to line in form "./dbname/tablename". + + @return + HA_EXIT_SUCCESS OK + other HA_ERR error code (can be SE-specific) +*/ + +int ha_rocksdb::delete_table(const char *const tablename) { + DBUG_ENTER_FUNC(); + + DBUG_ASSERT(tablename != nullptr); + + /* Find the table in the hash */ + Rdb_tbl_def *const tbl = get_table_if_exists(tablename); + if (!tbl) { + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); + } + + DBUG_RETURN(delete_table(tbl)); +} + int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) { const rocksdb::WriteOptions wo = rdb_get_rocksdb_write_options(handler::ha_thd()); @@ -10961,21 +11136,33 @@ int ha_rocksdb::extra(enum ha_extra_function operation) { DBUG_ENTER_FUNC(); switch (operation) { - case HA_EXTRA_KEYREAD: - m_keyread_only = true; - break; - case HA_EXTRA_NO_KEYREAD: - m_keyread_only = false; - break; - case HA_EXTRA_FLUSH: - /* - If the table has blobs, then they are part of m_retrieved_record. - This call invalidates them. - */ - m_retrieved_record.Reset(); - break; - default: - break; + case HA_EXTRA_KEYREAD: + m_keyread_only = true; + break; + case HA_EXTRA_NO_KEYREAD: + m_keyread_only = false; + break; + case HA_EXTRA_FLUSH: + /* + If the table has blobs, then they are part of m_retrieved_record. + This call invalidates them. 
+ */ + m_retrieved_record.Reset(); + break; + case HA_EXTRA_INSERT_WITH_UPDATE: + // INSERT ON DUPLICATE KEY UPDATE + if (rocksdb_enable_insert_with_update_caching) { + m_insert_with_update = true; + } + break; + case HA_EXTRA_NO_IGNORE_DUP_KEY: + // PAIRED with HA_EXTRA_INSERT_WITH_UPDATE or HA_EXTRA_WRITE_CAN_REPLACE + // that indicates the end of REPLACE / INSERT ON DUPLICATE KEY + m_insert_with_update = false; + break; + + default: + break; } DBUG_RETURN(HA_EXIT_SUCCESS); @@ -11046,8 +11233,7 @@ ha_rows ha_rocksdb::records_in_range(uint inx, key_range *const min_key, uint64_t sz = 0; auto disk_size = kd.m_stats.m_actual_disk_size; - if (disk_size == 0) - disk_size = kd.m_stats.m_data_size; + if (disk_size == 0) disk_size = kd.m_stats.m_data_size; auto rows = kd.m_stats.m_rows; if (rows == 0 || disk_size == 0) { rows = 1; @@ -11160,7 +11346,7 @@ static int calculate_stats( // get RocksDB table properties for these ranges rocksdb::TablePropertiesCollection props; - for (auto it : ranges) { + for (const auto &it : ranges) { const auto old_size MY_ATTRIBUTE((__unused__)) = props.size(); const auto status = rdb->GetPropertiesOfTablesInRange( it.first, &it.second[0], it.second.size(), &props); @@ -11350,73 +11536,77 @@ void ha_rocksdb::get_auto_increment(ulonglong off, ulonglong inc, // The next value can be more complicated if either 'inc' or 'off' is not 1 ulonglong last_val = auto_incr; - // Loop until we can correctly update the atomic value - do { - DBUG_ASSERT(last_val > 0); - // Calculate the next value in the auto increment series: offset - // + N * increment where N is 0, 1, 2, ... - // - // For further information please visit: - // http://dev.mysql.com/doc/refman/5.7/en/replication-options-master.html - // - // The following is confusing so here is an explanation: - // To get the next number in the sequence above you subtract out the - // offset, calculate the next sequence (N * increment) and then add the - // offset back in. - // - // The additions are rearranged to avoid overflow. The following is - // equivalent to (last_val - 1 + inc - off) / inc. This uses the fact - // that (a+b)/c = a/c + b/c + (a%c + b%c)/c. To show why: - // - // (a+b)/c - // = (a - a%c + a%c + b - b%c + b%c) / c - // = (a - a%c) / c + (b - b%c) / c + (a%c + b%c) / c - // = a/c + b/c + (a%c + b%c) / c - // - // Now, substitute a = last_val - 1, b = inc - off, c = inc to get the - // following statement. - ulonglong n = - (last_val - 1) / inc + ((last_val - 1) % inc + inc - off) / inc; - - // Check if n * inc + off will overflow. This can only happen if we have - // an UNSIGNED BIGINT field. - if (n > (std::numeric_limits::max() - off) / inc) { - DBUG_ASSERT(max_val == std::numeric_limits::max()); - // The 'last_val' value is already equal to or larger than the largest - // value in the sequence. Continuing would wrap around (technically - // the behavior would be undefined). What should we do? - // We could: - // 1) set the new value to the last possible number in our sequence - // as described above. The problem with this is that this - // number could be smaller than a value in an existing row. - // 2) set the new value to the largest possible number. This number - // may not be in our sequence, but it is guaranteed to be equal - // to or larger than any other value already inserted. 
+ if (last_val > max_val) { + new_val = std::numeric_limits::max(); + } else { + // Loop until we can correctly update the atomic value + do { + DBUG_ASSERT(last_val > 0); + // Calculate the next value in the auto increment series: offset + // + N * increment where N is 0, 1, 2, ... // - // For now I'm going to take option 2. + // For further information please visit: + // http://dev.mysql.com/doc/refman/5.7/en/replication-options-master.html // - // Returning ULLONG_MAX from get_auto_increment will cause the SQL - // layer to fail with ER_AUTOINC_READ_FAILED. This means that due to - // the SE API for get_auto_increment, inserts will fail with - // ER_AUTOINC_READ_FAILED if the column is UNSIGNED BIGINT, but - // inserts will fail with ER_DUP_ENTRY for other types (or no failure - // if the column is in a non-unique SK). - new_val = std::numeric_limits::max(); - auto_incr = new_val; // Store the largest value into auto_incr - break; - } + // The following is confusing so here is an explanation: + // To get the next number in the sequence above you subtract out the + // offset, calculate the next sequence (N * increment) and then add the + // offset back in. + // + // The additions are rearranged to avoid overflow. The following is + // equivalent to (last_val - 1 + inc - off) / inc. This uses the fact + // that (a+b)/c = a/c + b/c + (a%c + b%c)/c. To show why: + // + // (a+b)/c + // = (a - a%c + a%c + b - b%c + b%c) / c + // = (a - a%c) / c + (b - b%c) / c + (a%c + b%c) / c + // = a/c + b/c + (a%c + b%c) / c + // + // Now, substitute a = last_val - 1, b = inc - off, c = inc to get the + // following statement. + ulonglong n = + (last_val - 1) / inc + ((last_val - 1) % inc + inc - off) / inc; + + // Check if n * inc + off will overflow. This can only happen if we have + // an UNSIGNED BIGINT field. + if (n > (std::numeric_limits::max() - off) / inc) { + DBUG_ASSERT(max_val == std::numeric_limits::max()); + // The 'last_val' value is already equal to or larger than the largest + // value in the sequence. Continuing would wrap around (technically + // the behavior would be undefined). What should we do? + // We could: + // 1) set the new value to the last possible number in our sequence + // as described above. The problem with this is that this + // number could be smaller than a value in an existing row. + // 2) set the new value to the largest possible number. This number + // may not be in our sequence, but it is guaranteed to be equal + // to or larger than any other value already inserted. + // + // For now I'm going to take option 2. + // + // Returning ULLONG_MAX from get_auto_increment will cause the SQL + // layer to fail with ER_AUTOINC_READ_FAILED. This means that due to + // the SE API for get_auto_increment, inserts will fail with + // ER_AUTOINC_READ_FAILED if the column is UNSIGNED BIGINT, but + // inserts will fail with ER_DUP_ENTRY for other types (or no failure + // if the column is in a non-unique SK). + new_val = std::numeric_limits::max(); + auto_incr = new_val; // Store the largest value into auto_incr + break; + } - new_val = n * inc + off; + new_val = n * inc + off; - // Attempt to store the new value (plus 1 since m_auto_incr_val contains - // the next available value) into the atomic value. If the current - // value no longer matches what we have in 'last_val' this will fail and - // we will repeat the loop (`last_val` will automatically get updated - // with the current value). - // - // See above explanation for inc == 1 for why we use std::min. 
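The decomposition in the comment above is easy to sanity-check with concrete numbers. A tiny self-contained check follows; the values are arbitrary, and off <= inc is assumed (the server normalizes the offset against the increment):

#include <cassert>

int main() {
  // Series off + k*inc = 3, 8, 13, ... with last_val = 11: the next
  // member of the series that is >= last_val should be 13.
  unsigned long long last_val = 11, inc = 5, off = 3;
  unsigned long long n =
      (last_val - 1) / inc + ((last_val - 1) % inc + inc - off) / inc;
  unsigned long long next = n * inc + off;
  assert(n == 2 && next == 13);
  assert(next >= last_val && (next - off) % inc == 0);
  return 0;
}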
- } while (!auto_incr.compare_exchange_weak(last_val, - std::min(new_val + 1, max_val))); + // Attempt to store the new value (plus 1 since m_auto_incr_val contains + // the next available value) into the atomic value. If the current + // value no longer matches what we have in 'last_val' this will fail and + // we will repeat the loop (`last_val` will automatically get updated + // with the current value). + // + // See above explanation for inc == 1 for why we use std::min. + } while (!auto_incr.compare_exchange_weak( + last_val, std::min(new_val + 1, max_val))); + } } *first_value = new_val; @@ -11432,13 +11622,13 @@ const char *dbug_print_item(Item *const item) { char *const buf = dbug_item_print_buf; String str(buf, sizeof(dbug_item_print_buf), &my_charset_bin); str.length(0); - if (!item) - return "(Item*)nullptr"; + if (!item) return "(Item*)nullptr"; item->print(&str, QT_ORDINARY); - if (str.c_ptr() == buf) + if (str.c_ptr() == buf) { return buf; - else + } else { return "Couldn't fit into buffer"; + } } #endif /*DBUG_OFF*/ @@ -11596,7 +11786,6 @@ bool ha_rocksdb::prepare_inplace_alter_table( my_core::Alter_inplace_info::DROP_UNIQUE_INDEX | my_core::Alter_inplace_info::ADD_INDEX | my_core::Alter_inplace_info::ADD_UNIQUE_INDEX)) { - if (has_hidden_pk(altered_table)) { new_n_keys += 1; } @@ -11883,7 +12072,7 @@ int ha_rocksdb::inplace_populate_sk( const int new_packed_size = index->pack_record( new_table_arg, m_pack_buffer, table->record[0], m_sk_packed_tuple, &m_sk_tails, should_store_row_debug_checksums(), hidden_pk_id, 0, - nullptr, nullptr, m_ttl_bytes); + nullptr, m_ttl_bytes); const rocksdb::Slice key = rocksdb::Slice( reinterpret_cast(m_sk_packed_tuple), new_packed_size); @@ -11931,9 +12120,9 @@ int ha_rocksdb::inplace_populate_sk( is used inside print_keydup_error so that the error message shows the duplicate record. 
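Since the index merge emits keys in memcmp order, duplicate detection during this bulk build only ever needs to compare a key against its immediate predecessor; the offending key is then unpacked into record[0] purely so the error message can show the conflicting row. A minimal model of the detection half (names hypothetical):

#include <optional>
#include <string>
#include <vector>

// Keys arrive sorted, so any duplicate is adjacent; return the first one
// so the caller can decode and report it, as unpack_record() enables for
// print_keydup_error in the hunk above.
static std::optional<std::string> first_duplicate(
    const std::vector<std::string> &sorted_keys) {
  for (size_t i = 1; i < sorted_keys.size(); ++i)
    if (sorted_keys[i] == sorted_keys[i - 1]) return sorted_keys[i];
  return std::nullopt;
}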
*/ - if (index->unpack_record(new_table_arg, new_table_arg->record[0], - &merge_key, nullptr, - m_verify_row_debug_checksums)) { + if (index->unpack_record( + new_table_arg, new_table_arg->record[0], &merge_key, + &merge_val, m_converter->get_verify_row_debug_checksums())) { /* Should never reach here */ DBUG_ASSERT(0); } @@ -11963,7 +12152,9 @@ int ha_rocksdb::inplace_populate_sk( DBUG_RETURN(res); } - if ((res = tx->finish_bulk_load())) { + bool is_critical_error; + res = tx->finish_bulk_load(&is_critical_error); + if (res && is_critical_error) { // NO_LINT_DEBUG sql_print_error("Error finishing bulk load."); DBUG_RETURN(res); @@ -12173,22 +12364,22 @@ bool ha_rocksdb::commit_inplace_alter_table( #define SHOW_FNAME(name) rocksdb_show_##name -#define DEF_SHOW_FUNC(name, key) \ - static int SHOW_FNAME(name)(MYSQL_THD thd, SHOW_VAR * var, char *buff) { \ - rocksdb_status_counters.name = \ - rocksdb_stats->getTickerCount(rocksdb::key); \ - var->type = SHOW_LONGLONG; \ - var->value = (char *)&rocksdb_status_counters.name; \ - return HA_EXIT_SUCCESS; \ +#define DEF_SHOW_FUNC(name, key) \ + static int SHOW_FNAME(name)(MYSQL_THD thd, SHOW_VAR * var, char *buff) { \ + rocksdb_status_counters.name = \ + rocksdb_stats->getTickerCount(rocksdb::key); \ + var->type = SHOW_LONGLONG; \ + var->value = reinterpret_cast(&rocksdb_status_counters.name); \ + return HA_EXIT_SUCCESS; \ } -#define DEF_STATUS_VAR(name) \ +#define DEF_STATUS_VAR(name) \ { "rocksdb_" #name, (char *)&SHOW_FNAME(name), SHOW_FUNC } -#define DEF_STATUS_VAR_PTR(name, ptr, option) \ +#define DEF_STATUS_VAR_PTR(name, ptr, option) \ { "rocksdb_" name, (char *)ptr, option } -#define DEF_STATUS_VAR_FUNC(name, ptr, option) \ +#define DEF_STATUS_VAR_FUNC(name, ptr, option) \ { name, reinterpret_cast(ptr), option } struct rocksdb_status_counters_t { @@ -12418,9 +12609,8 @@ static void show_myrocks_vars(THD *thd, SHOW_VAR *var, char *buff) { var->value = reinterpret_cast(&myrocks_status_variables); } -static ulonglong -io_stall_prop_value(const std::map &props, - const std::string &key) { +static ulonglong io_stall_prop_value( + const std::map &props, const std::string &key) { std::map::const_iterator iter = props.find("io_stalls." + key); if (iter != props.end()) { @@ -12598,6 +12788,10 @@ static SHOW_VAR rocksdb_status_vars[] = { SHOW_LONGLONG), DEF_STATUS_VAR_PTR("number_sst_entry_other", &rocksdb_num_sst_entry_other, SHOW_LONGLONG), +#ifndef DBUG_OFF + DEF_STATUS_VAR_PTR("num_get_for_update_calls", + &rocksdb_num_get_for_update_calls, SHOW_LONGLONG), +#endif // the variables generated by SHOW_FUNC are sorted only by prefix (first // arg in the tuple below), so make sure it is unique to make sorting // deterministic as quick sort is not stable @@ -12844,6 +13038,49 @@ bool Rdb_manual_compaction_thread::is_manual_compaction_finished(int mc_id) { return finished; } +/** + * Locking read + Not Found + Read Committed occurs if we accessed + * a row by Seek, tried to lock it, failed, released and reacquired the + * snapshot (because of READ COMMITTED mode) and the row was deleted by + * someone else in the meantime. + * If so, we either just skip the row, or re-create the snapshot + * and seek again. In both cases, the Read Committed constraint is not broken.
*/ +bool ha_rocksdb::should_skip_invalidated_record(const int rc) { + if ((m_lock_rows != RDB_LOCK_NONE && rc == HA_ERR_KEY_NOT_FOUND && + my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED)) { + return true; + } + return false; +} +/** + * Indicates that the snapshot needs to be re-created and the seek retried, + * instead of returning an error or an empty set. This is normally applicable + * when hitting kBusy when locking the first row of the transaction, + * with the Repeatable Read isolation level. + */ +bool ha_rocksdb::should_recreate_snapshot(const int rc, + const bool is_new_snapshot) { + if (should_skip_invalidated_record(rc) || + (rc == HA_ERR_ROCKSDB_STATUS_BUSY && is_new_snapshot)) { + return true; + } + return false; +} + +/** + * If calling put/delete/singledelete without locking the row, + * it is necessary to pass assume_tracked=false to the RocksDB TX API. + * Read Free Replication and Blind Deletes are the cases that + * use the TX API while skipping row locking. + */ +bool ha_rocksdb::can_assume_tracked(THD *thd) { + if (use_read_free_rpl() || (THDVAR(thd, blind_delete_primary_key))) { + return false; + } + return true; +} + bool ha_rocksdb::check_bloom_and_set_bounds( THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, const bool use_all_keys, size_t bound_len, uchar *const lower_bound, @@ -12904,20 +13141,22 @@ bool ha_rocksdb::can_use_bloom_filter(THD *thd, const Rdb_key_def &kd, shorter require all parts of the key to be available for the short key match. */ - if ((use_all_keys && prefix_extractor->InRange(eq_cond)) - || prefix_extractor->SameResultWhenAppended(eq_cond)) + if ((use_all_keys && prefix_extractor->InRange(eq_cond)) || + prefix_extractor->SameResultWhenAppended(eq_cond)) { can_use = true; - else + } else { can_use = false; + } } else { /* if prefix extractor is not defined, all key parts have to be used by eq_cond.
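To make the two branches of that comment concrete, consider a capped (fixed-length) prefix extractor: the bloom filter is only consulted safely when the equality condition pins down the whole extracted prefix. A toy model of the decision; the real code delegates to the extractor's own InRange()/SameResultWhenAppended():

#include <cstddef>

// Capped-prefix model: the extractor takes the first `cap` bytes of a key.
static bool can_use_prefix_bloom_model(size_t cap, size_t eq_cond_len,
                                       bool use_all_keys) {
  // Appending bytes cannot change the extracted prefix once the condition
  // already spans the full cap (~ SameResultWhenAppended()).
  if (eq_cond_len >= cap) return true;
  // A complete key shorter than the cap extracts to itself, so the filter
  // can still be probed with it (~ InRange() under use_all_keys).
  return use_all_keys;
}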
*/ - if (use_all_keys) + if (use_all_keys) { can_use = true; - else + } else { can_use = false; + } } return can_use; @@ -12936,7 +13175,7 @@ bool rdb_is_ttl_enabled() { return rocksdb_enable_ttl; } bool rdb_is_ttl_read_filtering_enabled() { return rocksdb_enable_ttl_read_filtering; } -#ifndef NDEBUG +#ifndef DBUG_OFF int rdb_dbug_set_ttl_rec_ts() { return rocksdb_debug_ttl_rec_ts; } int rdb_dbug_set_ttl_snapshot_ts() { return rocksdb_debug_ttl_snapshot_ts; } int rdb_dbug_set_ttl_read_filter_ts() { @@ -12983,17 +13222,17 @@ const char *get_rdb_io_error_string(const RDB_IO_ERROR_TYPE err_type) { static_assert(RDB_IO_ERROR_LAST == 4, "Please handle all the error types."); switch (err_type) { - case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_TX_COMMIT: - return "RDB_IO_ERROR_TX_COMMIT"; - case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_DICT_COMMIT: - return "RDB_IO_ERROR_DICT_COMMIT"; - case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_BG_THREAD: - return "RDB_IO_ERROR_BG_THREAD"; - case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_GENERAL: - return "RDB_IO_ERROR_GENERAL"; - default: - DBUG_ASSERT(false); - return "(unknown)"; + case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_TX_COMMIT: + return "RDB_IO_ERROR_TX_COMMIT"; + case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_DICT_COMMIT: + return "RDB_IO_ERROR_DICT_COMMIT"; + case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_BG_THREAD: + return "RDB_IO_ERROR_BG_THREAD"; + case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_GENERAL: + return "RDB_IO_ERROR_GENERAL"; + default: + DBUG_ASSERT(false); + return "(unknown)"; } } @@ -13003,32 +13242,37 @@ const char *get_rdb_io_error_string(const RDB_IO_ERROR_TYPE err_type) { void rdb_handle_io_error(const rocksdb::Status status, const RDB_IO_ERROR_TYPE err_type) { if (status.IsIOError()) { - switch (err_type) { - case RDB_IO_ERROR_TX_COMMIT: - case RDB_IO_ERROR_DICT_COMMIT: { - rdb_log_status_error(status, "failed to write to WAL"); - /* NO_LINT_DEBUG */ - sql_print_error("MyRocks: aborting on WAL write error."); - abort(); - break; - } - case RDB_IO_ERROR_BG_THREAD: { - rdb_log_status_error(status, "BG thread failed to write to RocksDB"); - /* NO_LINT_DEBUG */ - sql_print_error("MyRocks: aborting on BG write error."); - abort(); - break; - } - case RDB_IO_ERROR_GENERAL: { - rdb_log_status_error(status, "failed on I/O"); - /* NO_LINT_DEBUG */ - sql_print_error("MyRocks: aborting on I/O error."); - abort(); - break; + /* skip dumping core if write failed and we are allowed to do so */ + if (skip_core_dump_on_error) { + opt_core_file = false; } - default: - DBUG_ASSERT(0); - break; + + switch (err_type) { + case RDB_IO_ERROR_TX_COMMIT: + case RDB_IO_ERROR_DICT_COMMIT: { + rdb_log_status_error(status, "failed to write to WAL"); + /* NO_LINT_DEBUG */ + sql_print_error("MyRocks: aborting on WAL write error."); + abort(); + break; + } + case RDB_IO_ERROR_BG_THREAD: { + rdb_log_status_error(status, "BG thread failed to write to RocksDB"); + /* NO_LINT_DEBUG */ + sql_print_error("MyRocks: aborting on BG write error."); + abort(); + break; + } + case RDB_IO_ERROR_GENERAL: { + rdb_log_status_error(status, "failed on I/O"); + /* NO_LINT_DEBUG */ + sql_print_error("MyRocks: aborting on I/O error."); + abort(); + break; + } + default: + DBUG_ASSERT(0); + break; } } else if (status.IsCorruption()) { rdb_log_status_error(status, "data corruption detected!"); @@ -13038,16 +13282,16 @@ void rdb_handle_io_error(const rocksdb::Status status, abort(); } else if (!status.ok()) { switch (err_type) { - case RDB_IO_ERROR_DICT_COMMIT: { - rdb_log_status_error(status, "Failed to write to WAL (dictionary)"); - /* NO_LINT_DEBUG 
*/ - sql_print_error("MyRocks: aborting on WAL write error."); - abort(); - break; - } - default: - rdb_log_status_error(status, "Failed to read/write in RocksDB"); - break; + case RDB_IO_ERROR_DICT_COMMIT: { + rdb_log_status_error(status, "Failed to write to WAL (dictionary)"); + /* NO_LINT_DEBUG */ + sql_print_error("MyRocks: aborting on WAL write error."); + abort(); + break; + } + default: + rdb_log_status_error(status, "Failed to read/write in RocksDB"); + break; } } } @@ -13154,9 +13398,10 @@ void rocksdb_set_delayed_write_rate(THD *thd, struct st_mysql_sys_var *var, if (!s.ok()) { /* NO_LINT_DEBUG */ - sql_print_warning("MyRocks: failed to update delayed_write_rate. " - "status code = %d, status = %s", - s.code(), s.ToString().c_str()); + sql_print_warning( + "MyRocks: failed to update delayed_write_rate. " + "status code = %d, status = %s", + s.code(), s.ToString().c_str()); } } RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); @@ -13211,8 +13456,7 @@ int mysql_value_to_bool(struct st_mysql_value *value, my_bool *return_value) { } else if (new_value_type == MYSQL_VALUE_TYPE_INT) { long long intbuf; value->val_int(value, &intbuf); - if (intbuf > 1) - return 1; + if (intbuf > 1) return 1; *return_value = intbuf > 0 ? TRUE : FALSE; } else { return 1; @@ -13231,12 +13475,14 @@ int rocksdb_check_bulk_load( Rdb_transaction *&tx = get_tx_from_thd(thd); if (tx != nullptr) { - const int rc = tx->finish_bulk_load(); - if (rc != 0) { + bool is_critical_error; + const int rc = tx->finish_bulk_load(&is_critical_error); + if (rc != 0 && is_critical_error) { // NO_LINT_DEBUG - sql_print_error("RocksDB: Error %d finalizing last SST file while " - "setting bulk loading variable", - rc); + sql_print_error( + "RocksDB: Error %d finalizing last SST file while " + "setting bulk loading variable", + rc); THDVAR(thd, bulk_load) = 0; return 1; } @@ -13284,9 +13530,10 @@ static void rocksdb_set_max_background_jobs(THD *thd, if (!s.ok()) { /* NO_LINT_DEBUG */ - sql_print_warning("MyRocks: failed to update max_background_jobs. " - "Status code = %d, status = %s.", - s.code(), s.ToString().c_str()); + sql_print_warning( + "MyRocks: failed to update max_background_jobs. " + "Status code = %d, status = %s.", + s.code(), s.ToString().c_str()); } } @@ -13312,9 +13559,10 @@ static void rocksdb_set_bytes_per_sync( if (!s.ok()) { /* NO_LINT_DEBUG */ - sql_print_warning("MyRocks: failed to update max_background_jobs. " - "Status code = %d, status = %s.", - s.code(), s.ToString().c_str()); + sql_print_warning( + "MyRocks: failed to update max_background_jobs. " + "Status code = %d, status = %s.", + s.code(), s.ToString().c_str()); } } @@ -13340,9 +13588,10 @@ static void rocksdb_set_wal_bytes_per_sync( if (!s.ok()) { /* NO_LINT_DEBUG */ - sql_print_warning("MyRocks: failed to update max_background_jobs. " - "Status code = %d, status = %s.", - s.code(), s.ToString().c_str()); + sql_print_warning( + "MyRocks: failed to update max_background_jobs. 
" + "Status code = %d, status = %s.", + s.code(), s.ToString().c_str()); } } @@ -13369,7 +13618,7 @@ static int rocksdb_validate_set_block_cache_size( } if (new_value < RDB_MIN_BLOCK_CACHE_SIZE || - (uint64_t)new_value > (uint64_t)LONGLONG_MAX) { + (uint64_t)new_value > (uint64_t)LLONG_MAX) { return HA_EXIT_FAILURE; } @@ -13385,17 +13634,19 @@ static int rocksdb_validate_set_block_cache_size( return HA_EXIT_SUCCESS; } -static int -rocksdb_validate_update_cf_options(THD * /* unused */, - struct st_mysql_sys_var * /*unused*/, - void *save, struct st_mysql_value *value) { - +static int rocksdb_validate_update_cf_options( + THD * /* unused */, struct st_mysql_sys_var * /*unused*/, void *save, + struct st_mysql_value *value) { char buff[STRING_BUFFER_USUAL_SIZE]; const char *str; int length; length = sizeof(buff); str = value->val_str(value, buff, &length); - *(const char **)save = str; + // In some cases, str can point to buff in the stack. + // This can cause invalid memory access after validation is finished. + // To avoid this kind case, let's alway duplicate the str if str is not + // nullptr + *(const char **)save = (str == nullptr) ? nullptr : my_strdup(str, MYF(0)); if (str == nullptr) { return HA_EXIT_SUCCESS; @@ -13409,13 +13660,17 @@ rocksdb_validate_update_cf_options(THD * /* unused */, my_error(ER_WRONG_VALUE_FOR_VAR, MYF(0), "rocksdb_update_cf_options", str); return HA_EXIT_FAILURE; } + // Loop through option_map and create missing column families + for (Rdb_cf_options::Name_to_config_t::iterator it = option_map.begin(); + it != option_map.end(); ++it) { + cf_manager.get_or_create_cf(rdb, it->first); + } return HA_EXIT_SUCCESS; } -static void -rocksdb_set_update_cf_options(THD *const /* unused */, - struct st_mysql_sys_var *const /* unused */, - void *const var_ptr, const void *const save) { +static void rocksdb_set_update_cf_options( + THD *const /* unused */, struct st_mysql_sys_var *const /* unused */, + void *const var_ptr, const void *const save) { const char *const val = *static_cast(save); RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex); @@ -13431,7 +13686,7 @@ rocksdb_set_update_cf_options(THD *const /* unused */, // Reset the pointers regardless of how much success we had with updating // the CF options. This will results in consistent behavior and avoids // dealing with cases when only a subset of CF-s was successfully updated. - *reinterpret_cast(var_ptr) = my_strdup(val, MYF(0)); + *reinterpret_cast(var_ptr) = val; // Do the real work of applying the changes. Rdb_cf_options::Name_to_config_t option_map; @@ -13458,9 +13713,10 @@ rocksdb_set_update_cf_options(THD *const /* unused */, if (s != rocksdb::Status::OK()) { // NO_LINT_DEBUG - sql_print_warning("MyRocks: failed to convert the options for column " - "family '%s' to a map. %s", cf_name.c_str(), - s.ToString().c_str()); + sql_print_warning( + "MyRocks: failed to convert the options for column " + "family '%s' to a map. %s", + cf_name.c_str(), s.ToString().c_str()); } else { DBUG_ASSERT(rdb != nullptr); @@ -13469,14 +13725,16 @@ rocksdb_set_update_cf_options(THD *const /* unused */, if (s != rocksdb::Status::OK()) { // NO_LINT_DEBUG - sql_print_warning("MyRocks: failed to apply the options for column " - "family '%s'. %s", cf_name.c_str(), - s.ToString().c_str()); + sql_print_warning( + "MyRocks: failed to apply the options for column " + "family '%s'. 
%s", + cf_name.c_str(), s.ToString().c_str()); } else { // NO_LINT_DEBUG - sql_print_information("MyRocks: options for column family '%s' " - "have been successfully updated.", - cf_name.c_str()); + sql_print_information( + "MyRocks: options for column family '%s' " + "have been successfully updated.", + cf_name.c_str()); // Make sure that data is internally consistent as well and update // the CF options. This is necessary also to make sure that the CF @@ -13537,18 +13795,33 @@ void ha_rocksdb::rpl_after_update_rows() { DBUG_VOID_RETURN; } +bool ha_rocksdb::is_read_free_rpl_table() const { + return table->s && m_tbl_def->m_is_read_free_rpl_table; +} + /** @brief - Read Free Replication can be used or not. Returning False means - Read Free Replication can be used. Read Free Replication can be used - on UPDATE or DELETE row events, and table must have user defined - primary key. + Read Free Replication can be used or not. Returning true means + Read Free Replication can be used. */ -bool ha_rocksdb::use_read_free_rpl() { +bool ha_rocksdb::use_read_free_rpl() const { DBUG_ENTER_FUNC(); - DBUG_RETURN((m_in_rpl_delete_rows || m_in_rpl_update_rows) && - !has_hidden_pk(table) && m_use_read_free_rpl); + if (!ha_thd()->rli_slave || table->triggers || !is_read_free_rpl_table()) { + DBUG_RETURN(false); + } + + switch (rocksdb_read_free_rpl) { + case read_free_rpl_type::OFF: + DBUG_RETURN(false); + case read_free_rpl_type::PK_ONLY: + DBUG_RETURN(!has_hidden_pk(table) && table->s->keys == 1); + case read_free_rpl_type::PK_SK: + DBUG_RETURN(!has_hidden_pk(table)); + } + + DBUG_ASSERT(false); + DBUG_RETURN(false); } double ha_rocksdb::read_time(uint index, uint ranges, ha_rows rows) { @@ -13575,7 +13848,7 @@ std::string rdb_corruption_marker_file_name() { return ret; } -} // namespace myrocks +} // namespace myrocks /* Register the storage engine plugin outside of myrocks namespace diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h index ca141657413..8686b8e3ed9 100644 --- a/storage/rocksdb/ha_rocksdb.h +++ b/storage/rocksdb/ha_rocksdb.h @@ -43,7 +43,9 @@ #include "rocksdb/utilities/write_batch_with_index.h" /* MyRocks header files */ +#include "./rdb_buff.h" #include "./rdb_comparator.h" +#include "./rdb_global.h" #include "./rdb_index_merge.h" #include "./rdb_io_watchdog.h" #include "./rdb_perf_context.h" @@ -66,249 +68,19 @@ namespace myrocks { -/* - * class for exporting transaction information for - * information_schema.rocksdb_trx - */ -struct Rdb_trx_info { - std::string name; - ulonglong trx_id; - ulonglong write_count; - ulonglong lock_count; - int timeout_sec; - std::string state; - std::string waiting_key; - ulonglong waiting_cf_id; - int is_replication; - int skip_trx_api; - int read_only; - int deadlock_detect; - int num_ongoing_bulk_load; - ulong thread_id; - std::string query_str; -}; - -std::vector rdb_get_all_trx_info(); - -/* - * class for exporting deadlock transaction information for - * information_schema.rocksdb_deadlock - */ -struct Rdb_deadlock_info { - struct Rdb_dl_trx_info { - ulonglong trx_id; - std::string cf_name; - std::string waiting_key; - bool exclusive_lock; - std::string index_name; - std::string table_name; - }; - std::vector path; - int64_t deadlock_time; - ulonglong victim_trx_id; -}; - -std::vector rdb_get_deadlock_info(); - -/* - This is - - the name of the default Column Family (the CF which stores indexes which - didn't explicitly specify which CF they are in) - - the name used to set the default column family parameter for per-cf - 
arguments. -*/ -extern const std::string DEFAULT_CF_NAME; - -/* - This is the name of the Column Family used for storing the data dictionary. -*/ -extern const std::string DEFAULT_SYSTEM_CF_NAME; - -/* - This is the name of the hidden primary key for tables with no pk. -*/ -const char *const HIDDEN_PK_NAME = "HIDDEN_PK_ID"; - -/* - Column family name which means "put this index into its own column family". - DEPRECATED!!! -*/ -extern const std::string PER_INDEX_CF_NAME; - -/* - Name for the background thread. -*/ -const char *const BG_THREAD_NAME = "myrocks-bg"; - -/* - Name for the drop index thread. -*/ -const char *const INDEX_THREAD_NAME = "myrocks-index"; - -/* - Name for the manual compaction thread. -*/ -const char *const MANUAL_COMPACTION_THREAD_NAME = "myrocks-mc"; - -/* - Separator between partition name and the qualifier. Sample usage: - - - p0_cfname=foo - - p3_tts_col=bar -*/ -const char RDB_PER_PARTITION_QUALIFIER_NAME_SEP = '_'; - -/* - Separator between qualifier name and value. Sample usage: - - - p0_cfname=foo - - p3_tts_col=bar -*/ -const char RDB_QUALIFIER_VALUE_SEP = '='; - -/* - Separator between multiple qualifier assignments. Sample usage: - - - p0_cfname=foo;p1_cfname=bar;p2_cfname=baz -*/ -const char RDB_QUALIFIER_SEP = ';'; - -/* - Qualifier name for a custom per partition column family. -*/ -const char *const RDB_CF_NAME_QUALIFIER = "cfname"; - -/* - Qualifier name for a custom per partition ttl duration. -*/ -const char *const RDB_TTL_DURATION_QUALIFIER = "ttl_duration"; - -/* - Qualifier name for a custom per partition ttl duration. -*/ -const char *const RDB_TTL_COL_QUALIFIER = "ttl_col"; - -/* - Default, minimal valid, and maximum valid sampling rate values when collecting - statistics about table. -*/ -#define RDB_DEFAULT_TBL_STATS_SAMPLE_PCT 10 -#define RDB_TBL_STATS_SAMPLE_PCT_MIN 1 -#define RDB_TBL_STATS_SAMPLE_PCT_MAX 100 - -/* - Default and maximum values for rocksdb-compaction-sequential-deletes and - rocksdb-compaction-sequential-deletes-window to add basic boundary checking. -*/ -#define DEFAULT_COMPACTION_SEQUENTIAL_DELETES 0 -#define MAX_COMPACTION_SEQUENTIAL_DELETES 2000000 - -#define DEFAULT_COMPACTION_SEQUENTIAL_DELETES_WINDOW 0 -#define MAX_COMPACTION_SEQUENTIAL_DELETES_WINDOW 2000000 - -/* - Default and maximum values for various compaction and flushing related - options. Numbers are based on the hardware we currently use and our internal - benchmarks which indicate that parallelization helps with the speed of - compactions. - - Ideally of course we'll use heuristic technique to determine the number of - CPU-s and derive the values from there. This however has its own set of - problems and we'll choose simplicity for now. -*/ -#define MAX_BACKGROUND_JOBS 64 - -#define DEFAULT_SUBCOMPACTIONS 1 -#define MAX_SUBCOMPACTIONS 64 - -/* - Default value for rocksdb_sst_mgr_rate_bytes_per_sec = 0 (disabled). -*/ -#define DEFAULT_SST_MGR_RATE_BYTES_PER_SEC 0 - -/* - Defines the field sizes for serializing XID object to a string representation. - string byte format: [field_size: field_value, ...] 
- [ - 8: XID.formatID, - 1: XID.gtrid_length, - 1: XID.bqual_length, - XID.gtrid_length + XID.bqual_length: XID.data - ] -*/ -#define RDB_FORMATID_SZ 8 -#define RDB_GTRID_SZ 1 -#define RDB_BQUAL_SZ 1 -#define RDB_XIDHDR_LEN (RDB_FORMATID_SZ + RDB_GTRID_SZ + RDB_BQUAL_SZ) - -/* - To fix an unhandled exception we specify the upper bound as LONGLONGMAX - instead of ULONGLONGMAX because the latter is -1 and causes an exception when - cast to jlong (signed) of JNI - - The reason behind the cast issue is the lack of unsigned int support in Java. -*/ -#define MAX_RATE_LIMITER_BYTES_PER_SEC static_cast(LONGLONG_MAX) - -/* - Hidden PK column (for tables with no primary key) is a longlong (aka 8 bytes). - static_assert() in code will validate this assumption. -*/ -#define ROCKSDB_SIZEOF_HIDDEN_PK_COLUMN sizeof(longlong) - -/* - Bytes used to store TTL, in the beginning of all records for tables with TTL - enabled. -*/ -#define ROCKSDB_SIZEOF_TTL_RECORD sizeof(longlong) - -#define ROCKSDB_SIZEOF_AUTOINC_VALUE sizeof(longlong) - -/* - Maximum index prefix length in bytes. -*/ -#define MAX_INDEX_COL_LEN_LARGE 3072 -#define MAX_INDEX_COL_LEN_SMALL 767 +class Rdb_converter; +class Rdb_key_def; +class Rdb_tbl_def; +class Rdb_transaction; +class Rdb_transaction_impl; +class Rdb_writebatch_impl; +class Rdb_field_encoder; -/* - MyRocks specific error codes. NB! Please make sure that you will update - HA_ERR_ROCKSDB_LAST when adding new ones. Also update the strings in - rdb_error_messages to include any new error messages. -*/ -#define HA_ERR_ROCKSDB_FIRST (HA_ERR_LAST + 1) -#define HA_ERR_ROCKSDB_PK_REQUIRED (HA_ERR_ROCKSDB_FIRST + 0) -#define HA_ERR_ROCKSDB_TABLE_DATA_DIRECTORY_NOT_SUPPORTED \ - (HA_ERR_ROCKSDB_FIRST + 1) -#define HA_ERR_ROCKSDB_TABLE_INDEX_DIRECTORY_NOT_SUPPORTED \ - (HA_ERR_ROCKSDB_FIRST + 2) -#define HA_ERR_ROCKSDB_COMMIT_FAILED (HA_ERR_ROCKSDB_FIRST + 3) -#define HA_ERR_ROCKSDB_BULK_LOAD (HA_ERR_ROCKSDB_FIRST + 4) -#define HA_ERR_ROCKSDB_CORRUPT_DATA (HA_ERR_ROCKSDB_FIRST + 5) -#define HA_ERR_ROCKSDB_CHECKSUM_MISMATCH (HA_ERR_ROCKSDB_FIRST + 6) -#define HA_ERR_ROCKSDB_INVALID_TABLE (HA_ERR_ROCKSDB_FIRST + 7) -#define HA_ERR_ROCKSDB_PROPERTIES (HA_ERR_ROCKSDB_FIRST + 8) -#define HA_ERR_ROCKSDB_MERGE_FILE_ERR (HA_ERR_ROCKSDB_FIRST + 9) -/* - Each error code below maps to a RocksDB status code found in: - rocksdb/include/rocksdb/status.h -*/ -#define HA_ERR_ROCKSDB_STATUS_NOT_FOUND (HA_ERR_LAST + 10) -#define HA_ERR_ROCKSDB_STATUS_CORRUPTION (HA_ERR_LAST + 11) -#define HA_ERR_ROCKSDB_STATUS_NOT_SUPPORTED (HA_ERR_LAST + 12) -#define HA_ERR_ROCKSDB_STATUS_INVALID_ARGUMENT (HA_ERR_LAST + 13) -#define HA_ERR_ROCKSDB_STATUS_IO_ERROR (HA_ERR_LAST + 14) -#define HA_ERR_ROCKSDB_STATUS_NO_SPACE (HA_ERR_LAST + 15) -#define HA_ERR_ROCKSDB_STATUS_MERGE_IN_PROGRESS (HA_ERR_LAST + 16) -#define HA_ERR_ROCKSDB_STATUS_INCOMPLETE (HA_ERR_LAST + 17) -#define HA_ERR_ROCKSDB_STATUS_SHUTDOWN_IN_PROGRESS (HA_ERR_LAST + 18) -#define HA_ERR_ROCKSDB_STATUS_TIMED_OUT (HA_ERR_LAST + 19) -#define HA_ERR_ROCKSDB_STATUS_ABORTED (HA_ERR_LAST + 20) -#define HA_ERR_ROCKSDB_STATUS_LOCK_LIMIT (HA_ERR_LAST + 21) -#define HA_ERR_ROCKSDB_STATUS_BUSY (HA_ERR_LAST + 22) -#define HA_ERR_ROCKSDB_STATUS_DEADLOCK (HA_ERR_LAST + 23) -#define HA_ERR_ROCKSDB_STATUS_EXPIRED (HA_ERR_LAST + 24) -#define HA_ERR_ROCKSDB_STATUS_TRY_AGAIN (HA_ERR_LAST + 25) -#define HA_ERR_ROCKSDB_LAST HA_ERR_ROCKSDB_STATUS_TRY_AGAIN +extern char *rocksdb_read_free_rpl_tables; +#if defined(HAVE_PSI_INTERFACE) +extern PSI_rwlock_key 
 
 /**
   @brief
@@ -324,7 +96,7 @@ struct Rdb_table_handler {
   atomic_stat<int> m_lock_wait_timeout_counter;
   atomic_stat<int> m_deadlock_counter;
 
-  my_core::THR_LOCK m_thr_lock; ///< MySQL latch needed by m_db_lock
+  my_core::THR_LOCK m_thr_lock;  ///< MySQL latch needed by m_db_lock
 
   /* Stores cumulative table statistics */
   my_io_perf_atomic_t m_io_perf_read;
@@ -338,138 +110,19 @@
   uint64_t m_mtcache_last_update;
 };
 
-class Rdb_key_def;
-class Rdb_tbl_def;
-class Rdb_transaction;
-class Rdb_transaction_impl;
-class Rdb_writebatch_impl;
-class Rdb_field_encoder;
-
-const char *const rocksdb_hton_name = "ROCKSDB";
-
-typedef struct _gl_index_id_s {
-  uint32_t cf_id;
-  uint32_t index_id;
-  bool operator==(const struct _gl_index_id_s &other) const {
-    return cf_id == other.cf_id && index_id == other.index_id;
-  }
-  bool operator!=(const struct _gl_index_id_s &other) const {
-    return cf_id != other.cf_id || index_id != other.index_id;
-  }
-  bool operator<(const struct _gl_index_id_s &other) const {
-    return cf_id < other.cf_id ||
-           (cf_id == other.cf_id && index_id < other.index_id);
-  }
-  bool operator<=(const struct _gl_index_id_s &other) const {
-    return cf_id < other.cf_id ||
-           (cf_id == other.cf_id && index_id <= other.index_id);
-  }
-  bool operator>(const struct _gl_index_id_s &other) const {
-    return cf_id > other.cf_id ||
-           (cf_id == other.cf_id && index_id > other.index_id);
-  }
-  bool operator>=(const struct _gl_index_id_s &other) const {
-    return cf_id > other.cf_id ||
-           (cf_id == other.cf_id && index_id >= other.index_id);
-  }
-} GL_INDEX_ID;
-
-enum operation_type : int {
-  ROWS_DELETED = 0,
-  ROWS_INSERTED,
-  ROWS_READ,
-  ROWS_UPDATED,
-  ROWS_DELETED_BLIND,
-  ROWS_EXPIRED,
-  ROWS_FILTERED,
-  ROWS_HIDDEN_NO_SNAPSHOT,
-  ROWS_MAX
-};
-
-enum query_type : int { QUERIES_POINT = 0, QUERIES_RANGE, QUERIES_MAX };
-
-#if defined(HAVE_SCHED_GETCPU)
-#define RDB_INDEXER get_sched_indexer_t
-#else
-#define RDB_INDEXER thread_id_indexer_t
-#endif
-
-/* Global statistics struct used inside MyRocks */
-struct st_global_stats {
-  ib_counter_t<ulonglong, 64, RDB_INDEXER> rows[ROWS_MAX];
-
-  // system_rows_ stats are only for system
-  // tables. They are not counted in rows_* stats.
-  ib_counter_t<ulonglong, 64, RDB_INDEXER> system_rows[ROWS_MAX];
-
-  ib_counter_t<ulonglong, 64, RDB_INDEXER> queries[QUERIES_MAX];
-
-  ib_counter_t<ulonglong, 64, RDB_INDEXER> covered_secondary_key_lookups;
-};
-
-/* Struct used for exporting status to MySQL */
-struct st_export_stats {
-  ulonglong rows_deleted;
-  ulonglong rows_inserted;
-  ulonglong rows_read;
-  ulonglong rows_updated;
-  ulonglong rows_deleted_blind;
-  ulonglong rows_expired;
-  ulonglong rows_filtered;
-  ulonglong rows_hidden_no_snapshot;
-
-  ulonglong system_rows_deleted;
-  ulonglong system_rows_inserted;
-  ulonglong system_rows_read;
-  ulonglong system_rows_updated;
-
-  ulonglong queries_point;
-  ulonglong queries_range;
-
-  ulonglong covered_secondary_key_lookups;
-};
-
-/* Struct used for exporting RocksDB memory status */
-struct st_memory_stats {
-  ulonglong memtable_total;
-  ulonglong memtable_unflushed;
-};
-
-/* Struct used for exporting RocksDB IO stalls stats */
-struct st_io_stall_stats {
-  ulonglong level0_slowdown;
-  ulonglong level0_slowdown_with_compaction;
-  ulonglong level0_numfiles;
-  ulonglong level0_numfiles_with_compaction;
-  ulonglong stop_for_pending_compaction_bytes;
-  ulonglong slowdown_for_pending_compaction_bytes;
-  ulonglong memtable_compaction;
-  ulonglong memtable_slowdown;
-  ulonglong total_stop;
-  ulonglong total_slowdown;
-
-  st_io_stall_stats()
-      : level0_slowdown(0), level0_slowdown_with_compaction(0),
-        level0_numfiles(0), level0_numfiles_with_compaction(0),
-        stop_for_pending_compaction_bytes(0),
-        slowdown_for_pending_compaction_bytes(0), memtable_compaction(0),
-        memtable_slowdown(0), total_stop(0), total_slowdown(0) {}
-};
-
-} // namespace myrocks
-
-#include "./rdb_buff.h"
+}  // namespace myrocks
 
 /*
   Provide hash function for GL_INDEX_ID so we can include it in sets
 */
 namespace std {
-template <> struct hash<myrocks::GL_INDEX_ID> {
+template <>
+struct hash<myrocks::GL_INDEX_ID> {
   std::size_t operator()(const myrocks::GL_INDEX_ID &gl_index_id) const {
     const uint64_t val =
         ((uint64_t)gl_index_id.cf_id << 32 | (uint64_t)gl_index_id.index_id);
     return std::hash<uint64_t>()(val);
   }
 };
-} // namespace std
+}  // namespace std
 
 namespace myrocks {
 
@@ -479,9 +132,9 @@ namespace myrocks {
 */
 
 class ha_rocksdb : public my_core::handler {
-  my_core::THR_LOCK_DATA m_db_lock; ///< MySQL database lock
+  my_core::THR_LOCK_DATA m_db_lock;  ///< MySQL database lock
 
-  Rdb_table_handler *m_table_handler; ///< Open table handler
+  Rdb_table_handler *m_table_handler;  ///< Open table handler
 
   /* Iterator used for range scans and for full table/index scans */
   rocksdb::Iterator *m_scan_it;
@@ -518,11 +171,6 @@ class ha_rocksdb : public my_core::handler {
   */
   mutable bool m_pk_can_be_decoded;
 
-  /*
-    TRUE <=> Some fields in the PK may require unpack_info.
-  */
-  bool m_maybe_unpack_info;
-
   uchar *m_pk_tuple;        /* Buffer for storing PK in KeyTupleFormat */
   uchar *m_pk_packed_tuple; /* Buffer for storing PK in StorageFormat */
   // ^^ todo: change it to 'char*'? TODO: ^ can we join this with last_rowkey?
@@ -566,10 +214,13 @@ class ha_rocksdb : public my_core::handler {
   */
   uchar *m_pack_buffer;
 
+  /* Class to convert between MySQL format and RocksDB format */
+  std::shared_ptr<Rdb_converter> m_converter;
+
   /*
     Pointer to the original TTL timestamp value (8 bytes) during UPDATE.
   */
-  char m_ttl_bytes[ROCKSDB_SIZEOF_TTL_RECORD];
+  char *m_ttl_bytes;
   /*
     The TTL timestamp value can change if the explicit TTL column is
     updated. If we detect this when updating the PK, we indicate it here so
@@ -580,9 +231,6 @@ class ha_rocksdb : public my_core::handler {
 
   /*
     rowkey of the last record we've read, in StorageFormat.
  */
  String m_last_rowkey;
 
-  /* Buffer used by convert_record_to_storage_format() */
-  String m_storage_record;
-
  /*
    Last retrieved record, in table->record[0] data format.
 
@@ -602,15 +250,28 @@ class ha_rocksdb : public my_core::handler {
  /* TRUE means we are accessing the first row after a snapshot was created */
  bool m_rnd_scan_is_new_snapshot;
 
-  /* TRUE means the replication slave will use Read Free Replication */
-  bool m_use_read_free_rpl;
-
  /*
    TRUE means we should skip unique key checks for this table if the
    replication lag gets too large
  */
  bool m_skip_unique_check;
 
+  /*
+    TRUE means INSERT ON DUPLICATE KEY UPDATE. In that case we can optimize by
+    remembering the failed attempt (if there is one that violates the
+    uniqueness check) in write_row and in the following index_read, to skip
+    the lock check and read entirely.
+  */
+  bool m_insert_with_update;
+
+  /* TRUE if last time the insertion failed due to duplicated PK */
+  bool m_dup_pk_found;
+
+#ifndef DBUG_OFF
+  /* Last retrieved record for sanity checking */
+  String m_dup_pk_retrieved_record;
+#endif
+
  /**
    @brief
    This is a bitmap of indexes (i.e. a set) whose keys (in future, values) may
@@ -627,7 +288,7 @@ class ha_rocksdb : public my_core::handler {
 
  /* MySQL index number for duplicate key error */
-  int m_dupp_errkey;
+  uint m_dupp_errkey;
 
  int create_key_defs(const TABLE *const table_arg,
                      Rdb_tbl_def *const tbl_def_arg,
@@ -660,11 +321,10 @@ class ha_rocksdb : public my_core::handler {
      MY_ATTRIBUTE((__nonnull__));
  void release_scan_iterator(void);
 
-  rocksdb::Status
-  get_for_update(Rdb_transaction *const tx,
-                 rocksdb::ColumnFamilyHandle *const column_family,
-                 const rocksdb::Slice &key,
-                 rocksdb::PinnableSlice *value) const;
+  rocksdb::Status get_for_update(
+      Rdb_transaction *const tx,
+      rocksdb::ColumnFamilyHandle *const column_family,
+      const rocksdb::Slice &key, rocksdb::PinnableSlice *value) const;
 
  int get_row_by_rowid(uchar *const buf, const char *const rowid,
                       const uint rowid_size, const bool skip_lookup = false,
@@ -689,7 +349,7 @@ class ha_rocksdb : public my_core::handler {
  int load_hidden_pk_value() MY_ATTRIBUTE((__warn_unused_result__));
  int read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id)
      MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
-  bool can_use_single_delete(const uint &index) const
+  bool can_use_single_delete(const uint index) const
      MY_ATTRIBUTE((__warn_unused_result__));
  bool is_blind_delete_enabled();
  bool skip_unique_check() const MY_ATTRIBUTE((__warn_unused_result__));
@@ -704,39 +364,6 @@ class ha_rocksdb : public my_core::handler {
 
  void set_last_rowkey(const uchar *const old_data);
 
-  /*
-    Array of table->s->fields elements telling how to store fields in the
-    record.
-  */
-  Rdb_field_encoder *m_encoder_arr;
-
-  /* Describes instructions on how to decode the field */
-  class READ_FIELD {
-   public:
-    /* Points to Rdb_field_encoder describing the field */
-    Rdb_field_encoder *m_field_enc;
-    /* if true, decode the field, otherwise skip it */
-    bool m_decode;
-    /* Skip this many bytes before reading (or skipping) this field */
-    int m_skip;
-  };
-
-  /*
-    This tells which table fields should be decoded (or skipped) when
-    decoding table row from (pk, encoded_row) pair. (Secondary keys are
-    just always decoded in full currently)
-  */
-  std::vector<READ_FIELD> m_decoders_vect;
-
-  /*
-    This tells if any field which is part of the key needs to be unpacked and
-    decoded.
-  */
-  bool m_key_requested = false;
-
-  /* Setup field_decoders based on type of scan and table->read_set */
-  void setup_read_decoders();
-
  /*
    For the active index, indicates which columns must be covered for the
    current lookup to be covered. If the bitmap field is null, that means this
@@ -744,14 +371,6 @@ class ha_rocksdb : public my_core::handler {
  */
  MY_BITMAP m_lookup_bitmap = {nullptr, 0, 0, nullptr, nullptr};
 
-  /*
-    Number of bytes in on-disk (storage) record format that are used for
-    storing SQL NULL flags.
-  */
-  uint m_null_bytes_in_rec;
-
-  void get_storage_type(Rdb_field_encoder *const encoder, const uint &kp);
-  void setup_field_converters();
-
  int alloc_key_buffers(const TABLE *const table_arg,
                        const Rdb_tbl_def *const tbl_def_arg,
                        bool alloc_alter_buffers = false)
@@ -759,25 +378,19 @@
  void free_key_buffers();
 
  // the buffer size should be at least 2*Rdb_key_def::INDEX_NUMBER_SIZE
-  rocksdb::Range get_range(const int &i, uchar buf[]) const;
+  rocksdb::Range get_range(const int i, uchar buf[]) const;
 
  /*
    Perf timers for data reads
  */
  Rdb_io_perf m_io_perf;
 
-  /*
-    A counter of how many row checksums were checked for this table. Note that
-    this does not include checksums for secondary index entries.
-  */
-  my_core::ha_rows m_row_checksums_checked;
-
  /*
    Update stats
  */
  void update_stats(void);
 
-public:
+ public:
  /*
    Controls whether writes include checksums. This is updated from the session
    variable
@@ -785,19 +398,19 @@ public:
  */
  bool m_store_row_debug_checksums;
 
-  /* Same as above but for verifying checksums when reading */
-  bool m_verify_row_debug_checksums;
  int m_checksums_pct;
 
  ha_rocksdb(my_core::handlerton *const hton,
             my_core::TABLE_SHARE *const table_arg);
-  ~ha_rocksdb() {
+  virtual ~ha_rocksdb() override {
    int err MY_ATTRIBUTE((__unused__));
    err = finalize_bulk_load(false);
    if (err != 0) {
-      sql_print_error("RocksDB: Error %d finalizing bulk load while closing "
-                      "handler.",
-                      err);
+      // NO_LINT_DEBUG
+      sql_print_error(
+          "RocksDB: Error %d finalizing bulk load while closing "
+          "handler.",
+          err);
    }
  }
 
@@ -883,21 +496,6 @@ public:
  int rename_table(const char *const from, const char *const to) override
      MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
 
-  int convert_blob_from_storage_format(my_core::Field_blob *const blob,
-                                       Rdb_string_reader *const reader,
-                                       bool decode)
-      MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
-
-  int convert_varchar_from_storage_format(
-      my_core::Field_varstring *const field_var,
-      Rdb_string_reader *const reader, bool decode)
-      MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
-
-  int convert_field_from_storage_format(my_core::Field *const field,
-                                        Rdb_string_reader *const reader,
-                                        bool decode, uint len)
-      MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
-
  int convert_record_from_storage_format(const rocksdb::Slice *const key,
                                         const rocksdb::Slice *const value,
                                         uchar *const buf)
@@ -910,10 +508,9 @@ public:
  static const std::vector<std::string> parse_into_tokens(const std::string &s,
                                                          const char delim);
 
-  static const std::string generate_cf_name(const uint index,
-                                            const TABLE *const table_arg,
-                                            const Rdb_tbl_def *const tbl_def_arg,
-                                            bool *per_part_match_found);
+  static const std::string generate_cf_name(
+      const uint index, const TABLE *const table_arg,
+      const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found);
 
  static const char *get_key_name(const uint index,
                                  const TABLE *const table_arg,
@@ -1055,7 +652,7 @@ public:
 
  /*
    Default implementation from cancel_pushed_idx_cond() suits us
  */
 
-private:
+ private:
  struct key_def_cf_info {
    rocksdb::ColumnFamilyHandle *cf_handle;
    bool is_reverse_cf;
@@ -1075,16 +672,6 @@ private:
    longlong hidden_pk_id;
 
    bool skip_unique_check;
-
-    // In certain cases, TTL is enabled on a table, as well as an explicit TTL
-    // column. The TTL column can be part of either the key or the value part
-    // of the record. If it is part of the key, we store the offset here.
-    //
-    // Later on, we use this offset to store the TTL in the value part of the
-    // record, which we can then access in the compaction filter.
-    //
-    // Set to UINT_MAX by default to indicate that the TTL is not in key.
-    uint ttl_pk_offset = UINT_MAX;
  };
 
  /*
@@ -1117,23 +704,24 @@ private:
                 std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs) const
      MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
 
-  int create_key_def(const TABLE *const table_arg, const uint &i,
+  int create_key_def(const TABLE *const table_arg, const uint i,
                     const Rdb_tbl_def *const tbl_def_arg,
                     std::shared_ptr<Rdb_key_def> *const new_key_def,
-                     const struct key_def_cf_info &cf_info) const
+                     const struct key_def_cf_info &cf_info, uint64 ttl_duration,
+                     const std::string &ttl_column) const
      MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
 
  int create_inplace_key_defs(
      const TABLE *const table_arg, Rdb_tbl_def *vtbl_def_arg,
      const TABLE *const old_table_arg,
      const Rdb_tbl_def *const old_tbl_def_arg,
-      const std::array<struct key_def_cf_info, MAX_INDEXES + 1> &cfs) const
+      const std::array<struct key_def_cf_info, MAX_INDEXES + 1> &cf,
+      uint64 ttl_duration, const std::string &ttl_column) const
      MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
 
-  std::unordered_map<std::string, uint>
-  get_old_key_positions(const TABLE *table_arg, const Rdb_tbl_def *tbl_def_arg,
-                        const TABLE *old_table_arg,
-                        const Rdb_tbl_def *old_tbl_def_arg) const
+  std::unordered_map<std::string, uint> get_old_key_positions(
+      const TABLE *table_arg, const Rdb_tbl_def *tbl_def_arg,
+      const TABLE *old_table_arg, const Rdb_tbl_def *old_tbl_def_arg) const
      MY_ATTRIBUTE((__nonnull__));
 
  int compare_key_parts(const KEY *const old_key,
@@ -1143,17 +731,13 @@ private:
  int compare_keys(const KEY *const old_key, const KEY *const new_key) const
      MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
 
-  int convert_record_to_storage_format(const struct update_row_info &row_info,
-                                       rocksdb::Slice *const packed_rec)
-      MY_ATTRIBUTE((__nonnull__));
-
  bool should_hide_ttl_rec(const Rdb_key_def &kd,
                           const rocksdb::Slice &ttl_rec_val,
                           const int64_t curr_ts)
      MY_ATTRIBUTE((__warn_unused_result__));
-  void rocksdb_skip_expired_records(const Rdb_key_def &kd,
-                                    rocksdb::Iterator *const iter,
-                                    bool seek_backward);
+  int rocksdb_skip_expired_records(const Rdb_key_def &kd,
+                                   rocksdb::Iterator *const iter,
+                                   bool seek_backward);
 
  int index_first_intern(uchar *buf)
      MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
@@ -1161,7 +745,7 @@ private:
      MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
 
  enum icp_result check_index_cond() const;
-  int find_icp_matching_index_rec(const bool &move_forward, uchar *const buf)
+  int find_icp_matching_index_rec(const bool move_forward, uchar *const buf)
      MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
 
  void calc_updated_indexes();
@@ -1169,20 +753,20 @@ private:
                       const bool skip_unique_check)
      MY_ATTRIBUTE((__warn_unused_result__));
  int get_pk_for_update(struct update_row_info *const row_info);
-  int check_and_lock_unique_pk(const uint &key_id,
+  int check_and_lock_unique_pk(const uint key_id,
                               const struct update_row_info &row_info,
-                               bool *const found, bool *const pk_changed)
+                               bool *const found)
      MY_ATTRIBUTE((__warn_unused_result__));
-  int check_and_lock_sk(const uint &key_id,
+  int
check_and_lock_sk(const uint key_id, const struct update_row_info &row_info, bool *const found) MY_ATTRIBUTE((__warn_unused_result__)); int check_uniqueness_and_lock(const struct update_row_info &row_info, - bool *const pk_changed) + bool pk_changed) MY_ATTRIBUTE((__warn_unused_result__)); bool over_bulk_load_threshold(int *err) MY_ATTRIBUTE((__warn_unused_result__)); - int check_duplicate_sk(const TABLE *table_arg, const Rdb_key_def &index, + int check_duplicate_sk(const TABLE *table_arg, const Rdb_key_def &key_def, const rocksdb::Slice *key, struct unique_sk_buf_info *sk_info) MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); @@ -1191,32 +775,36 @@ private: bool sort) MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); void update_bytes_written(ulonglong bytes_written); - int update_pk(const Rdb_key_def &kd, const struct update_row_info &row_info, - const bool &pk_changed) MY_ATTRIBUTE((__warn_unused_result__)); - int update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, - const struct update_row_info &row_info, const bool bulk_load_sk) + int update_write_pk(const Rdb_key_def &kd, + const struct update_row_info &row_info, + const bool pk_changed) MY_ATTRIBUTE((__warn_unused_result__)); - int update_indexes(const struct update_row_info &row_info, - const bool &pk_changed) + int update_write_sk(const TABLE *const table_arg, const Rdb_key_def &kd, + const struct update_row_info &row_info, + const bool bulk_load_sk) + MY_ATTRIBUTE((__warn_unused_result__)); + int update_write_indexes(const struct update_row_info &row_info, + const bool pk_changed) MY_ATTRIBUTE((__warn_unused_result__)); int read_key_exact(const Rdb_key_def &kd, rocksdb::Iterator *const iter, - const bool &using_full_key, - const rocksdb::Slice &key_slice, + const bool using_full_key, const rocksdb::Slice &key_slice, const int64_t ttl_filter_ts) MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); - int read_before_key(const Rdb_key_def &kd, const bool &using_full_key, + int read_before_key(const Rdb_key_def &kd, const bool using_full_key, const rocksdb::Slice &key_slice, const int64_t ttl_filter_ts) MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); int read_after_key(const Rdb_key_def &kd, const rocksdb::Slice &key_slice, const int64_t ttl_filter_ts) MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); - int position_to_correct_key( - const Rdb_key_def &kd, const enum ha_rkey_function &find_flag, - const bool &full_key_match, const uchar *const key, - const key_part_map &keypart_map, const rocksdb::Slice &key_slice, - bool *const move_forward, const int64_t ttl_filter_ts) + int position_to_correct_key(const Rdb_key_def &kd, + const enum ha_rkey_function &find_flag, + const bool full_key_match, const uchar *const key, + const key_part_map &keypart_map, + const rocksdb::Slice &key_slice, + bool *const move_forward, + const int64_t ttl_filter_ts) MY_ATTRIBUTE((__warn_unused_result__)); int read_row_from_primary_key(uchar *const buf) @@ -1228,7 +816,7 @@ private: int calc_eq_cond_len(const Rdb_key_def &kd, const enum ha_rkey_function &find_flag, const rocksdb::Slice &slice, - const int &bytes_changed_by_succ, + const int bytes_changed_by_succ, const key_range *const end_key, uint *const end_key_packed_size) MY_ATTRIBUTE((__warn_unused_result__)); @@ -1250,6 +838,10 @@ private: int calculate_stats_for_table() MY_ATTRIBUTE((__warn_unused_result__)); + bool should_skip_invalidated_record(const int rc); + bool should_recreate_snapshot(const int rc, const bool is_new_snapshot); + bool can_assume_tracked(THD *thd); + 
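+  /*
+    Descriptive sketch only (the exact conditions live in ha_rocksdb.cc,
+    and the transaction handle and read step here are hypothetical): a scan
+    loop is expected to combine the two predicates above roughly as
+
+      rc= read_next_row();
+      if (should_recreate_snapshot(rc, is_new_snapshot))
+      {
+        tx->release_snapshot();
+        tx->acquire_snapshot(false);
+        // retry the read under the fresh snapshot
+      }
+      else if (should_skip_invalidated_record(rc))
+      {
+        // skip a row that was invalidated under the old snapshot
+      }
+
+    while can_assume_tracked() reports whether the THD's transaction is
+    already registered with the server, so re-registration can be skipped.
+  */
+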
 public:
  int index_init(uint idx, bool sorted) override
      MY_ATTRIBUTE((__warn_unused_result__));
@@ -1304,11 +896,15 @@ private:
  ha_rows records_in_range(uint inx, key_range *const min_key,
                           key_range *const max_key) override
      MY_ATTRIBUTE((__warn_unused_result__));
+
+  int delete_table(Rdb_tbl_def *const tbl);
  int delete_table(const char *const from) override
      MY_ATTRIBUTE((__warn_unused_result__));
  int create(const char *const name, TABLE *const form,
             HA_CREATE_INFO *const create_info) override
      MY_ATTRIBUTE((__warn_unused_result__));
+  int create_table(const std::string &table_name, const TABLE *table_arg,
+                   ulonglong auto_increment_value);
  bool check_if_incompatible_data(HA_CREATE_INFO *const info,
                                  uint table_changes) override
      MY_ATTRIBUTE((__warn_unused_result__));
@@ -1356,22 +952,21 @@ private:
      TABLE *const altered_table,
      my_core::Alter_inplace_info *const ha_alter_info) override;
 
-  bool
-  commit_inplace_alter_table(TABLE *const altered_table,
-                             my_core::Alter_inplace_info *const ha_alter_info,
-                             bool commit) override;
+  bool commit_inplace_alter_table(
+      TABLE *const altered_table,
+      my_core::Alter_inplace_info *const ha_alter_info, bool commit) override;
 
-  void set_use_read_free_rpl(const char *const whitelist);
  void set_skip_unique_check_tables(const char *const whitelist);
+  bool is_read_free_rpl_table() const;
 
-public:
+ public:
  virtual void rpl_before_delete_rows() override;
  virtual void rpl_after_delete_rows() override;
  virtual void rpl_before_update_rows() override;
  virtual void rpl_after_update_rows() override;
-  virtual bool use_read_free_rpl() override;
+  virtual bool use_read_free_rpl() const override;
 
-private:
+ private:
  /* Flags tracking if we are inside different replication operation */
  bool m_in_rpl_delete_rows;
  bool m_in_rpl_update_rows;
@@ -1421,16 +1016,21 @@ struct Rdb_inplace_alter_ctx : public my_core::inplace_alter_handler_ctx {
      std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes,
      std::unordered_set<GL_INDEX_ID> dropped_index_ids,
      uint n_added_keys, uint n_dropped_keys, ulonglong max_auto_incr)
-      : my_core::inplace_alter_handler_ctx(), m_new_tdef(new_tdef),
-        m_old_key_descr(old_key_descr), m_new_key_descr(new_key_descr),
-        m_old_n_keys(old_n_keys), m_new_n_keys(new_n_keys),
-        m_added_indexes(added_indexes), m_dropped_index_ids(dropped_index_ids),
-        m_n_added_keys(n_added_keys), m_n_dropped_keys(n_dropped_keys),
+      : my_core::inplace_alter_handler_ctx(),
+        m_new_tdef(new_tdef),
+        m_old_key_descr(old_key_descr),
+        m_new_key_descr(new_key_descr),
+        m_old_n_keys(old_n_keys),
+        m_new_n_keys(new_n_keys),
+        m_added_indexes(added_indexes),
+        m_dropped_index_ids(dropped_index_ids),
+        m_n_added_keys(n_added_keys),
+        m_n_dropped_keys(n_dropped_keys),
        m_max_auto_incr(max_auto_incr) {}
 
  ~Rdb_inplace_alter_ctx() {}
 
-private:
+ private:
  /* Disable Copying */
  Rdb_inplace_alter_ctx(const Rdb_inplace_alter_ctx &);
  Rdb_inplace_alter_ctx &operator=(const Rdb_inplace_alter_ctx &);
@@ -1439,4 +1039,4 @@
// file name indicating RocksDB data corruption
std::string rdb_corruption_marker_file_name();
 
-} // namespace myrocks
+}  // namespace myrocks
diff --git a/storage/rocksdb/ha_rocksdb_proto.h b/storage/rocksdb/ha_rocksdb_proto.h
index a2d014f2f01..4419c9b9a8a 100644
--- a/storage/rocksdb/ha_rocksdb_proto.h
+++ b/storage/rocksdb/ha_rocksdb_proto.h
@@ -77,7 +77,7 @@ Rdb_cf_manager &rdb_get_cf_manager();
 const rocksdb::BlockBasedTableOptions &rdb_get_table_options();
 bool rdb_is_ttl_enabled();
 bool rdb_is_ttl_read_filtering_enabled();
-#ifndef NDEBUG
+#ifndef DBUG_OFF
 int rdb_dbug_set_ttl_rec_ts();
 int
rdb_dbug_set_ttl_snapshot_ts();
 int rdb_dbug_set_ttl_read_filter_ts();
@@ -99,4 +99,4 @@ Rdb_ddl_manager *rdb_get_ddl_manager(void)
 class Rdb_binlog_manager;
 Rdb_binlog_manager *rdb_get_binlog_manager(void)
     MY_ATTRIBUTE((__warn_unused_result__));
-} // namespace myrocks
+}  // namespace myrocks
diff --git a/storage/rocksdb/logger.h b/storage/rocksdb/logger.h
index ca75caf9df5..f20f4474e87 100644
--- a/storage/rocksdb/logger.h
+++ b/storage/rocksdb/logger.h
@@ -22,7 +22,7 @@ namespace myrocks {
 
 class Rdb_logger : public rocksdb::Logger {
-public:
+ public:
  explicit Rdb_logger(const rocksdb::InfoLogLevel log_level =
                          rocksdb::InfoLogLevel::ERROR_LEVEL)
      : m_mysql_log_level(log_level) {}
@@ -77,9 +77,9 @@ public:
    m_mysql_log_level = log_level;
  }
 
-private:
+ private:
  std::shared_ptr<rocksdb::Logger> m_logger;
  rocksdb::InfoLogLevel m_mysql_log_level;
 };
 
-} // namespace myrocks
+}  // namespace myrocks
diff --git a/storage/rocksdb/myrocks_hotbackup b/storage/rocksdb/myrocks_hotbackup
index cb10bb902c0..b0a06c03a03 100755
--- a/storage/rocksdb/myrocks_hotbackup
+++ b/storage/rocksdb/myrocks_hotbackup
@@ -45,12 +45,14 @@ class Writer(object):
 class StreamWriter(Writer):
   stream_cmd= ''
 
-  def __init__(self, stream_option):
+  def __init__(self, stream_option, direct = 0):
     super(StreamWriter, self).__init__()
     if stream_option == 'tar':
       self.stream_cmd= 'tar chf -'
     elif stream_option == 'xbstream':
       self.stream_cmd= 'xbstream -c'
+      if direct:
+        self.stream_cmd = self.stream_cmd + ' -d'
     else:
       raise Exception("Only tar or xbstream is supported as streaming option.")
 
@@ -342,6 +344,13 @@ class MySQLUtil:
     row = cur.fetchone()
     return row[0]
 
+  @staticmethod
+  def is_directio_enabled(dbh):
+    sql = "SELECT @@global.rocksdb_use_direct_reads"
+    cur = dbh.cursor()
+    cur.execute(sql)
+    row = cur.fetchone()
+    return row[0]
 
 class BackupRunner:
   datadir = None
@@ -363,9 +372,7 @@ class BackupRunner:
     try:
       signal.signal(signal.SIGINT, signal_handler)
       w = None
-      if opts.output_stream:
-        w = StreamWriter(opts.output_stream)
-      else:
+      if not opts.output_stream:
         raise Exception("Currently only streaming backup is supported.")
 
       snapshot_dir = opts.checkpoint_directory + '/' + str(backup_round)
@@ -373,6 +380,11 @@ class BackupRunner:
                                opts.mysql_password, opts.mysql_port,
                                opts.mysql_socket)
 
+      direct = MySQLUtil.is_directio_enabled(dbh)
+      logger.info("Direct I/O: %d", direct)
+
+      w = StreamWriter(opts.output_stream, direct)
+
       if not self.datadir:
         self.datadir = MySQLUtil.get_datadir(dbh)
         logger.info("Set datadir: %s", self.datadir)
diff --git a/storage/rocksdb/mysql-test/rocksdb/combinations b/storage/rocksdb/mysql-test/rocksdb/combinations
index c3f6b9d0396..b7316c71485 100644
--- a/storage/rocksdb/mysql-test/rocksdb/combinations
+++ b/storage/rocksdb/mysql-test/rocksdb/combinations
@@ -3,4 +3,3 @@ rocksdb_write_policy=write_committed
 
 [write_prepared]
 rocksdb_write_policy=write_prepared
-rocksdb_commit_time_batch_for_recovery=on
diff --git a/storage/rocksdb/mysql-test/rocksdb/include/bulk_load.inc b/storage/rocksdb/mysql-test/rocksdb/include/bulk_load.inc
index 239ec74169a..1b79825e507 100644
--- a/storage/rocksdb/mysql-test/rocksdb/include/bulk_load.inc
+++ b/storage/rocksdb/mysql-test/rocksdb/include/bulk_load.inc
@@ -89,20 +89,32 @@ EOF
 # Make sure a snapshot held by another user doesn't block the bulk load
 connect (other,localhost,root,,);
 set session transaction isolation level repeatable read;
-select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS';
 start transaction with consistent snapshot;
-select * from
information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; + +# Assert that there is a pending snapshot +select VALUE > 0 as 'Has opened snapshots' from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; connection default; + +# Update CF to smaller value to create multiple SST in ingestion +eval SET @@GLOBAL.ROCKSDB_UPDATE_CF_OPTIONS= + '$pk_cf_name={write_buffer_size=8m;target_file_size_base=1m};'; + set rocksdb_bulk_load=1; set rocksdb_bulk_load_size=100000; --disable_query_log --echo LOAD DATA INFILE INTO TABLE t1; eval LOAD DATA INFILE '$file' INTO TABLE t1; +# There should be no SST being ingested +select * from t1; --echo LOAD DATA INFILE INTO TABLE t2; eval LOAD DATA INFILE '$file' INTO TABLE t2; +# There should be no SST being ingested +select * from t2; --echo LOAD DATA INFILE INTO TABLE t3; eval LOAD DATA INFILE '$file' INTO TABLE t3; +# There should be no SST being ingested +select * from t3; --enable_query_log set rocksdb_bulk_load=0; diff --git a/storage/rocksdb/mysql-test/rocksdb/include/bulk_load_unsorted.inc b/storage/rocksdb/mysql-test/rocksdb/include/bulk_load_unsorted.inc index 84a9d8c578e..5cdc76a32d4 100644 --- a/storage/rocksdb/mysql-test/rocksdb/include/bulk_load_unsorted.inc +++ b/storage/rocksdb/mysql-test/rocksdb/include/bulk_load_unsorted.inc @@ -98,9 +98,10 @@ EOF # Make sure a snapshot held by another user doesn't block the bulk load connect (other,localhost,root,,); set session transaction isolation level repeatable read; -select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; start transaction with consistent snapshot; -select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; + +# Assert that there is a pending snapshot +select VALUE > 0 as 'Has opened snapshots' from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; connection default; set rocksdb_bulk_load=1; diff --git a/storage/rocksdb/mysql-test/rocksdb/include/bypass_create_table.inc b/storage/rocksdb/mysql-test/rocksdb/include/bypass_create_table.inc new file mode 100644 index 00000000000..233635b369e --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/include/bypass_create_table.inc @@ -0,0 +1,298 @@ +CREATE TABLE `link_table` ( + `id1` bigint(20) unsigned NOT NULL DEFAULT '0' , + `id1_type` int(10) unsigned NOT NULL DEFAULT '0' , + `id2` bigint(20) unsigned NOT NULL DEFAULT '0' , + `id2_type` int(10) unsigned NOT NULL DEFAULT '0' , + `link_type` bigint(20) unsigned NOT NULL DEFAULT '0' , + `visibility` tinyint(3) NOT NULL DEFAULT '0' , + `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '' , + `time` int(10) unsigned NOT NULL DEFAULT '0' , + `version` bigint(20) unsigned NOT NULL DEFAULT '0' , + PRIMARY KEY (`link_type` , `id1` , `id2`) COMMENT 'cf_link' , + KEY `id1_type` (`id1` , `link_type` , `visibility` , `time` , `id2` , + `version` , `data`) COMMENT 'rev:cf_link_id1_type' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; + +CREATE TABLE `link_table2` ( + `id1` bigint(20) unsigned NOT NULL DEFAULT '0' , + `id1_type` int(10) unsigned NOT NULL DEFAULT '0' , + `id2` bigint(20) unsigned NOT NULL DEFAULT '0' , + `id2_type` int(10) unsigned NOT NULL DEFAULT '0' , + `link_type` bigint(20) unsigned NOT NULL DEFAULT '0' , + `visibility` tinyint(3) NOT NULL DEFAULT '0' , + `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '' , + `time` int(10) unsigned NOT NULL DEFAULT '0' , + `version` bigint(20) unsigned NOT NULL DEFAULT '0' , 
+ PRIMARY KEY (`link_type` , `id1` , `id2`) + COMMENT 'cf_link' , + KEY `id1_type` (`id1` , `link_type` , `visibility` , `time` , `id2` , + `version` , `data`) COMMENT 'cf_link_id1_type' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=9; + +insert into link_table values (1, 1, 1, 2, 3, 4, 'a10', 10, 125); +insert into link_table values (1, 1, 2, 2, 3, 3, 'a10', 10, 125); +insert into link_table values (1, 1, 3, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (1, 1, 4, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (1, 1, 5, 2, 3, 3, 'a12', 12, 125); +insert into link_table values (1, 1, 6, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (1, 1, 7, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (1, 1, 8, 2, 3, 4, 'a13', 13, 125); +insert into link_table values (1, 1, 9, 2, 3, 4, 'a14', 14, 125); +insert into link_table values (1, 1, 10, 2, 3, 4, 'a15', 15, 125); +insert into link_table values (2, 1, 1, 2, 3, 4, 'a10', 10, 125); +insert into link_table values (2, 1, 2, 2, 3, 4, 'a10', 10, 125); +insert into link_table values (2, 1, 3, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (2, 1, 4, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (2, 1, 5, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 6, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 7, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 8, 2, 3, 4, 'a13', 13, 125); +insert into link_table values (2, 1, 9, 2, 3, 4, 'a14', 14, 125); +insert into link_table values (2, 1, 10, 2, 3, 4, 'a15', 15, 125); +insert into link_table values (2, 1, 1, 2, 4, 4, 'a10', 10, 125); +insert into link_table values (2, 1, 2, 2, 4, 4, 'a10', 10, 125); +insert into link_table values (2, 1, 3, 2, 4, 4, 'a11', 11, 125); +insert into link_table values (2, 1, 4, 2, 4, 4, 'a11', 11, 125); +insert into link_table values (2, 1, 5, 2, 4, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 6, 2, 4, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 7, 2, 4, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 8, 2, 4, 4, 'a13', 13, 125); +insert into link_table values (2, 1, 9, 2, 4, 4, 'a14', 14, 125); +insert into link_table values (2, 1, 10, 2, 4, 4, 'a15', 15, 125); +insert into link_table values (3, 1, 10, 2, 3, 4, 'a10', 10, 125); +insert into link_table values (3, 1, 9, 2, 3, 4, 'a10', 10, 125); +insert into link_table values (3, 1, 8, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (3, 1, 7, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (3, 1, 6, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (3, 1, 5, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (3, 1, 4, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (3, 1, 3, 2, 3, 4, 'a13', 13, 125); +insert into link_table values (3, 1, 2, 2, 3, 4, 'a14', 14, 125); +insert into link_table values (3, 1, 1, 2, 3, 4, 'a15', 15, 125); +insert into link_table values (9, 1, 9, 2, 5, 6, '0 ', 10, 125); +insert into link_table values (9, 1, 8, 2, 5, 6, '01 ', 11, 125); +insert into link_table values (9, 1, 7, 2, 5, 6, '012 ', 11, 125); +insert into link_table values (9, 1, 6, 2, 5, 6, '0123 ', 12, 125); +insert into link_table values (9, 1, 5, 2, 5, 6, '01234 ', 12, 125); +insert into link_table values (9, 1, 4, 2, 5, 6, '012345 ', 12, 125); +insert into link_table values (9, 1, 3, 2, 5, 6, '0123456 ', 13, 125); +insert into link_table values (9, 1, 2, 2, 5, 6, '01234567 ', 14, 125); +insert into link_table 
values (9, 1, 1, 2, 5, 6, '012345678 ', 15, 125); +insert into link_table values (9, 1, 0, 2, 5, 6, '0123456789 ', 15, 125); + +insert into link_table2 select * from link_table; + +CREATE TABLE `id_table` ( + `id` bigint(20) NOT NULL DEFAULT '0', + `type` int(11) NOT NULL DEFAULT '0', + `row_created_time` int(11) NOT NULL DEFAULT '0', + `hash_key` varchar(255) NOT NULL DEFAULT '', + `is_deleted` tinyint(4) DEFAULT '0', + PRIMARY KEY (`id`), + KEY `type_id` (`type`,`id`) +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 ROW_FORMAT=COMPRESSED +KEY_BLOCK_SIZE=8; + +insert into id_table values (1, 1, 10, '111', 0); +insert into id_table values (2, 1, 10, '111', 1); +insert into id_table values (3, 1, 10, '111', 0); +insert into id_table values (4, 1, 10, '111', 1); +insert into id_table values (5, 1, 10, '111', 0); +insert into id_table values (6, 1, 10, '111', 1); +insert into id_table values (7, 1, 10, '111', 0); +insert into id_table values (8, 1, 10, '111', 1); +insert into id_table values (9, 1, 10, '111', 0); +insert into id_table values (10, 1, 10, '111', 1); + +CREATE TABLE `node_table` ( + `id` bigint(20) unsigned NOT NULL DEFAULT '0', + `type` int(10) unsigned NOT NULL DEFAULT '0', + `version` bigint(20) unsigned NOT NULL DEFAULT '0', + `update_time` int(10) unsigned NOT NULL DEFAULT '0', + `data` mediumtext COLLATE latin1_bin NOT NULL, + PRIMARY KEY (`type`,`id`) COMMENT 'cf_node_type_id', + KEY `id` (`id`) COMMENT 'cf_node' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; + +insert into node_table values (1, 1, 1, 10, 'data'); + +insert into node_table values (2, 1, 1, 10, 'data'); + +insert into node_table values (3, 1, 1, 10, 'data'); + +insert into node_table values (4, 1, 1, 10, 'data'); + +insert into node_table values (5, 1, 1, 10, 'data'); + +insert into node_table values (6, 1, 1, 10, 'data'); + +insert into node_table values (7, 1, 1, 10, 'data'); + +insert into node_table values (8, 1, 1, 10, 'data'); + +insert into node_table values (9, 1, 1, 10, 'data'); + +insert into node_table values (10, 1, 1, 10, 'data'); + +CREATE TABLE `count_table` ( + `id` bigint(20) unsigned NOT NULL DEFAULT '0', + `type` int(10) unsigned NOT NULL DEFAULT '0', + `link_type` bigint(20) unsigned NOT NULL DEFAULT '0', + `count` int(10) unsigned NOT NULL DEFAULT '0', + `time` int(10) unsigned NOT NULL DEFAULT '0', + `version` bigint(20) unsigned NOT NULL DEFAULT '0', + PRIMARY KEY (`id`,`link_type`) COMMENT 'cf_count_table' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; + + +insert into count_table values (2, 1, 1, 1, 10, 20); + +insert into count_table values (3, 1, 1, 1, 10, 20); + +insert into count_table values (4, 1, 1, 1, 10, 20); + +insert into count_table values (5, 1, 1, 1, 10, 20); + +insert into count_table values (6, 1, 1, 1, 10, 20); + +insert into count_table values (7, 1, 1, 1, 10, 20); + +insert into count_table values (8, 1, 1, 1, 10, 20); + +insert into count_table values (9, 1, 1, 1, 10, 20); + +insert into count_table values (10, 1, 1, 1, 10, 20); + +CREATE TABLE `link_table5` ( + `id1` bigint(20) unsigned NOT NULL DEFAULT '0', + `id1_type` int(10) unsigned NOT NULL DEFAULT '0', + `id2` bigint(20) unsigned NOT NULL DEFAULT '0', + `id2_type` int(10) unsigned NOT NULL DEFAULT '0', + `link_type` bigint(20) unsigned NOT NULL DEFAULT '0', + `visibility` tinyint(3) NOT NULL DEFAULT '0', + `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '', + `time` int(10) unsigned NOT NULL 
DEFAULT '0', + `version` bigint(20) unsigned NOT NULL DEFAULT '0', + PRIMARY KEY (`link_type`,`id1`,`id2`) COMMENT 'cf_link' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; + +insert into link_table5 values (1, 1, 2, 2, 1, 1, 'data12', 1, 1); +insert into link_table5 values (1, 1, 3, 2, 1, 2, 'data13', 1, 1); +insert into link_table5 values (1, 1, 4, 2, 1, 2, 'data14', 1, 1); +insert into link_table5 values (1, 1, 5, 2, 1, 1, 'data15', 1, 1); +insert into link_table5 values (2, 1, 1, 2, 1, 1, 'data21', 1, 1); +insert into link_table5 values (2, 1, 2, 2, 1, 1, 'data22', 1, 1); +insert into link_table5 values (2, 1, 3, 2, 1, 1, 'data32', 1, 1); + + +CREATE TABLE `link_table3` ( + `id1` bigint(20) unsigned NOT NULL DEFAULT '0', + `id1_type` int(10) unsigned NOT NULL DEFAULT '0', + `id2` bigint(20) unsigned NOT NULL DEFAULT '0', + `id2_type` int(10) unsigned NOT NULL DEFAULT '0', + `link_type` bigint(20) unsigned NOT NULL DEFAULT '0', + `visibility` tinyint(4) NOT NULL DEFAULT '0', + `data` text COLLATE latin1_bin NOT NULL, + `time` int(10) unsigned NOT NULL DEFAULT '0', + `version` bigint(20) unsigned NOT NULL DEFAULT '0', + PRIMARY KEY (`link_type`,`id1`,`id2`) COMMENT 'cf_link', + KEY `id1_type` (`id1`,`link_type`,`visibility`,`time`,`id2`,`version`) + COMMENT 'rev:cf_link_id1_type' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4; + +insert into link_table3 values (1, 1, 2, 2, 1, 1, 'data12', 1, 1); +insert into link_table3 values (1, 1, 3, 2, 1, 2, 'data13', 1, 1); +insert into link_table3 values (1, 1, 4, 2, 1, 2, 'data14', 1, 1); +insert into link_table3 values (1, 1, 5, 2, 1, 1, 'data15', 1, 1); +insert into link_table3 values (2, 1, 1, 2, 1, 1, 'data21', 1, 1); +insert into link_table3 values (2, 1, 2, 2, 1, 1, 'data22', 1, 1); +insert into link_table3 values (2, 1, 3, 2, 1, 1, 'data32', 1, 1); + +CREATE TABLE `link_table6` ( + `id1` bigint(20) unsigned NOT NULL DEFAULT '0', + `id1_type` int(10) unsigned NOT NULL DEFAULT '0', + `id2` bigint(20) unsigned NOT NULL DEFAULT '0', + `id2_type` int(10) unsigned NOT NULL DEFAULT '0', + `link_type` bigint(20) unsigned NOT NULL DEFAULT '0', + `visibility` tinyint(4) NOT NULL DEFAULT '0', + `data` text COLLATE latin1_bin NOT NULL, + `time` int(10) unsigned NOT NULL DEFAULT '0', + `version` bigint(20) unsigned NOT NULL DEFAULT '0', + PRIMARY KEY (`link_type`,`id1`,`id2`) COMMENT 'cf_link', + KEY `id1_type` (`id1`,`link_type`,`visibility`,`time`,`id2`,`version`, + `data`(255)) COMMENT 'rev:cf_link_id1_type' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4; + +insert into link_table6 values (1, 1, 2, 2, 1, 1, + 'data12_12345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890', 1, 1); +insert into link_table6 values (1, 1, 3, 2, 1, 2, + 'data13_12345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890', 1, 1); +insert into link_table6 values (1, 1, 4, 2, 1, 2, + 
'data14_12345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890', 1, 1); +insert into link_table6 values (1, 1, 5, 2, 1, 1, + 'data15_12345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890', 1, 1); +insert into link_table6 values (2, 1, 1, 2, 1, 1, + 'data21_12345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890', 1, 1); +insert into link_table6 values (2, 1, 2, 2, 1, 1, + 'data22_12345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890', 1, 1); +insert into link_table6 values (2, 1, 3, 2, 1, 1, + 'data32_12345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890', 1, 1); + +CREATE TABLE `link_table4` ( + `id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0', + `raw_key` text COLLATE latin1_bin, + `id2` bigint(20) unsigned NOT NULL DEFAULT '0', + `id2_type` int(10) unsigned NOT NULL DEFAULT '0', + `link_type` bigint(20) unsigned NOT NULL DEFAULT '0', + `visibility` tinyint(3) NOT NULL DEFAULT '0', + `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '', + `time` int(10) unsigned NOT NULL DEFAULT '0', + `version` bigint(20) unsigned NOT NULL DEFAULT '0', + PRIMARY KEY (`link_type`,`id1`,`id2`) COMMENT 'cf_link', + KEY `id1_type` (`id1`,`link_type`,`visibility`,`time`,`id2`,`version`,`data`) + COMMENT 'rev:cf_link_id1_type' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; + +insert into link_table4 values ('a1', "rk1", 2, 2, 1, 1, 'data12', 1, 1); +insert into link_table4 values ('a1', "rk2", 3, 2, 1, 2, 'data13', 1, 1); +insert into link_table4 values ('a1', "rk3", 4, 2, 1, 2, 'data14', 1, 1); +insert into link_table4 values ('a1', "rk4", 5, 2, 1, 1, 'data15', 1, 1); +insert into link_table4 values ('b1', "rk5", 1, 2, 1, 1, 'data21', 1, 1); +insert into link_table4 values ('b1', "rk6", 2, 2, 1, 1, 'data22', 1, 1); +insert into link_table4 values ('b1', "rk7", 3, 2, 1, 1, 'data32', 1, 1); diff --git a/storage/rocksdb/mysql-test/rocksdb/include/locking_issues_case3.inc b/storage/rocksdb/mysql-test/rocksdb/include/locking_issues_case3.inc index c23717c4fda..078ab5e24cb 100644 --- a/storage/rocksdb/mysql-test/rocksdb/include/locking_issues_case3.inc +++ b/storage/rocksdb/mysql-test/rocksdb/include/locking_issues_case3.inc @@ 
-52,8 +52,9 @@ let $ID = `SELECT connection_id()`; send SELECT * FROM t0 WHERE value > 0 FOR UPDATE; connection con2; -let $wait_condition = SELECT 1 FROM information_schema.processlist - WHERE id = $ID AND state = "Sending data"; +let $wait_condition = + SELECT 1 FROM information_schema.processlist + WHERE (id = $ID OR srv_id = $ID) AND state = "Sending data"; --source include/wait_condition.inc eval SET SESSION TRANSACTION ISOLATION LEVEL $isolation_level; UPDATE t0 SET VALUE=VALUE+1 WHERE id=190000; diff --git a/storage/rocksdb/mysql-test/rocksdb/include/locking_issues_case4.inc b/storage/rocksdb/mysql-test/rocksdb/include/locking_issues_case4.inc index da80f796750..9c471af9130 100644 --- a/storage/rocksdb/mysql-test/rocksdb/include/locking_issues_case4.inc +++ b/storage/rocksdb/mysql-test/rocksdb/include/locking_issues_case4.inc @@ -52,8 +52,9 @@ let $ID = `SELECT connection_id()`; send SELECT * FROM t0 WHERE value > 0 FOR UPDATE; connection con2; -let $wait_condition = SELECT 1 FROM information_schema.processlist - WHERE id = $ID AND state = "Sending data"; +let $wait_condition = + SELECT 1 FROM information_schema.processlist + WHERE (id = $ID OR srv_id = $ID) AND state = "Sending data"; --source include/wait_condition.inc eval SET SESSION TRANSACTION ISOLATION LEVEL $isolation_level; INSERT INTO t0 VALUES(200001,1), (-1,1); diff --git a/storage/rocksdb/mysql-test/rocksdb/include/locking_issues_case5.inc b/storage/rocksdb/mysql-test/rocksdb/include/locking_issues_case5.inc index b77a54e4360..3f8ab905226 100644 --- a/storage/rocksdb/mysql-test/rocksdb/include/locking_issues_case5.inc +++ b/storage/rocksdb/mysql-test/rocksdb/include/locking_issues_case5.inc @@ -55,8 +55,9 @@ let $ID = `SELECT connection_id()`; send SELECT * FROM t0 WHERE value > 0 FOR UPDATE; connection con2; -let $wait_condition = SELECT 1 FROM information_schema.processlist - WHERE id = $ID AND state = "Sending data"; +let $wait_condition = + SELECT 1 FROM information_schema.processlist + WHERE (id = $ID OR srv_id = $ID) AND state = "Sending data"; --source include/wait_condition.inc eval SET SESSION TRANSACTION ISOLATION LEVEL $isolation_level; BEGIN; diff --git a/storage/rocksdb/mysql-test/rocksdb/include/locking_issues_case6.inc b/storage/rocksdb/mysql-test/rocksdb/include/locking_issues_case6.inc index 9494146ba5c..54d4a0052ea 100644 --- a/storage/rocksdb/mysql-test/rocksdb/include/locking_issues_case6.inc +++ b/storage/rocksdb/mysql-test/rocksdb/include/locking_issues_case6.inc @@ -55,8 +55,9 @@ let $ID = `SELECT connection_id()`; send SELECT * FROM t0 WHERE value > 0 FOR UPDATE; connection con2; -let $wait_condition = SELECT 1 FROM information_schema.processlist - WHERE id = $ID AND state = "Sending data"; +let $wait_condition = + SELECT 1 FROM information_schema.processlist + WHERE (id = $ID OR srv_id = $ID) AND state = "Sending data"; --source include/wait_condition.inc eval SET SESSION TRANSACTION ISOLATION LEVEL $isolation_level; BEGIN; diff --git a/storage/rocksdb/mysql-test/rocksdb/include/rocksdb_concurrent_delete.inc b/storage/rocksdb/mysql-test/rocksdb/include/rocksdb_concurrent_delete.inc deleted file mode 100644 index 71e713226d7..00000000000 --- a/storage/rocksdb/mysql-test/rocksdb/include/rocksdb_concurrent_delete.inc +++ /dev/null @@ -1,53 +0,0 @@ -# Usage: -# -# let $order = ASC; # or DESC -# let $comment = "rev:cf2"; # or "" -# --source suite/rocksdb/include/rocksdb_concurrent_delete.inc - -let $first_row = -1; # Error this should never happen -if ($order == 'ASC') -{ - let $first_row = 1; -} -if 
($order == 'DESC')
-{
-  let $first_row = 3;
-}
-
-connect (con, localhost, root,,);
-connection default;
-
---disable_warnings
-SET debug_sync='RESET';
-DROP TABLE IF EXISTS t1;
---enable_warnings
-
-eval CREATE TABLE t1 (pk INT PRIMARY KEY COMMENT $comment, a INT);
-INSERT INTO t1 VALUES(1,1), (2,2), (3,3);
-
-# This will cause the SELECT to block after finding the first row, but
-# before locking and reading it.
-connection con;
-SET debug_sync='rocksdb_concurrent_delete SIGNAL parked WAIT_FOR go';
-send_eval SELECT * FROM t1 order by t1.pk $order FOR UPDATE;
-
-# While that connection is waiting, delete the first row (the one con
-# is about to lock and read
-connection default;
-SET debug_sync='now WAIT_FOR parked';
-eval DELETE FROM t1 WHERE pk = $first_row;
-
-# Signal the waiting select to continue
-SET debug_sync='now SIGNAL go';
-
-# Now get the results from the select. The first entry (1,1) (or (3,3) when
-# using reverse ordering) should be missing. Prior to the fix the SELECT
-# would have returned: "1815: Internal error: NotFound:"
-connection con;
-reap;
-
-# Cleanup
-connection default;
-disconnect con;
-set debug_sync='RESET';
-drop table t1;
diff --git a/storage/rocksdb/mysql-test/rocksdb/include/use_direct_io_option.inc b/storage/rocksdb/mysql-test/rocksdb/include/use_direct_io_option.inc
new file mode 100644
index 00000000000..da16e1c9c3b
--- /dev/null
+++ b/storage/rocksdb/mysql-test/rocksdb/include/use_direct_io_option.inc
@@ -0,0 +1,23 @@
+# Common test pattern for options that control direct i/o
+#
+# Required input:
+# $io_option - name and assignment to enable on server command line
+
+--source include/have_direct_io.inc
+
+--echo Checking direct reads
+--let $_mysqld_option=$io_option
+--source include/restart_mysqld_with_option.inc
+
+CREATE TABLE t1 (pk INT PRIMARY KEY DEFAULT '0', a INT(11), b CHAR(8)) ENGINE=rocksdb;
+SHOW CREATE TABLE t1;
+INSERT INTO t1 VALUES (1, 1,'a');
+INSERT INTO t1 (a,b) VALUES (2,'b');
+set global rocksdb_force_flush_memtable_now=1;
+--sorted_result
+SELECT a,b FROM t1;
+DROP TABLE t1;
+
+# cleanup
+--let $_mysqld_option=
+--source include/restart_mysqld.inc
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace_sstfilewriter.result b/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace_sstfilewriter.result
index 97310071ef2..e2ef3cbfaf0 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace_sstfilewriter.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace_sstfilewriter.result
@@ -15,7 +15,7 @@ count(b)
 3000000
 ALTER TABLE t1 ADD INDEX kb(b), ALGORITHM=INPLACE;
 ALTER TABLE t1 ADD INDEX kb_copy(b), ALGORITHM=COPY;
-ERROR HY000: Got error 10 'Operation aborted: Failed to acquire lock due to max_num_locks limit' from ROCKSDB
+ERROR HY000: Got error 10 'Operation aborted: Failed to acquire lock due to rocksdb_max_row_locks limit' from ROCKSDB
 set session rocksdb_bulk_load=1;
 ALTER TABLE t1 ADD INDEX kb_copy(b), ALGORITHM=COPY;
 set session rocksdb_bulk_load=0;
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/autoinc_vars.result b/storage/rocksdb/mysql-test/rocksdb/r/autoinc_vars.result
index 17e6bedb882..e5af3c18d35 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/autoinc_vars.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/autoinc_vars.result
@@ -159,3 +159,24 @@ INSERT INTO t1 (a) VALUES (1);
 UPDATE t1 SET pk = 3;
 ALTER TABLE t1 AUTO_INCREMENT 2;
 DROP TABLE t1;
+#----------------------------------
+# Issue #902 Debug assert in autoincrement with small field type
+#---------------------------------- +SET auto_increment_increment=100, auto_increment_offset=10; +CREATE TABLE t1(i INT AUTO_INCREMENT PRIMARY KEY) ENGINE=ROCKSDB AUTO_INCREMENT=18446744073709551615; +INSERT INTO t1 VALUES (NULL); +ERROR HY000: Failed to read auto-increment value from storage engine +SELECT * FROM t1; +i +ALTER TABLE t1 AUTO_INCREMENT=1; +INSERT INTO t1 VALUES (NULL); +SELECT * FROM t1; +i +10 +ALTER TABLE t1 AUTO_INCREMENT=18446744073709551615; +INSERT INTO t1 VALUES (NULL); +ERROR HY000: Failed to read auto-increment value from storage engine +SELECT * FROM t1; +i +10 +DROP TABLE t1; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/blind_delete_rc.result b/storage/rocksdb/mysql-test/rocksdb/r/blind_delete_rc.result new file mode 100644 index 00000000000..973d1876fa0 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/blind_delete_rc.result @@ -0,0 +1,87 @@ +include/master-slave.inc +Warnings: +Note #### Sending passwords in plain text without SSL/TLS is extremely insecure. +Note #### Storing MySQL user name or password information in the master info repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START SLAVE; see the 'START SLAVE Syntax' in the MySQL Manual for more information. +[connection master] +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED; +set @save_rocksdb_blind_delete_primary_key=@@session.rocksdb_blind_delete_primary_key; +set @save_rocksdb_master_skip_tx_api=@@session.rocksdb_master_skip_tx_api; +DROP TABLE IF EXISTS t1,t2; +create table t1 (id int primary key, value int, value2 varchar(200)) engine=rocksdb; +create table t2 (id int primary key, value int, value2 varchar(200), index(value)) engine=rocksdb; +SET session rocksdb_blind_delete_primary_key=1; +select variable_value into @c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; +select variable_value-@c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; +variable_value-@c +1000 +SELECT count(*) FROM t1; +count(*) +9000 +include/sync_slave_sql_with_master.inc +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED; +SELECT count(*) FROM t1; +count(*) +9000 +select variable_value into @c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; +select variable_value-@c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; +variable_value-@c +0 +SELECT count(*) FROM t2; +count(*) +9000 +SET session rocksdb_master_skip_tx_api=1; +select variable_value into @c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; +select variable_value-@c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; +variable_value-@c +1000 +SELECT count(*) FROM t1; +count(*) +8000 +SELECT count(*) FROM t2; +count(*) +8000 +include/sync_slave_sql_with_master.inc +SELECT count(*) FROM t1; +count(*) +8000 +SELECT count(*) FROM t2; +count(*) +8000 +select variable_value into @c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; +DELETE FROM t1 WHERE id BETWEEN 3001 AND 4000; +DELETE FROM t2 WHERE id BETWEEN 3001 AND 4000; +select variable_value-@c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; +variable_value-@c +0 +SELECT count(*) FROM t1; +count(*) +7000 +SELECT count(*) FROM t2; +count(*) +7000 +include/sync_slave_sql_with_master.inc +SELECT count(*) FROM t1; 
+count(*) +7000 +SELECT count(*) FROM t2; +count(*) +7000 +DELETE FROM t1 WHERE id = 10; +SELECT count(*) FROM t1; +count(*) +7000 +call mtr.add_suppression("Slave SQL.*Could not execute Delete_rows event on table test.t1.*Error_code.*"); +call mtr.add_suppression("Slave: Can't find record in 't1'.*"); +include/wait_for_slave_sql_error.inc [errno=1032] +set @save_rocksdb_read_free_rpl=@@global.rocksdb_read_free_rpl; +set global rocksdb_read_free_rpl=PK_SK; +START SLAVE; +include/sync_slave_sql_with_master.inc +SELECT count(*) FROM t1; +count(*) +7000 +set global rocksdb_read_free_rpl=@save_rocksdb_read_free_rpl; +SET session rocksdb_blind_delete_primary_key=@save_rocksdb_blind_delete_primary_key; +SET session rocksdb_master_skip_tx_api=@save_rocksdb_master_skip_tx_api; +DROP TABLE t1, t2; +include/rpl_end.inc diff --git a/storage/rocksdb/mysql-test/rocksdb/r/blind_delete_rr.result b/storage/rocksdb/mysql-test/rocksdb/r/blind_delete_rr.result new file mode 100644 index 00000000000..683b672e360 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/blind_delete_rr.result @@ -0,0 +1,87 @@ +include/master-slave.inc +Warnings: +Note #### Sending passwords in plain text without SSL/TLS is extremely insecure. +Note #### Storing MySQL user name or password information in the master info repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START SLAVE; see the 'START SLAVE Syntax' in the MySQL Manual for more information. +[connection master] +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +set @save_rocksdb_blind_delete_primary_key=@@session.rocksdb_blind_delete_primary_key; +set @save_rocksdb_master_skip_tx_api=@@session.rocksdb_master_skip_tx_api; +DROP TABLE IF EXISTS t1,t2; +create table t1 (id int primary key, value int, value2 varchar(200)) engine=rocksdb; +create table t2 (id int primary key, value int, value2 varchar(200), index(value)) engine=rocksdb; +SET session rocksdb_blind_delete_primary_key=1; +select variable_value into @c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; +select variable_value-@c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; +variable_value-@c +1000 +SELECT count(*) FROM t1; +count(*) +9000 +include/sync_slave_sql_with_master.inc +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +SELECT count(*) FROM t1; +count(*) +9000 +select variable_value into @c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; +select variable_value-@c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; +variable_value-@c +0 +SELECT count(*) FROM t2; +count(*) +9000 +SET session rocksdb_master_skip_tx_api=1; +select variable_value into @c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; +select variable_value-@c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; +variable_value-@c +1000 +SELECT count(*) FROM t1; +count(*) +8000 +SELECT count(*) FROM t2; +count(*) +8000 +include/sync_slave_sql_with_master.inc +SELECT count(*) FROM t1; +count(*) +8000 +SELECT count(*) FROM t2; +count(*) +8000 +select variable_value into @c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; +DELETE FROM t1 WHERE id BETWEEN 3001 AND 4000; +DELETE FROM t2 WHERE id BETWEEN 3001 AND 4000; +select variable_value-@c from information_schema.global_status where 
variable_name='rocksdb_rows_deleted_blind'; +variable_value-@c +0 +SELECT count(*) FROM t1; +count(*) +7000 +SELECT count(*) FROM t2; +count(*) +7000 +include/sync_slave_sql_with_master.inc +SELECT count(*) FROM t1; +count(*) +7000 +SELECT count(*) FROM t2; +count(*) +7000 +DELETE FROM t1 WHERE id = 10; +SELECT count(*) FROM t1; +count(*) +7000 +call mtr.add_suppression("Slave SQL.*Could not execute Delete_rows event on table test.t1.*Error_code.*"); +call mtr.add_suppression("Slave: Can't find record in 't1'.*"); +include/wait_for_slave_sql_error.inc [errno=1032] +set @save_rocksdb_read_free_rpl=@@global.rocksdb_read_free_rpl; +set global rocksdb_read_free_rpl=PK_SK; +START SLAVE; +include/sync_slave_sql_with_master.inc +SELECT count(*) FROM t1; +count(*) +7000 +set global rocksdb_read_free_rpl=@save_rocksdb_read_free_rpl; +SET session rocksdb_blind_delete_primary_key=@save_rocksdb_blind_delete_primary_key; +SET session rocksdb_master_skip_tx_api=@save_rocksdb_master_skip_tx_api; +DROP TABLE t1, t2; +include/rpl_end.inc diff --git a/storage/rocksdb/mysql-test/rocksdb/r/blind_delete_without_tx_api.result b/storage/rocksdb/mysql-test/rocksdb/r/blind_delete_without_tx_api.result deleted file mode 100644 index a3fc25cc81b..00000000000 --- a/storage/rocksdb/mysql-test/rocksdb/r/blind_delete_without_tx_api.result +++ /dev/null @@ -1,85 +0,0 @@ -include/master-slave.inc -Warnings: -Note #### Sending passwords in plain text without SSL/TLS is extremely insecure. -Note #### Storing MySQL user name or password information in the master info repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START SLAVE; see the 'START SLAVE Syntax' in the MySQL Manual for more information. -[connection master] -set @save_rocksdb_blind_delete_primary_key=@@session.rocksdb_blind_delete_primary_key; -set @save_rocksdb_master_skip_tx_api=@@session.rocksdb_master_skip_tx_api; -DROP TABLE IF EXISTS t1,t2; -create table t1 (id int primary key, value int, value2 varchar(200)) engine=rocksdb; -create table t2 (id int primary key, value int, value2 varchar(200), index(value)) engine=rocksdb; -SET session rocksdb_blind_delete_primary_key=1; -select variable_value into @c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; -select variable_value-@c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; -variable_value-@c -1000 -SELECT count(*) FROM t1; -count(*) -9000 -include/sync_slave_sql_with_master.inc -SELECT count(*) FROM t1; -count(*) -9000 -select variable_value into @c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; -select variable_value-@c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; -variable_value-@c -0 -SELECT count(*) FROM t2; -count(*) -9000 -SET session rocksdb_master_skip_tx_api=1; -select variable_value into @c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; -select variable_value-@c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; -variable_value-@c -1000 -SELECT count(*) FROM t1; -count(*) -8000 -SELECT count(*) FROM t2; -count(*) -8000 -include/sync_slave_sql_with_master.inc -SELECT count(*) FROM t1; -count(*) -8000 -SELECT count(*) FROM t2; -count(*) -8000 -select variable_value into @c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; -DELETE FROM t1 WHERE id 
BETWEEN 3001 AND 4000; -DELETE FROM t2 WHERE id BETWEEN 3001 AND 4000; -select variable_value-@c from information_schema.global_status where variable_name='rocksdb_rows_deleted_blind'; -variable_value-@c -0 -SELECT count(*) FROM t1; -count(*) -7000 -SELECT count(*) FROM t2; -count(*) -7000 -include/sync_slave_sql_with_master.inc -SELECT count(*) FROM t1; -count(*) -7000 -SELECT count(*) FROM t2; -count(*) -7000 -DELETE FROM t1 WHERE id = 10; -SELECT count(*) FROM t1; -count(*) -7000 -call mtr.add_suppression("Slave SQL.*Could not execute Delete_rows event on table test.t1.*Error_code.*"); -call mtr.add_suppression("Slave: Can't find record in 't1'.*"); -include/wait_for_slave_sql_error.inc [errno=1032] -set @save_rocksdb_read_free_rpl_tables=@@global.rocksdb_read_free_rpl_tables; -set global rocksdb_read_free_rpl_tables="t.*"; -START SLAVE; -include/sync_slave_sql_with_master.inc -SELECT count(*) FROM t1; -count(*) -7000 -set global rocksdb_read_free_rpl_tables=@save_rocksdb_read_free_rpl_tables; -SET session rocksdb_blind_delete_primary_key=@save_rocksdb_blind_delete_primary_key; -SET session rocksdb_master_skip_tx_api=@save_rocksdb_master_skip_tx_api; -DROP TABLE t1, t2; -include/rpl_end.inc diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter3.result b/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter3.result index c7b5c42f2b3..ebb17d15878 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter3.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter3.result @@ -21,12 +21,15 @@ select case when variable_value-@c > 0 then 'true' else 'false' end from informa case when variable_value-@c > 0 then 'true' else 'false' end true select variable_value into @c from information_schema.global_status where variable_name='rocksdb_bloom_filter_prefix_checked'; +set @tmp_force_index_for_range=@@optimizer_force_index_for_range; +set optimizer_force_index_for_range=on; select id1, id2, link_type, visibility, data, time, version from linktable FORCE INDEX(`id1_type2`) where id1 = 100 and link_type = 1 and time >= 0 and time <= 9223372036854775807 order by time desc; id1 id2 link_type visibility data time version 100 100 1 1 100 100 100 select case when variable_value-@c > 0 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_bloom_filter_prefix_checked'; case when variable_value-@c > 0 then 'true' else 'false' end true +set global optimizer_force_index_for_range=@tmp_force_index_for_range; select variable_value into @c from information_schema.global_status where variable_name='rocksdb_bloom_filter_prefix_checked'; select id1, id2, link_type, visibility, data, time, version from linktable FORCE INDEX(`id1_type3`) where id1 = 100 and time >= 0 and time <= 9223372036854775807 and visibility = 1 order by time desc; id1 id2 link_type visibility data time version @@ -101,7 +104,7 @@ insert into t1 values (21,2,2,0x12FFFFFFFFFF,1); explain select * from t1 where kp0=1 and kp1=1 and kp2=0x12FFFFFFFFFF order by kp3 desc; id select_type table type possible_keys key key_len ref rows Extra -1 SIMPLE t1 index kp12 kp12 28 NULL # Using where; Using index +1 SIMPLE t1 ref kp12 kp12 20 const,const,const # Using where; Using index show status like '%rocksdb_bloom_filter_prefix%'; Variable_name Value rocksdb_bloom_filter_prefix_checked 0 diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter5.result b/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter5.result index 058d3608c75..4cde60d7447 100644 --- 
a/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter5.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter5.result @@ -59,4 +59,27 @@ insert into t4 values (1, 0xFFFF, 0xFFF, 12345); # This must not fail an assert: select * from t4 force index(kp1) where kp1=0xFFFFFFFF and kp2<=0xFFFFFFFF order by kp2 desc; pk kp1 kp2 col1 -drop table t1,t2,t3,t4; +# +# Issue #881: Issue #809 still occurs for reverse scans on forward cfs +# +create table t5 ( +id1 bigint not null, +id2 bigint not null, +id3 varchar(100) not null, +id4 int not null, +id5 int not null, +value bigint, +value2 varchar(100), +primary key (id1, id2, id3, id4) COMMENT 'bf5_1' +) engine=ROCKSDB; +insert into t5 select * from t1; +set global rocksdb_force_flush_memtable_now=1; +# An index scan starting from the end of the table: +explain +select * from t5 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t5 index NULL PRIMARY 122 NULL 1 NULL +select * from t5 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1; +id1 id2 id3 id4 id5 value value2 +1000 2000 2000 10000 10000 1000 aaabbbccc +drop table t1,t2,t3,t4,t5; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load.result b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load.result index 99d36e0e7da..a36f99a7619 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load.result @@ -21,18 +21,20 @@ PRIMARY KEY(pk) COMMENT "cf1", KEY(a) ) ENGINE=ROCKSDB COLLATE 'latin1_bin' PARTITION BY KEY() PARTITIONS 4; set session transaction isolation level repeatable read; -select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; -STAT_TYPE VALUE -DB_NUM_SNAPSHOTS 0 start transaction with consistent snapshot; -select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; -STAT_TYPE VALUE -DB_NUM_SNAPSHOTS 1 +select VALUE > 0 as 'Has opened snapshots' from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; +Has opened snapshots +1 +SET @@GLOBAL.ROCKSDB_UPDATE_CF_OPTIONS= +'cf1={write_buffer_size=8m;target_file_size_base=1m};'; set rocksdb_bulk_load=1; set rocksdb_bulk_load_size=100000; LOAD DATA INFILE INTO TABLE t1; +pk a b LOAD DATA INFILE INTO TABLE t2; +pk a b LOAD DATA INFILE INTO TABLE t3; +pk a b set rocksdb_bulk_load=0; SHOW TABLE STATUS WHERE name LIKE 't%'; Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_rev_cf.result b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_rev_cf.result index 684e0efeeca..b5d3e252c5d 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_rev_cf.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_rev_cf.result @@ -21,18 +21,20 @@ PRIMARY KEY(pk) COMMENT "rev:cf1", KEY(a) ) ENGINE=ROCKSDB COLLATE 'latin1_bin' PARTITION BY KEY() PARTITIONS 4; set session transaction isolation level repeatable read; -select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; -STAT_TYPE VALUE -DB_NUM_SNAPSHOTS 0 start transaction with consistent snapshot; -select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; -STAT_TYPE VALUE -DB_NUM_SNAPSHOTS 1 +select VALUE > 0 as 'Has opened snapshots' from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; +Has opened snapshots 
+1 +SET @@GLOBAL.ROCKSDB_UPDATE_CF_OPTIONS= +'cf1={write_buffer_size=8m;target_file_size_base=1m};'; set rocksdb_bulk_load=1; set rocksdb_bulk_load_size=100000; LOAD DATA INFILE INTO TABLE t1; +pk a b LOAD DATA INFILE INTO TABLE t2; +pk a b LOAD DATA INFILE INTO TABLE t3; +pk a b set rocksdb_bulk_load=0; SHOW TABLE STATUS WHERE name LIKE 't%'; Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_rev_cf_and_data.result b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_rev_cf_and_data.result index 1bee69ec8d4..f46acd41080 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_rev_cf_and_data.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_rev_cf_and_data.result @@ -21,18 +21,20 @@ PRIMARY KEY(pk) COMMENT "rev:cf1", KEY(a) ) ENGINE=ROCKSDB COLLATE 'latin1_bin' PARTITION BY KEY() PARTITIONS 4; set session transaction isolation level repeatable read; -select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; -STAT_TYPE VALUE -DB_NUM_SNAPSHOTS 0 start transaction with consistent snapshot; -select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; -STAT_TYPE VALUE -DB_NUM_SNAPSHOTS 1 +select VALUE > 0 as 'Has opened snapshots' from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; +Has opened snapshots +1 +SET @@GLOBAL.ROCKSDB_UPDATE_CF_OPTIONS= +'cf1={write_buffer_size=8m;target_file_size_base=1m};'; set rocksdb_bulk_load=1; set rocksdb_bulk_load_size=100000; LOAD DATA INFILE INTO TABLE t1; +pk a b LOAD DATA INFILE INTO TABLE t2; +pk a b LOAD DATA INFILE INTO TABLE t3; +pk a b set rocksdb_bulk_load=0; SHOW TABLE STATUS WHERE name LIKE 't%'; Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_rev_data.result b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_rev_data.result index 403d72e185a..3389968ef37 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_rev_data.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_rev_data.result @@ -21,18 +21,20 @@ PRIMARY KEY(pk) COMMENT "cf1", KEY(a) ) ENGINE=ROCKSDB COLLATE 'latin1_bin' PARTITION BY KEY() PARTITIONS 4; set session transaction isolation level repeatable read; -select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; -STAT_TYPE VALUE -DB_NUM_SNAPSHOTS 0 start transaction with consistent snapshot; -select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; -STAT_TYPE VALUE -DB_NUM_SNAPSHOTS 1 +select VALUE > 0 as 'Has opened snapshots' from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; +Has opened snapshots +1 +SET @@GLOBAL.ROCKSDB_UPDATE_CF_OPTIONS= +'cf1={write_buffer_size=8m;target_file_size_base=1m};'; set rocksdb_bulk_load=1; set rocksdb_bulk_load_size=100000; LOAD DATA INFILE INTO TABLE t1; +pk a b LOAD DATA INFILE INTO TABLE t2; +pk a b LOAD DATA INFILE INTO TABLE t3; +pk a b set rocksdb_bulk_load=0; SHOW TABLE STATUS WHERE name LIKE 't%'; Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment diff --git 
a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_unsorted.result b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_unsorted.result index 0245edf735b..924032549ac 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_unsorted.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_unsorted.result @@ -58,13 +58,10 @@ ENGINE=ROCKSDB; CREATE TABLE t3(a INT, b INT, PRIMARY KEY(a) COMMENT "cf1") ENGINE=ROCKSDB PARTITION BY KEY() PARTITIONS 4; set session transaction isolation level repeatable read; -select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; -STAT_TYPE VALUE -DB_NUM_SNAPSHOTS 0 start transaction with consistent snapshot; -select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; -STAT_TYPE VALUE -DB_NUM_SNAPSHOTS 1 +select VALUE > 0 as 'Has opened snapshots' from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; +Has opened snapshots +1 set rocksdb_bulk_load=1; set rocksdb_bulk_load_size=100000; LOAD DATA INFILE INTO TABLE t1; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_unsorted_rev.result b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_unsorted_rev.result index fcd05fd60b4..3cc9fb8e459 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_unsorted_rev.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_unsorted_rev.result @@ -58,13 +58,10 @@ ENGINE=ROCKSDB; CREATE TABLE t3(a INT, b INT, PRIMARY KEY(a) COMMENT "rev:cf1") ENGINE=ROCKSDB PARTITION BY KEY() PARTITIONS 4; set session transaction isolation level repeatable read; -select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; -STAT_TYPE VALUE -DB_NUM_SNAPSHOTS 0 start transaction with consistent snapshot; -select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; -STAT_TYPE VALUE -DB_NUM_SNAPSHOTS 1 +select VALUE > 0 as 'Has opened snapshots' from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; +Has opened snapshots +1 set rocksdb_bulk_load=1; set rocksdb_bulk_load_size=100000; LOAD DATA INFILE INTO TABLE t1; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bypass_select_basic.result b/storage/rocksdb/mysql-test/rocksdb/r/bypass_select_basic.result new file mode 100644 index 00000000000..1f687dfec53 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/bypass_select_basic.result @@ -0,0 +1,693 @@ +CREATE TABLE `link_table` ( +`id1` bigint(20) unsigned NOT NULL DEFAULT '0' , +`id1_type` int(10) unsigned NOT NULL DEFAULT '0' , +`id2` bigint(20) unsigned NOT NULL DEFAULT '0' , +`id2_type` int(10) unsigned NOT NULL DEFAULT '0' , +`link_type` bigint(20) unsigned NOT NULL DEFAULT '0' , +`visibility` tinyint(3) NOT NULL DEFAULT '0' , +`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '' , +`time` int(10) unsigned NOT NULL DEFAULT '0' , +`version` bigint(20) unsigned NOT NULL DEFAULT '0' , +PRIMARY KEY (`link_type` , `id1` , `id2`) COMMENT 'cf_link' , +KEY `id1_type` (`id1` , `link_type` , `visibility` , `time` , `id2` , +`version` , `data`) COMMENT 'rev:cf_link_id1_type' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; +CREATE TABLE `link_table2` ( +`id1` bigint(20) unsigned NOT NULL DEFAULT '0' , +`id1_type` int(10) unsigned NOT NULL DEFAULT '0' , +`id2` bigint(20) unsigned NOT NULL DEFAULT '0' , +`id2_type` int(10) unsigned NOT NULL DEFAULT '0' , +`link_type` bigint(20) unsigned NOT NULL DEFAULT '0' , +`visibility` tinyint(3) NOT NULL DEFAULT '0' , +`data` varchar(255) COLLATE 
latin1_bin NOT NULL DEFAULT '' , +`time` int(10) unsigned NOT NULL DEFAULT '0' , +`version` bigint(20) unsigned NOT NULL DEFAULT '0' , +PRIMARY KEY (`link_type` , `id1` , `id2`) +COMMENT 'cf_link' , +KEY `id1_type` (`id1` , `link_type` , `visibility` , `time` , `id2` , +`version` , `data`) COMMENT 'cf_link_id1_type' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=9; +insert into link_table values (1, 1, 1, 2, 3, 4, 'a10', 10, 125); +insert into link_table values (1, 1, 2, 2, 3, 3, 'a10', 10, 125); +insert into link_table values (1, 1, 3, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (1, 1, 4, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (1, 1, 5, 2, 3, 3, 'a12', 12, 125); +insert into link_table values (1, 1, 6, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (1, 1, 7, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (1, 1, 8, 2, 3, 4, 'a13', 13, 125); +insert into link_table values (1, 1, 9, 2, 3, 4, 'a14', 14, 125); +insert into link_table values (1, 1, 10, 2, 3, 4, 'a15', 15, 125); +insert into link_table values (2, 1, 1, 2, 3, 4, 'a10', 10, 125); +insert into link_table values (2, 1, 2, 2, 3, 4, 'a10', 10, 125); +insert into link_table values (2, 1, 3, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (2, 1, 4, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (2, 1, 5, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 6, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 7, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 8, 2, 3, 4, 'a13', 13, 125); +insert into link_table values (2, 1, 9, 2, 3, 4, 'a14', 14, 125); +insert into link_table values (2, 1, 10, 2, 3, 4, 'a15', 15, 125); +insert into link_table values (2, 1, 1, 2, 4, 4, 'a10', 10, 125); +insert into link_table values (2, 1, 2, 2, 4, 4, 'a10', 10, 125); +insert into link_table values (2, 1, 3, 2, 4, 4, 'a11', 11, 125); +insert into link_table values (2, 1, 4, 2, 4, 4, 'a11', 11, 125); +insert into link_table values (2, 1, 5, 2, 4, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 6, 2, 4, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 7, 2, 4, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 8, 2, 4, 4, 'a13', 13, 125); +insert into link_table values (2, 1, 9, 2, 4, 4, 'a14', 14, 125); +insert into link_table values (2, 1, 10, 2, 4, 4, 'a15', 15, 125); +insert into link_table values (3, 1, 10, 2, 3, 4, 'a10', 10, 125); +insert into link_table values (3, 1, 9, 2, 3, 4, 'a10', 10, 125); +insert into link_table values (3, 1, 8, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (3, 1, 7, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (3, 1, 6, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (3, 1, 5, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (3, 1, 4, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (3, 1, 3, 2, 3, 4, 'a13', 13, 125); +insert into link_table values (3, 1, 2, 2, 3, 4, 'a14', 14, 125); +insert into link_table values (3, 1, 1, 2, 3, 4, 'a15', 15, 125); +insert into link_table values (9, 1, 9, 2, 5, 6, '0 ', 10, 125); +insert into link_table values (9, 1, 8, 2, 5, 6, '01 ', 11, 125); +insert into link_table values (9, 1, 7, 2, 5, 6, '012 ', 11, 125); +insert into link_table values (9, 1, 6, 2, 5, 6, '0123 ', 12, 125); +insert into link_table values (9, 1, 5, 2, 5, 6, '01234 ', 12, 125); +insert into link_table values (9, 1, 4, 2, 5, 6, '012345 ', 12, 125); +insert into link_table values (9, 1, 3, 
2, 5, 6, '0123456 ', 13, 125); +insert into link_table values (9, 1, 2, 2, 5, 6, '01234567 ', 14, 125); +insert into link_table values (9, 1, 1, 2, 5, 6, '012345678 ', 15, 125); +insert into link_table values (9, 1, 0, 2, 5, 6, '0123456789 ', 15, 125); +insert into link_table2 select * from link_table; +CREATE TABLE `id_table` ( +`id` bigint(20) NOT NULL DEFAULT '0', +`type` int(11) NOT NULL DEFAULT '0', +`row_created_time` int(11) NOT NULL DEFAULT '0', +`hash_key` varchar(255) NOT NULL DEFAULT '', +`is_deleted` tinyint(4) DEFAULT '0', +PRIMARY KEY (`id`), +KEY `type_id` (`type`,`id`) +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 ROW_FORMAT=COMPRESSED +KEY_BLOCK_SIZE=8; +insert into id_table values (1, 1, 10, '111', 0); +insert into id_table values (2, 1, 10, '111', 1); +insert into id_table values (3, 1, 10, '111', 0); +insert into id_table values (4, 1, 10, '111', 1); +insert into id_table values (5, 1, 10, '111', 0); +insert into id_table values (6, 1, 10, '111', 1); +insert into id_table values (7, 1, 10, '111', 0); +insert into id_table values (8, 1, 10, '111', 1); +insert into id_table values (9, 1, 10, '111', 0); +insert into id_table values (10, 1, 10, '111', 1); +CREATE TABLE `node_table` ( +`id` bigint(20) unsigned NOT NULL DEFAULT '0', +`type` int(10) unsigned NOT NULL DEFAULT '0', +`version` bigint(20) unsigned NOT NULL DEFAULT '0', +`update_time` int(10) unsigned NOT NULL DEFAULT '0', +`data` mediumtext COLLATE latin1_bin NOT NULL, +PRIMARY KEY (`type`,`id`) COMMENT 'cf_node_type_id', +KEY `id` (`id`) COMMENT 'cf_node' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; +insert into node_table values (1, 1, 1, 10, 'data'); +insert into node_table values (2, 1, 1, 10, 'data'); +insert into node_table values (3, 1, 1, 10, 'data'); +insert into node_table values (4, 1, 1, 10, 'data'); +insert into node_table values (5, 1, 1, 10, 'data'); +insert into node_table values (6, 1, 1, 10, 'data'); +insert into node_table values (7, 1, 1, 10, 'data'); +insert into node_table values (8, 1, 1, 10, 'data'); +insert into node_table values (9, 1, 1, 10, 'data'); +insert into node_table values (10, 1, 1, 10, 'data'); +CREATE TABLE `count_table` ( +`id` bigint(20) unsigned NOT NULL DEFAULT '0', +`type` int(10) unsigned NOT NULL DEFAULT '0', +`link_type` bigint(20) unsigned NOT NULL DEFAULT '0', +`count` int(10) unsigned NOT NULL DEFAULT '0', +`time` int(10) unsigned NOT NULL DEFAULT '0', +`version` bigint(20) unsigned NOT NULL DEFAULT '0', +PRIMARY KEY (`id`,`link_type`) COMMENT 'cf_count_table' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; +insert into count_table values (2, 1, 1, 1, 10, 20); +insert into count_table values (3, 1, 1, 1, 10, 20); +insert into count_table values (4, 1, 1, 1, 10, 20); +insert into count_table values (5, 1, 1, 1, 10, 20); +insert into count_table values (6, 1, 1, 1, 10, 20); +insert into count_table values (7, 1, 1, 1, 10, 20); +insert into count_table values (8, 1, 1, 1, 10, 20); +insert into count_table values (9, 1, 1, 1, 10, 20); +insert into count_table values (10, 1, 1, 1, 10, 20); +CREATE TABLE `link_table5` ( +`id1` bigint(20) unsigned NOT NULL DEFAULT '0', +`id1_type` int(10) unsigned NOT NULL DEFAULT '0', +`id2` bigint(20) unsigned NOT NULL DEFAULT '0', +`id2_type` int(10) unsigned NOT NULL DEFAULT '0', +`link_type` bigint(20) unsigned NOT NULL DEFAULT '0', +`visibility` tinyint(3) NOT NULL DEFAULT '0', +`data` varchar(255) COLLATE latin1_bin NOT NULL 
DEFAULT '', +`time` int(10) unsigned NOT NULL DEFAULT '0', +`version` bigint(20) unsigned NOT NULL DEFAULT '0', +PRIMARY KEY (`link_type`,`id1`,`id2`) COMMENT 'cf_link' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; +insert into link_table5 values (1, 1, 2, 2, 1, 1, 'data12', 1, 1); +insert into link_table5 values (1, 1, 3, 2, 1, 2, 'data13', 1, 1); +insert into link_table5 values (1, 1, 4, 2, 1, 2, 'data14', 1, 1); +insert into link_table5 values (1, 1, 5, 2, 1, 1, 'data15', 1, 1); +insert into link_table5 values (2, 1, 1, 2, 1, 1, 'data21', 1, 1); +insert into link_table5 values (2, 1, 2, 2, 1, 1, 'data22', 1, 1); +insert into link_table5 values (2, 1, 3, 2, 1, 1, 'data32', 1, 1); +CREATE TABLE `link_table3` ( +`id1` bigint(20) unsigned NOT NULL DEFAULT '0', +`id1_type` int(10) unsigned NOT NULL DEFAULT '0', +`id2` bigint(20) unsigned NOT NULL DEFAULT '0', +`id2_type` int(10) unsigned NOT NULL DEFAULT '0', +`link_type` bigint(20) unsigned NOT NULL DEFAULT '0', +`visibility` tinyint(4) NOT NULL DEFAULT '0', +`data` text COLLATE latin1_bin NOT NULL, +`time` int(10) unsigned NOT NULL DEFAULT '0', +`version` bigint(20) unsigned NOT NULL DEFAULT '0', +PRIMARY KEY (`link_type`,`id1`,`id2`) COMMENT 'cf_link', +KEY `id1_type` (`id1`,`link_type`,`visibility`,`time`,`id2`,`version`) +COMMENT 'rev:cf_link_id1_type' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4; +insert into link_table3 values (1, 1, 2, 2, 1, 1, 'data12', 1, 1); +insert into link_table3 values (1, 1, 3, 2, 1, 2, 'data13', 1, 1); +insert into link_table3 values (1, 1, 4, 2, 1, 2, 'data14', 1, 1); +insert into link_table3 values (1, 1, 5, 2, 1, 1, 'data15', 1, 1); +insert into link_table3 values (2, 1, 1, 2, 1, 1, 'data21', 1, 1); +insert into link_table3 values (2, 1, 2, 2, 1, 1, 'data22', 1, 1); +insert into link_table3 values (2, 1, 3, 2, 1, 1, 'data32', 1, 1); +CREATE TABLE `link_table6` ( +`id1` bigint(20) unsigned NOT NULL DEFAULT '0', +`id1_type` int(10) unsigned NOT NULL DEFAULT '0', +`id2` bigint(20) unsigned NOT NULL DEFAULT '0', +`id2_type` int(10) unsigned NOT NULL DEFAULT '0', +`link_type` bigint(20) unsigned NOT NULL DEFAULT '0', +`visibility` tinyint(4) NOT NULL DEFAULT '0', +`data` text COLLATE latin1_bin NOT NULL, +`time` int(10) unsigned NOT NULL DEFAULT '0', +`version` bigint(20) unsigned NOT NULL DEFAULT '0', +PRIMARY KEY (`link_type`,`id1`,`id2`) COMMENT 'cf_link', +KEY `id1_type` (`id1`,`link_type`,`visibility`,`time`,`id2`,`version`, +`data`(255)) COMMENT 'rev:cf_link_id1_type' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4; +insert into link_table6 values (1, 1, 2, 2, 1, 1, +'data12_12345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890', 1, 1); +insert into link_table6 values (1, 1, 3, 2, 1, 2, +'data13_12345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890', 1, 1); +insert into link_table6 values (1, 1, 4, 2, 1, 2, 
+'data14_12345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890', 1, 1); +insert into link_table6 values (1, 1, 5, 2, 1, 1, +'data15_12345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890', 1, 1); +insert into link_table6 values (2, 1, 1, 2, 1, 1, +'data21_12345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890', 1, 1); +insert into link_table6 values (2, 1, 2, 2, 1, 1, +'data22_12345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890', 1, 1); +insert into link_table6 values (2, 1, 3, 2, 1, 1, +'data32_12345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890123456789012345678901234567890123456789' + '0123456789012345678901234567890', 1, 1); +CREATE TABLE `link_table4` ( +`id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0', +`raw_key` text COLLATE latin1_bin, +`id2` bigint(20) unsigned NOT NULL DEFAULT '0', +`id2_type` int(10) unsigned NOT NULL DEFAULT '0', +`link_type` bigint(20) unsigned NOT NULL DEFAULT '0', +`visibility` tinyint(3) NOT NULL DEFAULT '0', +`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '', +`time` int(10) unsigned NOT NULL DEFAULT '0', +`version` bigint(20) unsigned NOT NULL DEFAULT '0', +PRIMARY KEY (`link_type`,`id1`,`id2`) COMMENT 'cf_link', +KEY `id1_type` (`id1`,`link_type`,`visibility`,`time`,`id2`,`version`,`data`) +COMMENT 'rev:cf_link_id1_type' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; +insert into link_table4 values ('a1', "rk1", 2, 2, 1, 1, 'data12', 1, 1); +insert into link_table4 values ('a1', "rk2", 3, 2, 1, 2, 'data13', 1, 1); +insert into link_table4 values ('a1', "rk3", 4, 2, 1, 2, 'data14', 1, 1); +insert into link_table4 values ('a1', "rk4", 5, 2, 1, 1, 'data15', 1, 1); +insert into link_table4 values ('b1', "rk5", 1, 2, 1, 1, 'data21', 1, 1); +insert into link_table4 values ('b1', "rk6", 2, 2, 1, 1, 'data22', 1, 1); +insert into link_table4 values ('b1', "rk7", 3, 2, 1, 1, 'data32', 1, 1); +SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +SELECT /*+ no_bypass */ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +SELECT 
id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +SELECT /*+bypassabc*/ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +SELECT /*+bypass */ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +SELECT /* +bypassabc*/ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +SELECT /*aaaaaaaaabbbbbbbbb*/ id1,id2,id1_type,id2_type,data,version +from link_table WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +SELECT /*+*/ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +SELECT /*+b*/ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +SELECT /*+byp*/ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +SELECT /*+bypw*/ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +SELECT /*-b*/ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +SELECT /**/ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +# Point query +SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2 IN (2, 3, 4) and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +1 3 1 2 a11 125 +1 4 1 2 a11 125 +SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2 IN (2) and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1 IN (1) and id2 IN (2) and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1 IN (1, 2) and id2 IN (2, 3, 4) and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +1 3 1 2 a11 125 +1 4 1 2 a11 125 +2 2 1 2 a10 125 +2 3 1 2 a11 125 +2 4 1 2 a11 125 +# Prefix range query +# Prefix range query with SK +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 3 AND time = 10 +ORDER BY TIME DESC LIMIT 10; +id1 id2 link_type visibility data time version +1 2 3 3 a10 10 125 +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 3 AND time = 10 +ORDER BY TIME ASC LIMIT 10; +id1 id2 link_type visibility data time version +1 2 3 3 a10 10 125 +# Prefix range query with SK 
with limits +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC; +id1 id2 link_type visibility data time version +1 10 3 4 a15 15 125 +1 9 3 4 a14 14 125 +1 8 3 4 a13 13 125 +1 7 3 4 a12 12 125 +1 6 3 4 a12 12 125 +1 4 3 4 a11 11 125 +1 3 3 4 a11 11 125 +1 1 3 4 a10 10 125 +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 10; +id1 id2 link_type visibility data time version +1 10 3 4 a15 15 125 +1 9 3 4 a14 14 125 +1 8 3 4 a13 13 125 +1 7 3 4 a12 12 125 +1 6 3 4 a12 12 125 +1 4 3 4 a11 11 125 +1 3 3 4 a11 11 125 +1 1 3 4 a10 10 125 +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 5; +id1 id2 link_type visibility data time version +1 10 3 4 a15 15 125 +1 9 3 4 a14 14 125 +1 8 3 4 a13 13 125 +1 7 3 4 a12 12 125 +1 6 3 4 a12 12 125 +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 1; +id1 id2 link_type visibility data time version +1 10 3 4 a15 15 125 +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 0; +id1 id2 link_type visibility data time version +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 0,10; +id1 id2 link_type visibility data time version +1 10 3 4 a15 15 125 +1 9 3 4 a14 14 125 +1 8 3 4 a13 13 125 +1 7 3 4 a12 12 125 +1 6 3 4 a12 12 125 +1 4 3 4 a11 11 125 +1 3 3 4 a11 11 125 +1 1 3 4 a10 10 125 +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 0,5; +id1 id2 link_type visibility data time version +1 10 3 4 a15 15 125 +1 9 3 4 a14 14 125 +1 8 3 4 a13 13 125 +1 7 3 4 a12 12 125 +1 6 3 4 a12 12 125 +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 0,1; +id1 id2 link_type visibility data time version +1 10 3 4 a15 15 125 +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 1,0; +id1 id2 link_type visibility data time version +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 1,10; +id1 id2 link_type visibility data time version +1 9 3 4 a14 14 125 +1 8 3 4 a13 13 125 +1 7 3 4 a12 12 125 +1 6 3 4 a12 12 125 +1 4 3 4 a11 11 125 +1 3 3 4 a11 11 125 +1 1 3 4 a10 10 125 +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM 
link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 1,5; +id1 id2 link_type visibility data time version +1 9 3 4 a14 14 125 +1 8 3 4 a13 13 125 +1 7 3 4 a12 12 125 +1 6 3 4 a12 12 125 +1 4 3 4 a11 11 125 +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 1,1; +id1 id2 link_type visibility data time version +1 9 3 4 a14 14 125 +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 1,0; +id1 id2 link_type visibility data time version +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 5,10; +id1 id2 link_type visibility data time version +1 4 3 4 a11 11 125 +1 3 3 4 a11 11 125 +1 1 3 4 a10 10 125 +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 5,5; +id1 id2 link_type visibility data time version +1 4 3 4 a11 11 125 +1 3 3 4 a11 11 125 +1 1 3 4 a10 10 125 +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 5,1; +id1 id2 link_type visibility data time version +1 4 3 4 a11 11 125 +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 5,0; +id1 id2 link_type visibility data time version +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 10,10; +id1 id2 link_type visibility data time version +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 10,5; +id1 id2 link_type visibility data time version +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 10,1; +id1 id2 link_type visibility data time version +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (id1_type) +WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10 +ORDER BY TIME DESC LIMIT 10,0; +id1 id2 link_type visibility data time version +# Prefix range query with PK +SELECT /*+ bypass */ id1, id2, link_type FROM link_table FORCE INDEX (PRIMARY) +WHERE link_type=3 and id1=1 ORDER BY id2 DESC; +id1 id2 link_type +1 10 3 +1 9 3 +1 8 3 +1 7 3 +1 6 3 +1 5 3 +1 4 3 +1 3 3 +1 2 3 +1 1 3 +SELECT /*+ bypass */ id1, id2, link_type FROM link_table FORCE INDEX (PRIMARY) +WHERE link_type=3 and id1=1 ORDER BY id2 ASC; +id1 id2 link_type +1 1 3 +1 2 3 +1 3 3 +1 4 3 +1 5 3 +1 6 3 +1 7 3 +1 8 3 +1 9 3 +1 10 3 +# Prefix range query with PK + 
value +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (PRIMARY) +WHERE link_type=3 and id1=1 ORDER BY id2 DESC; +id1 id2 link_type visibility data time version +1 10 3 4 a15 15 125 +1 9 3 4 a14 14 125 +1 8 3 4 a13 13 125 +1 7 3 4 a12 12 125 +1 6 3 4 a12 12 125 +1 5 3 3 a12 12 125 +1 4 3 4 a11 11 125 +1 3 3 4 a11 11 125 +1 2 3 3 a10 10 125 +1 1 3 4 a10 10 125 +SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version +FROM link_table FORCE INDEX (PRIMARY) +WHERE link_type=3 and id1=1 ORDER BY id2 ASC; +id1 id2 link_type visibility data time version +1 1 3 4 a10 10 125 +1 2 3 3 a10 10 125 +1 3 3 4 a11 11 125 +1 4 3 4 a11 11 125 +1 5 3 3 a12 12 125 +1 6 3 4 a12 12 125 +1 7 3 4 a12 12 125 +1 8 3 4 a13 13 125 +1 9 3 4 a14 14 125 +1 10 3 4 a15 15 125 +# Transaction +BEGIN; +SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 a10 125 +UPDATE link_table set data="bcd" WHERE id1=1 and id2=2 and link_type = 3; +SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 bcd 125 +COMMIT; +BEGIN; +SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 bcd 125 +UPDATE link_table set data="cde" WHERE id1=1 and id2=2 and link_type = 3; +SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 cde 125 +ROLLBACK; +SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table +WHERE id1=1 and id2=2 and link_type=3; +id1 id2 id1_type id2_type data version +1 2 1 2 bcd 125 +# Data types +SELECT /*+ bypass */ id1 FROM link_table where link_type="3"; +id1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +3 +3 +3 +3 +3 +3 +3 +3 +3 +3 +SELECT /*+ bypass */ id1 FROM link_table where link_type="3" AND id1="1"; +id1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +SELECT /*+ bypass */ id1 FROM link_table where link_type="3" AND id1=True; +id1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +SELECT /*+ bypass */ id1 FROM link_table where link_type="3" AND id1=b'1'; +id1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +SELECT /*+ bypass */ id1 FROM link_table where link_type="3" AND id1=x'01'; +id1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +SELECT /*+ bypass */ id1 FROM link_table where link_type="3" AND id1=NULL; +id1 +DROP TABLE count_table; +DROP TABLE link_table; +DROP TABLE link_table3; +DROP TABLE link_table2; +DROP TABLE id_table; +DROP TABLE node_table; +DROP TABLE link_table5; +DROP TABLE link_table6; +DROP TABLE link_table4; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bypass_select_basic_bloom.result b/storage/rocksdb/mysql-test/rocksdb/r/bypass_select_basic_bloom.result new file mode 100644 index 00000000000..1f687dfec53 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/bypass_select_basic_bloom.result @@ -0,0 +1,693 @@ +CREATE TABLE `link_table` ( +`id1` bigint(20) unsigned NOT NULL DEFAULT '0' , +`id1_type` int(10) unsigned NOT NULL DEFAULT '0' , +`id2` bigint(20) unsigned NOT NULL DEFAULT '0' , +`id2_type` int(10) unsigned NOT NULL DEFAULT '0' , +`link_type` bigint(20) unsigned NOT NULL DEFAULT '0' , +`visibility` tinyint(3) NOT NULL DEFAULT '0' , +`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '' , +`time` int(10) unsigned 
NOT NULL DEFAULT '0' , +`version` bigint(20) unsigned NOT NULL DEFAULT '0' , +PRIMARY KEY (`link_type` , `id1` , `id2`) COMMENT 'cf_link' , +KEY `id1_type` (`id1` , `link_type` , `visibility` , `time` , `id2` , +`version` , `data`) COMMENT 'rev:cf_link_id1_type' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; +CREATE TABLE `link_table2` ( +`id1` bigint(20) unsigned NOT NULL DEFAULT '0' , +`id1_type` int(10) unsigned NOT NULL DEFAULT '0' , +`id2` bigint(20) unsigned NOT NULL DEFAULT '0' , +`id2_type` int(10) unsigned NOT NULL DEFAULT '0' , +`link_type` bigint(20) unsigned NOT NULL DEFAULT '0' , +`visibility` tinyint(3) NOT NULL DEFAULT '0' , +`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '' , +`time` int(10) unsigned NOT NULL DEFAULT '0' , +`version` bigint(20) unsigned NOT NULL DEFAULT '0' , +PRIMARY KEY (`link_type` , `id1` , `id2`) +COMMENT 'cf_link' , +KEY `id1_type` (`id1` , `link_type` , `visibility` , `time` , `id2` , +`version` , `data`) COMMENT 'cf_link_id1_type' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=9; +insert into link_table values (1, 1, 1, 2, 3, 4, 'a10', 10, 125); +insert into link_table values (1, 1, 2, 2, 3, 3, 'a10', 10, 125); +insert into link_table values (1, 1, 3, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (1, 1, 4, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (1, 1, 5, 2, 3, 3, 'a12', 12, 125); +insert into link_table values (1, 1, 6, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (1, 1, 7, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (1, 1, 8, 2, 3, 4, 'a13', 13, 125); +insert into link_table values (1, 1, 9, 2, 3, 4, 'a14', 14, 125); +insert into link_table values (1, 1, 10, 2, 3, 4, 'a15', 15, 125); +insert into link_table values (2, 1, 1, 2, 3, 4, 'a10', 10, 125); +insert into link_table values (2, 1, 2, 2, 3, 4, 'a10', 10, 125); +insert into link_table values (2, 1, 3, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (2, 1, 4, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (2, 1, 5, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 6, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 7, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 8, 2, 3, 4, 'a13', 13, 125); +insert into link_table values (2, 1, 9, 2, 3, 4, 'a14', 14, 125); +insert into link_table values (2, 1, 10, 2, 3, 4, 'a15', 15, 125); +insert into link_table values (2, 1, 1, 2, 4, 4, 'a10', 10, 125); +insert into link_table values (2, 1, 2, 2, 4, 4, 'a10', 10, 125); +insert into link_table values (2, 1, 3, 2, 4, 4, 'a11', 11, 125); +insert into link_table values (2, 1, 4, 2, 4, 4, 'a11', 11, 125); +insert into link_table values (2, 1, 5, 2, 4, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 6, 2, 4, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 7, 2, 4, 4, 'a12', 12, 125); +insert into link_table values (2, 1, 8, 2, 4, 4, 'a13', 13, 125); +insert into link_table values (2, 1, 9, 2, 4, 4, 'a14', 14, 125); +insert into link_table values (2, 1, 10, 2, 4, 4, 'a15', 15, 125); +insert into link_table values (3, 1, 10, 2, 3, 4, 'a10', 10, 125); +insert into link_table values (3, 1, 9, 2, 3, 4, 'a10', 10, 125); +insert into link_table values (3, 1, 8, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (3, 1, 7, 2, 3, 4, 'a11', 11, 125); +insert into link_table values (3, 1, 6, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (3, 1, 5, 2, 3, 4, 
'a12', 12, 125); +insert into link_table values (3, 1, 4, 2, 3, 4, 'a12', 12, 125); +insert into link_table values (3, 1, 3, 2, 3, 4, 'a13', 13, 125); +insert into link_table values (3, 1, 2, 2, 3, 4, 'a14', 14, 125); +insert into link_table values (3, 1, 1, 2, 3, 4, 'a15', 15, 125); +insert into link_table values (9, 1, 9, 2, 5, 6, '0 ', 10, 125); +insert into link_table values (9, 1, 8, 2, 5, 6, '01 ', 11, 125); +insert into link_table values (9, 1, 7, 2, 5, 6, '012 ', 11, 125); +insert into link_table values (9, 1, 6, 2, 5, 6, '0123 ', 12, 125); +insert into link_table values (9, 1, 5, 2, 5, 6, '01234 ', 12, 125); +insert into link_table values (9, 1, 4, 2, 5, 6, '012345 ', 12, 125); +insert into link_table values (9, 1, 3, 2, 5, 6, '0123456 ', 13, 125); +insert into link_table values (9, 1, 2, 2, 5, 6, '01234567 ', 14, 125); +insert into link_table values (9, 1, 1, 2, 5, 6, '012345678 ', 15, 125); +insert into link_table values (9, 1, 0, 2, 5, 6, '0123456789 ', 15, 125); +insert into link_table2 select * from link_table; +CREATE TABLE `id_table` ( +`id` bigint(20) NOT NULL DEFAULT '0', +`type` int(11) NOT NULL DEFAULT '0', +`row_created_time` int(11) NOT NULL DEFAULT '0', +`hash_key` varchar(255) NOT NULL DEFAULT '', +`is_deleted` tinyint(4) DEFAULT '0', +PRIMARY KEY (`id`), +KEY `type_id` (`type`,`id`) +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 ROW_FORMAT=COMPRESSED +KEY_BLOCK_SIZE=8; +insert into id_table values (1, 1, 10, '111', 0); +insert into id_table values (2, 1, 10, '111', 1); +insert into id_table values (3, 1, 10, '111', 0); +insert into id_table values (4, 1, 10, '111', 1); +insert into id_table values (5, 1, 10, '111', 0); +insert into id_table values (6, 1, 10, '111', 1); +insert into id_table values (7, 1, 10, '111', 0); +insert into id_table values (8, 1, 10, '111', 1); +insert into id_table values (9, 1, 10, '111', 0); +insert into id_table values (10, 1, 10, '111', 1); +CREATE TABLE `node_table` ( +`id` bigint(20) unsigned NOT NULL DEFAULT '0', +`type` int(10) unsigned NOT NULL DEFAULT '0', +`version` bigint(20) unsigned NOT NULL DEFAULT '0', +`update_time` int(10) unsigned NOT NULL DEFAULT '0', +`data` mediumtext COLLATE latin1_bin NOT NULL, +PRIMARY KEY (`type`,`id`) COMMENT 'cf_node_type_id', +KEY `id` (`id`) COMMENT 'cf_node' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; +insert into node_table values (1, 1, 1, 10, 'data'); +insert into node_table values (2, 1, 1, 10, 'data'); +insert into node_table values (3, 1, 1, 10, 'data'); +insert into node_table values (4, 1, 1, 10, 'data'); +insert into node_table values (5, 1, 1, 10, 'data'); +insert into node_table values (6, 1, 1, 10, 'data'); +insert into node_table values (7, 1, 1, 10, 'data'); +insert into node_table values (8, 1, 1, 10, 'data'); +insert into node_table values (9, 1, 1, 10, 'data'); +insert into node_table values (10, 1, 1, 10, 'data'); +CREATE TABLE `count_table` ( +`id` bigint(20) unsigned NOT NULL DEFAULT '0', +`type` int(10) unsigned NOT NULL DEFAULT '0', +`link_type` bigint(20) unsigned NOT NULL DEFAULT '0', +`count` int(10) unsigned NOT NULL DEFAULT '0', +`time` int(10) unsigned NOT NULL DEFAULT '0', +`version` bigint(20) unsigned NOT NULL DEFAULT '0', +PRIMARY KEY (`id`,`link_type`) COMMENT 'cf_count_table' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; +insert into count_table values (2, 1, 1, 1, 10, 20); +insert into count_table values (3, 1, 1, 1, 10, 20); +insert into count_table 
values (4, 1, 1, 1, 10, 20); +insert into count_table values (5, 1, 1, 1, 10, 20); +insert into count_table values (6, 1, 1, 1, 10, 20); +insert into count_table values (7, 1, 1, 1, 10, 20); +insert into count_table values (8, 1, 1, 1, 10, 20); +insert into count_table values (9, 1, 1, 1, 10, 20); +insert into count_table values (10, 1, 1, 1, 10, 20); +CREATE TABLE `link_table5` ( +`id1` bigint(20) unsigned NOT NULL DEFAULT '0', +`id1_type` int(10) unsigned NOT NULL DEFAULT '0', +`id2` bigint(20) unsigned NOT NULL DEFAULT '0', +`id2_type` int(10) unsigned NOT NULL DEFAULT '0', +`link_type` bigint(20) unsigned NOT NULL DEFAULT '0', +`visibility` tinyint(3) NOT NULL DEFAULT '0', +`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '', +`time` int(10) unsigned NOT NULL DEFAULT '0', +`version` bigint(20) unsigned NOT NULL DEFAULT '0', +PRIMARY KEY (`link_type`,`id1`,`id2`) COMMENT 'cf_link' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; +insert into link_table5 values (1, 1, 2, 2, 1, 1, 'data12', 1, 1); +insert into link_table5 values (1, 1, 3, 2, 1, 2, 'data13', 1, 1); +insert into link_table5 values (1, 1, 4, 2, 1, 2, 'data14', 1, 1); +insert into link_table5 values (1, 1, 5, 2, 1, 1, 'data15', 1, 1); +insert into link_table5 values (2, 1, 1, 2, 1, 1, 'data21', 1, 1); +insert into link_table5 values (2, 1, 2, 2, 1, 1, 'data22', 1, 1); +insert into link_table5 values (2, 1, 3, 2, 1, 1, 'data32', 1, 1); +CREATE TABLE `link_table3` ( +`id1` bigint(20) unsigned NOT NULL DEFAULT '0', +`id1_type` int(10) unsigned NOT NULL DEFAULT '0', +`id2` bigint(20) unsigned NOT NULL DEFAULT '0', +`id2_type` int(10) unsigned NOT NULL DEFAULT '0', +`link_type` bigint(20) unsigned NOT NULL DEFAULT '0', +`visibility` tinyint(4) NOT NULL DEFAULT '0', +`data` text COLLATE latin1_bin NOT NULL, +`time` int(10) unsigned NOT NULL DEFAULT '0', +`version` bigint(20) unsigned NOT NULL DEFAULT '0', +PRIMARY KEY (`link_type`,`id1`,`id2`) COMMENT 'cf_link', +KEY `id1_type` (`id1`,`link_type`,`visibility`,`time`,`id2`,`version`) +COMMENT 'rev:cf_link_id1_type' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4; +insert into link_table3 values (1, 1, 2, 2, 1, 1, 'data12', 1, 1); +insert into link_table3 values (1, 1, 3, 2, 1, 2, 'data13', 1, 1); +insert into link_table3 values (1, 1, 4, 2, 1, 2, 'data14', 1, 1); +insert into link_table3 values (1, 1, 5, 2, 1, 1, 'data15', 1, 1); +insert into link_table3 values (2, 1, 1, 2, 1, 1, 'data21', 1, 1); +insert into link_table3 values (2, 1, 2, 2, 1, 1, 'data22', 1, 1); +insert into link_table3 values (2, 1, 3, 2, 1, 1, 'data32', 1, 1); +CREATE TABLE `link_table6` ( +`id1` bigint(20) unsigned NOT NULL DEFAULT '0', +`id1_type` int(10) unsigned NOT NULL DEFAULT '0', +`id2` bigint(20) unsigned NOT NULL DEFAULT '0', +`id2_type` int(10) unsigned NOT NULL DEFAULT '0', +`link_type` bigint(20) unsigned NOT NULL DEFAULT '0', +`visibility` tinyint(4) NOT NULL DEFAULT '0', +`data` text COLLATE latin1_bin NOT NULL, +`time` int(10) unsigned NOT NULL DEFAULT '0', +`version` bigint(20) unsigned NOT NULL DEFAULT '0', +PRIMARY KEY (`link_type`,`id1`,`id2`) COMMENT 'cf_link', +KEY `id1_type` (`id1`,`link_type`,`visibility`,`time`,`id2`,`version`, +`data`(255)) COMMENT 'rev:cf_link_id1_type' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin +ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4; +insert into link_table6 values (1, 1, 2, 2, 1, 1, 
+'data12_12345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890', 1, 1);
+insert into link_table6 values (1, 1, 3, 2, 1, 2,
+'data13_12345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890', 1, 1);
+insert into link_table6 values (1, 1, 4, 2, 1, 2,
+'data14_12345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890', 1, 1);
+insert into link_table6 values (1, 1, 5, 2, 1, 1,
+'data15_12345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890', 1, 1);
+insert into link_table6 values (2, 1, 1, 2, 1, 1,
+'data21_12345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890', 1, 1);
+insert into link_table6 values (2, 1, 2, 2, 1, 1,
+'data22_12345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890', 1, 1);
+insert into link_table6 values (2, 1, 3, 2, 1, 1,
+'data32_12345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890123456789012345678901234567890123456789'
+ '0123456789012345678901234567890', 1, 1);
+CREATE TABLE `link_table4` (
+`id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0',
+`raw_key` text COLLATE latin1_bin,
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`link_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(3) NOT NULL DEFAULT '0',
+`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`link_type`,`id1`,`id2`) COMMENT 'cf_link',
+KEY `id1_type` (`id1`,`link_type`,`visibility`,`time`,`id2`,`version`,`data`)
+COMMENT 'rev:cf_link_id1_type'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin
+ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+insert into link_table4 values ('a1', "rk1", 2, 2, 1, 1, 'data12', 1, 1);
+insert into link_table4 values ('a1', "rk2", 3, 2, 1, 2, 'data13', 1, 1);
+insert into link_table4 values ('a1', "rk3", 4, 2, 1, 2, 'data14', 1, 1);
+insert into link_table4 values ('a1', "rk4", 5, 2, 1, 1, 'data15', 1, 1);
+insert into link_table4 values ('b1', "rk5", 1, 2, 1, 1, 'data21', 1, 1);
+insert into link_table4 values ('b1', "rk6", 2, 2, 1, 1, 'data22', 1, 1);
+insert into link_table4 values ('b1', "rk7", 3, 2, 1, 1, 'data32', 1, 1);
+SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+SELECT /*+ no_bypass */ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+SELECT id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+SELECT /*+bypassabc*/ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+SELECT /*+bypass */ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+SELECT /*
+bypassabc*/ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+SELECT /*aaaaaaaaabbbbbbbbb*/ id1,id2,id1_type,id2_type,data,version
+from link_table WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+SELECT /*+*/ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+SELECT /*+b*/ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+SELECT /*+byp*/ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+SELECT /*+bypw*/ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+SELECT /*-b*/ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+SELECT /**/ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+# Point query
+SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2 IN (2, 3, 4) and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+1 3 1 2 a11 125
+1 4 1 2 a11 125
+SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2 IN (2) and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1 IN (1) and id2 IN (2) and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1 IN (1, 2) and id2 IN (2, 3, 4) and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+1 3 1 2 a11 125
+1 4 1 2 a11 125
+2 2 1 2 a10 125
+2 3 1 2 a11 125
+2 4 1 2 a11 125
+# Prefix range query
+# Prefix range query with SK
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 3 AND time = 10
+ORDER BY TIME DESC LIMIT 10;
+id1 id2 link_type visibility data time version
+1 2 3 3 a10 10 125
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 3 AND time = 10
+ORDER BY TIME ASC LIMIT 10;
+id1 id2 link_type visibility data time version
+1 2 3 3 a10 10 125
+# Prefix range query with SK with limits
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC;
+id1 id2 link_type visibility data time version
+1 10 3 4 a15 15 125
+1 9 3 4 a14 14 125
+1 8 3 4 a13 13 125
+1 7 3 4 a12 12 125
+1 6 3 4 a12 12 125
+1 4 3 4 a11 11 125
+1 3 3 4 a11 11 125
+1 1 3 4 a10 10 125
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 10;
+id1 id2 link_type visibility data time version
+1 10 3 4 a15 15 125
+1 9 3 4 a14 14 125
+1 8 3 4 a13 13 125
+1 7 3 4 a12 12 125
+1 6 3 4 a12 12 125
+1 4 3 4 a11 11 125
+1 3 3 4 a11 11 125
+1 1 3 4 a10 10 125
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 5;
+id1 id2 link_type visibility data time version
+1 10 3 4 a15 15 125
+1 9 3 4 a14 14 125
+1 8 3 4 a13 13 125
+1 7 3 4 a12 12 125
+1 6 3 4 a12 12 125
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 1;
+id1 id2 link_type visibility data time version
+1 10 3 4 a15 15 125
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 0;
+id1 id2 link_type visibility data time version
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 0,10;
+id1 id2 link_type visibility data time version
+1 10 3 4 a15 15 125
+1 9 3 4 a14 14 125
+1 8 3 4 a13 13 125
+1 7 3 4 a12 12 125
+1 6 3 4 a12 12 125
+1 4 3 4 a11 11 125
+1 3 3 4 a11 11 125
+1 1 3 4 a10 10 125
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 0,5;
+id1 id2 link_type visibility data time version
+1 10 3 4 a15 15 125
+1 9 3 4 a14 14 125
+1 8 3 4 a13 13 125
+1 7 3 4 a12 12 125
+1 6 3 4 a12 12 125
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 0,1;
+id1 id2 link_type visibility data time version
+1 10 3 4 a15 15 125
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 1,0;
+id1 id2 link_type visibility data time version
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 1,10;
+id1 id2 link_type visibility data time version
+1 9 3 4 a14 14 125
+1 8 3 4 a13 13 125
+1 7 3 4 a12 12 125
+1 6 3 4 a12 12 125
+1 4 3 4 a11 11 125
+1 3 3 4 a11 11 125
+1 1 3 4 a10 10 125
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 1,5;
+id1 id2 link_type visibility data time version
+1 9 3 4 a14 14 125
+1 8 3 4 a13 13 125
+1 7 3 4 a12 12 125
+1 6 3 4 a12 12 125
+1 4 3 4 a11 11 125
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 1,1;
+id1 id2 link_type visibility data time version
+1 9 3 4 a14 14 125
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 1,0;
+id1 id2 link_type visibility data time version
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 5,10;
+id1 id2 link_type visibility data time version
+1 4 3 4 a11 11 125
+1 3 3 4 a11 11 125
+1 1 3 4 a10 10 125
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 5,5;
+id1 id2 link_type visibility data time version
+1 4 3 4 a11 11 125
+1 3 3 4 a11 11 125
+1 1 3 4 a10 10 125
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 5,1;
+id1 id2 link_type visibility data time version
+1 4 3 4 a11 11 125
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 5,0;
+id1 id2 link_type visibility data time version
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 10,10;
+id1 id2 link_type visibility data time version
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 10,5;
+id1 id2 link_type visibility data time version
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 10,1;
+id1 id2 link_type visibility data time version
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (id1_type)
+WHERE link_type = 3 AND id1 = 1 AND visibility = 4 AND time >= 10
+ORDER BY TIME DESC LIMIT 10,0;
+id1 id2 link_type visibility data time version
+# Prefix range query with PK
+SELECT /*+ bypass */ id1, id2, link_type FROM link_table FORCE INDEX (PRIMARY)
+WHERE link_type=3 and id1=1 ORDER BY id2 DESC;
+id1 id2 link_type
+1 10 3
+1 9 3
+1 8 3
+1 7 3
+1 6 3
+1 5 3
+1 4 3
+1 3 3
+1 2 3
+1 1 3
+SELECT /*+ bypass */ id1, id2, link_type FROM link_table FORCE INDEX (PRIMARY)
+WHERE link_type=3 and id1=1 ORDER BY id2 ASC;
+id1 id2 link_type
+1 1 3
+1 2 3
+1 3 3
+1 4 3
+1 5 3
+1 6 3
+1 7 3
+1 8 3
+1 9 3
+1 10 3
+# Prefix range query with PK + value
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (PRIMARY)
+WHERE link_type=3 and id1=1 ORDER BY id2 DESC;
+id1 id2 link_type visibility data time version
+1 10 3 4 a15 15 125
+1 9 3 4 a14 14 125
+1 8 3 4 a13 13 125
+1 7 3 4 a12 12 125
+1 6 3 4 a12 12 125
+1 5 3 3 a12 12 125
+1 4 3 4 a11 11 125
+1 3 3 4 a11 11 125
+1 2 3 3 a10 10 125
+1 1 3 4 a10 10 125
+SELECT /*+ bypass */ id1, id2, link_type, visibility, data, time, version
+FROM link_table FORCE INDEX (PRIMARY)
+WHERE link_type=3 and id1=1 ORDER BY id2 ASC;
+id1 id2 link_type visibility data time version
+1 1 3 4 a10 10 125
+1 2 3 3 a10 10 125
+1 3 3 4 a11 11 125
+1 4 3 4 a11 11 125
+1 5 3 3 a12 12 125
+1 6 3 4 a12 12 125
+1 7 3 4 a12 12 125
+1 8 3 4 a13 13 125
+1 9 3 4 a14 14 125
+1 10 3 4 a15 15 125
+# Transaction
+BEGIN;
+SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 a10 125
+UPDATE link_table set data="bcd" WHERE id1=1 and id2=2 and link_type = 3;
+SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 bcd 125
+COMMIT;
+BEGIN;
+SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 bcd 125
+UPDATE link_table set data="cde" WHERE id1=1 and id2=2 and link_type = 3;
+SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 cde 125
+ROLLBACK;
+SELECT /*+ bypass */ id1,id2,id1_type,id2_type,data,version from link_table
+WHERE id1=1 and id2=2 and link_type=3;
+id1 id2 id1_type id2_type data version
+1 2 1 2 bcd 125
+# Data types
+SELECT /*+ bypass */ id1 FROM link_table where link_type="3";
+id1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+2
+2
+2
+2
+2
+2
+2
+2
+2
+2
+3
+3
+3
+3
+3
+3
+3
+3
+3
+3
+SELECT /*+ bypass */ id1 FROM link_table where link_type="3" AND id1="1";
+id1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+SELECT /*+ bypass */ id1 FROM link_table where link_type="3" AND id1=True;
+id1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+SELECT /*+ bypass */ id1 FROM link_table where link_type="3" AND id1=b'1';
+id1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+SELECT /*+ bypass */ id1 FROM link_table where link_type="3" AND id1=x'01';
+id1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+SELECT /*+ bypass */ id1 FROM link_table where link_type="3" AND id1=NULL;
+id1
+DROP TABLE count_table;
+DROP TABLE link_table;
+DROP TABLE link_table3;
+DROP TABLE link_table2;
+DROP TABLE id_table;
+DROP TABLE node_table;
+DROP TABLE link_table5;
+DROP TABLE link_table6;
+DROP TABLE link_table4;
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/check_flags.result b/storage/rocksdb/mysql-test/rocksdb/r/check_flags.result
new file mode 100644
index 00000000000..32369c12136
--- /dev/null
+++ b/storage/rocksdb/mysql-test/rocksdb/r/check_flags.result
@@ -0,0 +1,47 @@
+set debug_sync='RESET';
+set global rocksdb_debug_ttl_read_filter_ts = -10;
+CREATE TABLE t1 (id INT, value INT, KEY (id), KEY (value)) ENGINE=ROCKSDB;
+CREATE TABLE t2 (id INT, value INT) ENGINE=ROCKSDB;
+CREATE TABLE t3 (id INT, kp1 INT, PRIMARY KEY (id), KEY(kp1)) ENGINE=ROCKSDB COMMENT='ttl_duration=1';
+INSERT INTO t1 VALUES (1,1), (2,2), (3,3), (4,4), (5,5);
+INSERT INTO t2 SELECT * FROM t1;
+INSERT INTO t3 SELECT * FROM t1;
+set debug_sync='rocksdb.check_flags_rmi SIGNAL parked WAIT_FOR go';
+SELECT value FROM t1 WHERE value = 3;
+set debug_sync='now WAIT_FOR parked';
+KILL QUERY $conn1_id;
+set debug_sync='now SIGNAL go';
+ERROR 70100: Query execution was interrupted
+set debug_sync='RESET';
+set debug_sync='rocksdb.check_flags_rmi_scan SIGNAL parked WAIT_FOR go';
+SELECT DISTINCT(id) FROM t1 WHERE value = 5 AND id IN (1, 3, 5);
+set debug_sync='now WAIT_FOR parked';
+KILL QUERY $conn1_id;
+set debug_sync='now SIGNAL go';
+ERROR 70100: Query execution was interrupted
+set debug_sync='RESET';
+set debug_sync='rocksdb.check_flags_inwd SIGNAL parked WAIT_FOR go';
+SELECT value FROM t1 WHERE value > 3;
+set debug_sync='now WAIT_FOR parked';
+KILL QUERY $conn1_id;
+set debug_sync='now SIGNAL go';
+ERROR 70100: Query execution was interrupted
+set debug_sync='RESET';
+set debug_sync='rocksdb.check_flags_rnwd SIGNAL parked WAIT_FOR go';
+SELECT id FROM t2;
+set debug_sync='now WAIT_FOR parked';
+KILL QUERY $conn1_id;
+set debug_sync='now SIGNAL go';
+ERROR 70100: Query execution was interrupted
+set debug_sync='RESET';
+set debug_sync='rocksdb.check_flags_ser SIGNAL parked WAIT_FOR go';
+SELECT kp1 FROM t3 ORDER BY kp1;
+set debug_sync='now WAIT_FOR parked';
+KILL QUERY $conn1_id;
+set debug_sync='now SIGNAL go';
+ERROR 70100: Query execution was interrupted
+set debug_sync='RESET';
+set global rocksdb_debug_ttl_read_filter_ts = DEFAULT;
+DROP TABLE t1;
+DROP TABLE t2;
+DROP TABLE t3;
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/cons_snapshot_read_committed.result b/storage/rocksdb/mysql-test/rocksdb/r/cons_snapshot_read_committed.result
index 7f1e3d8e53f..2b6448d1bf0 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/cons_snapshot_read_committed.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/cons_snapshot_read_committed.result
@@ -5,7 +5,7 @@ connection con1;
 CREATE TABLE t1 (a INT, pk INT AUTO_INCREMENT PRIMARY KEY) ENGINE=ROCKSDB;
 SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
 START TRANSACTION WITH CONSISTENT SNAPSHOT;
-ERROR: 12048
+ERROR: 50048
 connection con2;
 select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS';
 STAT_TYPE VALUE
 DB_NUM_SNAPSHOTS 0
@@ -18,7 +18,7 @@ STAT_TYPE VALUE
 DB_NUM_SNAPSHOTS 0
 connection con1;
 START TRANSACTION WITH CONSISTENT SNAPSHOT;
-ERROR: 12048
+ERROR: 50048
 connection con2;
 INSERT INTO t1 (a) VALUES (1);
 connection con1;
@@ -69,7 +69,7 @@ id value value2
 5 5 5
 6 6 6
 START TRANSACTION WITH CONSISTENT SNAPSHOT;
-ERROR: 12048
+ERROR: 50048
 connection con2;
 INSERT INTO r1 values (7,7,7);
 connection con1;
@@ -107,12 +107,12 @@ id value value2
 7 7 7
 8 8 8
 START TRANSACTION WITH CONSISTENT SNAPSHOT;
-ERROR: 12048
+ERROR: 50048
 connection con2;
 INSERT INTO r1 values (9,9,9);
 connection con1;
 START TRANSACTION WITH CONSISTENT SNAPSHOT;
-ERROR: 12048
+ERROR: 50048
 connection con2;
 INSERT INTO r1 values (10,10,10);
 connection con1;
@@ -129,7 +129,7 @@ id value value2
 9 9 9
 10 10 10
 START TRANSACTION WITH CONSISTENT SNAPSHOT;
-ERROR: 12048
+ERROR: 50048
 INSERT INTO r1 values (11,11,11);
 ERROR: 0
 SELECT * FROM r1;
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/cons_snapshot_repeatable_read.result b/storage/rocksdb/mysql-test/rocksdb/r/cons_snapshot_repeatable_read.result
index 90723ff762c..bec37e4f870 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/cons_snapshot_repeatable_read.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/cons_snapshot_repeatable_read.result
@@ -125,7 +125,7 @@ id value value2
 START TRANSACTION WITH CONSISTENT SNAPSHOT;
 ERROR: 0
 INSERT INTO r1 values (11,11,11);
-ERROR: 12045
+ERROR: 50045
 SELECT * FROM r1;
 id value value2
 1 1 1
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/corrupted_data_reads_debug.result b/storage/rocksdb/mysql-test/rocksdb/r/corrupted_data_reads_debug.result
index b7d0f99c716..d60f1549b43 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/corrupted_data_reads_debug.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/corrupted_data_reads_debug.result
@@ -20,7 +20,7 @@ set @tmp1=@@rocksdb_verify_row_debug_checksums;
 set rocksdb_verify_row_debug_checksums=1;
 set session debug= "+d,myrocks_simulate_bad_row_read1";
 select * from t1 where pk=1;
-ERROR HY000: Got error 199 'Found data corruption.' from ROCKSDB
+ERROR HY000: Got error 200 'Found data corruption.' from ROCKSDB
 set session debug= "-d,myrocks_simulate_bad_row_read1";
 set rocksdb_verify_row_debug_checksums=@tmp1;
 select * from t1 where pk=1;
@@ -28,11 +28,11 @@ pk col1
 1 1
 set session debug= "+d,myrocks_simulate_bad_row_read2";
 select * from t1 where pk=1;
-ERROR HY000: Got error 199 'Found data corruption.' from ROCKSDB
+ERROR HY000: Got error 200 'Found data corruption.' from ROCKSDB
 set session debug= "-d,myrocks_simulate_bad_row_read2";
 set session debug= "+d,myrocks_simulate_bad_row_read3";
 select * from t1 where pk=1;
-ERROR HY000: Got error 199 'Found data corruption.' from ROCKSDB
+ERROR HY000: Got error 200 'Found data corruption.' from ROCKSDB
 set session debug= "-d,myrocks_simulate_bad_row_read3";
 insert into t1 values(4,'0123456789');
 select * from t1;
@@ -56,7 +56,7 @@ pk col1
 ABCD 1
 set session debug= "+d,myrocks_simulate_bad_pk_read1";
 select * from t2;
-ERROR HY000: Got error 199 'Found data corruption.' from ROCKSDB
+ERROR HY000: Got error 200 'Found data corruption.' from ROCKSDB
 set session debug= "-d,myrocks_simulate_bad_pk_read1";
 drop table t2;
 create table t2 (
@@ -69,6 +69,6 @@ pk col1
 ABCD 1
 set session debug= "+d,myrocks_simulate_bad_pk_read1";
 select * from t2;
-ERROR HY000: Got error 199 'Found data corruption.' from ROCKSDB
+ERROR HY000: Got error 200 'Found data corruption.' from ROCKSDB
 set session debug= "-d,myrocks_simulate_bad_pk_read1";
 drop table t2;
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/create_no_primary_key_table.result b/storage/rocksdb/mysql-test/rocksdb/r/create_no_primary_key_table.result
index e5aeb57ebdf..1c45cfd09fe 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/create_no_primary_key_table.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/create_no_primary_key_table.result
@@ -8,7 +8,7 @@ ERROR HY000: Table without primary key cannot be created outside mysql schema.
 CREATE TABLE IF NOT EXISTS mysql_table_2 (a INT) ENGINE=ROCKSDB;
 ERROR HY000: Table without primary key cannot be created outside mysql schema.
 CREATE TABLE mysql_table_no_cols ENGINE=ROCKSDB;
-ERROR HY000: Table without primary key cannot be created outside mysql schema.
+ERROR 42000: A table must have at least 1 column
 CREATE TABLE mysql.mysql_table_2 (a INT) ENGINE=ROCKSDB;
 CREATE TABLE mysql_primkey (a INT PRIMARY KEY, b INT, c INT, d INT, INDEX (c)) ENGINE=ROCKSDB;
 ALTER TABLE mysql_primkey DROP b, DROP a, ADD (f INT PRIMARY KEY);
@@ -29,10 +29,24 @@ DROP INDEX `PRIMARY` ON mysql_primkey4;
 ERROR HY000: Table without primary key cannot be created outside mysql schema.
 ALTER TABLE mysql.mysql_table ADD PRIMARY KEY (a);
 ALTER TABLE mysql.mysql_table DROP PRIMARY KEY;
+SET default_storage_engine=ROCKSDB;
+CREATE TABLE mysql_noeng(a INT, b INT);
+ERROR HY000: Table without primary key cannot be created outside mysql schema.
+SET sql_mode="";
+CREATE TABLE mysql_noeng_sub(a INT, b INT) ENGINE=BOGUS_ENGINE;
+ERROR HY000: Table without primary key cannot be created outside mysql schema.
+CREATE TABLE mysql_primkey5 LIKE mysql_primkey;
+SET @@global.block_create_no_primary_key = false;
+CREATE TABLE mysql_no_primkey (a INT) ENGINE=ROCKSDB;
+SET @@global.block_create_no_primary_key = true;
+CREATE TABLE mysql_block_no_primkey LIKE mysql_no_primkey;
+ERROR HY000: Table without primary key cannot be created outside mysql schema.
 DROP TABLE mysql_primkey;
 DROP TABLE mysql_primkey2;
 DROP TABLE mysql_primkey3;
 DROP TABLE mysql_primkey4;
+DROP TABLE mysql_primkey5;
+DROP TABLE mysql_no_primkey;
 USE mysql;
 DROP TABLE mysql_table;
 DROP TABLE mysql_table_2;
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result b/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result
index 50733f81598..1e2636c873a 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result
@@ -36,8 +36,8 @@ connection: default (for show processlist)
 # both con1 and default exist
 show processlist;
 Id User Host db Command Time State Info Rows examined Rows sent Tid Srv_Id
- root test