/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of TokuDB
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
TokuDB is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
TokuDB is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with TokuDB. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include "hatoku_cmp.h"
#ifdef WORDS_BIGENDIAN
#error "WORDS_BIGENDIAN not supported"
#endif
// returns true if the field is a valid field to be used
// in a TokuDB table. The non-valid fields are those
// that have been deprecated since before 5.1, and can
// only exist through upgrades of old versions of MySQL
static bool field_valid_for_tokudb_table(Field* field) {
    // The decision depends only on the field's real (storage) type.
    switch (field->real_type()) {
    // integer-like types
    case MYSQL_TYPE_TINY:
    case MYSQL_TYPE_SHORT:
    case MYSQL_TYPE_INT24:
    case MYSQL_TYPE_LONG:
    case MYSQL_TYPE_LONGLONG:
    case MYSQL_TYPE_YEAR:
    case MYSQL_TYPE_ENUM:
    case MYSQL_TYPE_SET:
    // temporal types
    case MYSQL_TYPE_DATE:
    case MYSQL_TYPE_NEWDATE:
    case MYSQL_TYPE_TIME:
    case MYSQL_TYPE_DATETIME:
    case MYSQL_TYPE_TIMESTAMP:
#if (50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699) || \
    (50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799) || \
    (100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100199)
    case MYSQL_TYPE_DATETIME2:
    case MYSQL_TYPE_TIMESTAMP2:
    case MYSQL_TYPE_TIME2:
#endif
    // floating point, decimal and bit types
    case MYSQL_TYPE_FLOAT:
    case MYSQL_TYPE_DOUBLE:
    case MYSQL_TYPE_NEWDECIMAL:
    case MYSQL_TYPE_BIT:
    // character and blob types
    case MYSQL_TYPE_STRING:
    case MYSQL_TYPE_VARCHAR:
    case MYSQL_TYPE_TINY_BLOB:
    case MYSQL_TYPE_BLOB:
    case MYSQL_TYPE_MEDIUM_BLOB:
    case MYSQL_TYPE_LONG_BLOB:
        return true;
    //
    // Old types that are believed not to appear in any 5.1 table,
    // plus geometry, which is not supported yet. All are rejected.
    //
    case MYSQL_TYPE_GEOMETRY:
    case MYSQL_TYPE_DECIMAL:
    case MYSQL_TYPE_VAR_STRING:
    case MYSQL_TYPE_NULL:
        break;
    }
    // rejected types and any type not listed above
    return false;
}
//
// Looks up where one variable-length field begins and how long it is,
// using the table of end offsets found at var_field_offset_ptr.
// Entry i of the table is the end offset of var field i; the field's
// start is the previous entry's value, or 0 for the first field.
//
static void get_var_field_info(
    uint32_t* field_len, // output: length of the field
    uint32_t* start_offset, // output: offset at which the field's data starts
    uint32_t var_field_index, // input: index of the var field we want info on
    const uchar* var_field_offset_ptr, // input: start of the offset table for all var fields
    uint32_t num_offset_bytes // input: width in bytes (1 or 2) of each table entry
    )
{
    uint32_t data_start_offset = 0;
    uint32_t data_end_offset = 0;
    if (num_offset_bytes == 1) {
        data_end_offset = var_field_offset_ptr[var_field_index];
        if (var_field_index != 0) {
            data_start_offset = var_field_offset_ptr[var_field_index - 1];
        }
    }
    else if (num_offset_bytes == 2) {
        data_end_offset = uint2korr(var_field_offset_ptr + 2*var_field_index);
        if (var_field_index != 0) {
            data_start_offset = uint2korr(var_field_offset_ptr + 2*(var_field_index-1));
        }
    }
    else {
        // only 1- and 2-byte offsets are understood
        assert_unreachable();
    }
    *start_offset = data_start_offset;
    assert_always(data_end_offset >= data_start_offset);
    *field_len = data_end_offset - data_start_offset;
}
//
// Computes the offset, relative to var_field_data_ptr, at which the
// blobs begin. Blobs sit right after the variable-length data, so the
// answer is the last entry of the offset table, which ends immediately
// before var_field_data_ptr. When there is no offset table
// (len_of_offsets == 0), the blobs start at offset 0.
//
static void get_blob_field_info(
    uint32_t* start_offset, // output: offset where blob data starts
    uint32_t len_of_offsets, // input: size of the offset table, 0 if there is none
    const uchar* var_field_data_ptr, // input: points just past the offset table
    uint32_t num_offset_bytes // input: width in bytes (1 or 2) of each table entry
    )
{
    uint32_t blobs_begin = 0;
    if (len_of_offsets != 0) {
        if (num_offset_bytes == 1) {
            blobs_begin = var_field_data_ptr[-1];
        }
        else if (num_offset_bytes == 2) {
            blobs_begin = uint2korr(var_field_data_ptr - 2);
        }
        else {
            assert_unreachable();
        }
    }
    *start_offset = blobs_begin;
}
// this function is pattern matched from
// InnoDB's get_innobase_type_from_mysql_type
static TOKU_TYPE mysql_to_toku_type (Field* field) {
    // Maps the field's real (storage) type onto the TOKU_TYPE used in
    // key descriptors. Types not listed map to toku_type_unknown.
    switch (field->real_type()) {
    case MYSQL_TYPE_TINY:
    case MYSQL_TYPE_SHORT:
    case MYSQL_TYPE_INT24:
    case MYSQL_TYPE_LONG:
    case MYSQL_TYPE_LONGLONG:
    case MYSQL_TYPE_YEAR:
    case MYSQL_TYPE_DATE:
    case MYSQL_TYPE_NEWDATE:
    case MYSQL_TYPE_ENUM:
    case MYSQL_TYPE_SET:
        return toku_type_int;
    case MYSQL_TYPE_TIME:
    case MYSQL_TYPE_DATETIME:
    case MYSQL_TYPE_TIMESTAMP:
#ifdef MARIADB_BASE_VERSION
        //
        // MariaDB fractional seconds: such fields report a binary key type
        //
        if (field->key_type() == HA_KEYTYPE_BINARY) {
            return toku_type_fixbinary;
        }
#endif
        return toku_type_int;
    case MYSQL_TYPE_DOUBLE:
        return toku_type_double;
    case MYSQL_TYPE_FLOAT:
        return toku_type_float;
#if (50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699) || \
    (50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799) || \
    (100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100199)
    case MYSQL_TYPE_DATETIME2:
    case MYSQL_TYPE_TIMESTAMP2:
    case MYSQL_TYPE_TIME2:
#endif
    case MYSQL_TYPE_NEWDECIMAL:
    case MYSQL_TYPE_BIT:
        return toku_type_fixbinary;
    case MYSQL_TYPE_STRING:
        // fixed-length: binary data compares bytewise, text needs a collation
        return field->binary() ? toku_type_fixbinary : toku_type_fixstring;
    case MYSQL_TYPE_VARCHAR:
        return field->binary() ? toku_type_varbinary : toku_type_varstring;
    case MYSQL_TYPE_TINY_BLOB:
    case MYSQL_TYPE_BLOB:
    case MYSQL_TYPE_MEDIUM_BLOB:
    case MYSQL_TYPE_LONG_BLOB:
        return toku_type_blob;
    //
    // Old types that are believed not to appear in any 5.1 table,
    // plus geometry, which is not supported yet. The assert is here
    // in case that belief is wrong.
    //
    case MYSQL_TYPE_GEOMETRY:
    case MYSQL_TYPE_DECIMAL:
    case MYSQL_TYPE_VAR_STRING:
    case MYSQL_TYPE_NULL:
        assert_unreachable();
    }
    return toku_type_unknown;
}
//
// Resolves a charset number to its CHARSET_INFO.
//
static inline CHARSET_INFO* get_charset_from_num (uint32_t charset_number) {
    //
    // The two shortcuts below are pattern matched off of InnoDB,
    // due to MySQL bug 42649; everything else goes through get_charset.
    //
    if (charset_number == default_charset_info->number) {
        return default_charset_info;
    }
    if (charset_number == my_charset_latin1.number) {
        return &my_charset_latin1;
    }
    return get_charset(charset_number, MYF(MY_WME));
}
//
// used to read the length of a variable sized field in a tokudb key (buf).
//
static inline uint32_t get_length_from_var_tokudata (uchar* buf, uint32_t length_bytes) {
    // low byte is always present; the high byte only for 2-byte lengths
    const uint32_t low_byte = (uint32_t)buf[0];
    if (length_bytes != 2) {
        return low_byte;
    }
    const uint32_t high_byte = (uint32_t)buf[1];
    return low_byte + (high_byte << 8);
}
//
// used to deduce the number of bytes used to store the length of a varstring/varbinary
// in a key field stored in tokudb
//
static inline uint32_t get_length_bytes_from_max(uint32_t max_num_bytes) {
    // lengths up to 255 fit in a single byte, anything larger takes two
    if (max_num_bytes <= 255) {
        return 1;
    }
    return 2;
}
//
// assuming MySQL in little endian, and we are storing in little endian
//
static inline uchar* pack_toku_int (uchar* to_tokudb, uchar* from_mysql, uint32_t num_bytes) {
    // Both sides are little endian, so packing is a plain byte copy.
    // Only the integer widths 1, 2, 3, 4 and 8 are accepted.
    switch (num_bytes) {
    case 1:
    case 2:
    case 3:
    case 4:
    case 8:
        memcpy(to_tokudb, from_mysql, num_bytes);
        break;
    default:
        assert_unreachable();
    }
    // position in the tokudb buffer just past the packed integer
    return to_tokudb + num_bytes;
}
//
// assuming MySQL in little endian, and we are unpacking to little endian
//
static inline uchar* unpack_toku_int(uchar* to_mysql, uchar* from_tokudb, uint32_t num_bytes) {
    // Both sides are little endian, so unpacking is a plain byte copy.
    // Only the integer widths 1, 2, 3, 4 and 8 are accepted.
    switch (num_bytes) {
    case 1:
    case 2:
    case 3:
    case 4:
    case 8:
        memcpy(to_mysql, from_tokudb, num_bytes);
        break;
    default:
        assert_unreachable();
    }
    // position in the tokudb buffer just past the integer that was read
    return from_tokudb + num_bytes;
}
//
// Compares two little-endian integers of num_bytes bytes (1, 2, 3, 4 or 8),
// interpreting them as unsigned or signed according to is_unsigned.
// For widths 1 to 3 the result is the difference a - b; for widths 4 and 8
// it is -1, 0 or 1.
//
static inline int cmp_toku_int (uchar* a_buf, uchar* b_buf, bool is_unsigned, uint32_t num_bytes) {
    if (is_unsigned) {
        //
        // unsigned integers
        //
        switch (num_bytes) {
        case 1: {
            uint32_t lhs = *a_buf;
            uint32_t rhs = *b_buf;
            return (int)(lhs - rhs);
        }
        case 2: {
            uint32_t lhs = uint2korr(a_buf);
            uint32_t rhs = uint2korr(b_buf);
            return (int)(lhs - rhs);
        }
        case 3: {
            uint32_t lhs = tokudb_uint3korr(a_buf);
            uint32_t rhs = tokudb_uint3korr(b_buf);
            return (int)(lhs - rhs);
        }
        case 4: {
            // the difference may not fit in an int, so compare explicitly
            uint32_t lhs = uint4korr(a_buf);
            uint32_t rhs = uint4korr(b_buf);
            return (lhs < rhs) ? -1 : ((lhs > rhs) ? 1 : 0);
        }
        case 8: {
            uint64_t lhs = uint8korr(a_buf);
            uint64_t rhs = uint8korr(b_buf);
            return (lhs < rhs) ? -1 : ((lhs > rhs) ? 1 : 0);
        }
        default:
            assert_unreachable();
        }
    }
    else {
        //
        // signed integers
        //
        switch (num_bytes) {
        case 1: {
            int32_t lhs = *(signed char *)a_buf;
            int32_t rhs = *(signed char *)b_buf;
            return lhs - rhs;
        }
        case 2: {
            int32_t lhs = sint2korr(a_buf);
            int32_t rhs = sint2korr(b_buf);
            return lhs - rhs;
        }
        case 3: {
            int32_t lhs = sint3korr(a_buf);
            int32_t rhs = sint3korr(b_buf);
            return lhs - rhs;
        }
        case 4: {
            // the difference may overflow, so compare explicitly
            int32_t lhs = sint4korr(a_buf);
            int32_t rhs = sint4korr(b_buf);
            return (lhs < rhs) ? -1 : ((lhs > rhs) ? 1 : 0);
        }
        case 8: {
            int64_t lhs = sint8korr(a_buf);
            int64_t rhs = sint8korr(b_buf);
            return (lhs < rhs) ? -1 : ((lhs > rhs) ? 1 : 0);
        }
        default:
            assert_unreachable();
        }
    }
    //
    // if this is hit, indicates bug in writing of this function
    //
    assert_unreachable();
    return 0;
}
//
// Copies one double from the MySQL buffer into the tokudb buffer and
// returns the position in the tokudb buffer just past it.
//
static inline uchar* pack_toku_double (uchar* to_tokudb, uchar* from_mysql) {
    const size_t width = sizeof(double);
    memcpy(to_tokudb, from_mysql, width);
    return to_tokudb + width;
}
//
// Copies one double from the tokudb buffer into the MySQL buffer and
// returns the position in the tokudb buffer just past it.
//
static inline uchar* unpack_toku_double(uchar* to_mysql, uchar* from_tokudb) {
    const size_t width = sizeof(double);
    memcpy(to_mysql, from_tokudb, width);
    return from_tokudb + width;
}
//
// Compares the doubles stored at a_buf and b_buf.
// Returns -1 if a < b, 1 if a > b, and 0 otherwise.
//
static inline int cmp_toku_double(uchar* a_buf, uchar* b_buf) {
    double lhs;
    double rhs;
    doubleget(lhs, a_buf);
    doubleget(rhs, b_buf);
    if (lhs < rhs) {
        return -1;
    }
    if (lhs > rhs) {
        return 1;
    }
    return 0;
}
//
// Copies one float from the MySQL buffer into the tokudb buffer and
// returns the position in the tokudb buffer just past it.
//
static inline uchar* pack_toku_float (uchar* to_tokudb, uchar* from_mysql) {
    const size_t width = sizeof(float);
    memcpy(to_tokudb, from_mysql, width);
    return to_tokudb + width;
}
//
// Copies one float from the tokudb buffer into the MySQL buffer and
// returns the position in the tokudb buffer just past it.
//
static inline uchar* unpack_toku_float(uchar* to_mysql, uchar* from_tokudb) {
    const size_t width = sizeof(float);
    memcpy(to_mysql, from_tokudb, width);
    return from_tokudb + width;
}
//
// Compares the floats stored at a_buf and b_buf.
// Returns -1 if a < b, 1 if a > b, and 0 otherwise.
//
static inline int cmp_toku_float(uchar* a_buf, uchar* b_buf) {
    //
    // Fetch the values with memcpy, the same way Field_float::cmp
    // gets the floats from the buffers
    //
    float lhs;
    float rhs;
    memcpy(&lhs, a_buf, sizeof(float));
    memcpy(&rhs, b_buf, sizeof(float));
    if (lhs < rhs) {
        return -1;
    }
    if (lhs > rhs) {
        return 1;
    }
    return 0;
}
//
// Copies num_bytes of fixed-length binary data from the MySQL buffer
// into the tokudb buffer; returns the tokudb position just past the copy.
//
static inline uchar* pack_toku_binary(uchar* to_tokudb, uchar* from_mysql, uint32_t num_bytes) {
    uchar* const next_pos = to_tokudb + num_bytes;
    memcpy(to_tokudb, from_mysql, num_bytes);
    return next_pos;
}
//
// Copies num_bytes of fixed-length binary data from the tokudb buffer
// into the MySQL buffer; returns the tokudb position just past the data.
//
static inline uchar* unpack_toku_binary(uchar* to_mysql, uchar* from_tokudb, uint32_t num_bytes) {
    uchar* const next_pos = from_tokudb + num_bytes;
    memcpy(to_mysql, from_tokudb, num_bytes);
    return next_pos;
}
//
// Bytewise comparison of two binary values of possibly different length.
// The common prefix is compared with memcmp; if it is equal, the
// shorter value sorts first (-1 / 1), and equal lengths yield 0.
//
static inline int cmp_toku_binary(
    uchar* a_buf,
    uint32_t a_num_bytes,
    uchar* b_buf,
    uint32_t b_num_bytes
    )
{
    uint32_t common_len = b_num_bytes;
    if (a_num_bytes < b_num_bytes) {
        common_len = a_num_bytes;
    }
    const int prefix_cmp = memcmp(a_buf, b_buf, common_len);
    if (prefix_cmp != 0) {
        return prefix_cmp;
    }
    if (a_num_bytes == b_num_bytes) {
        return 0;
    }
    return (a_num_bytes < b_num_bytes) ? -1 : 1;
}
//
// partially copied from below
//
static uchar* pack_toku_varbinary_from_desc(
    uchar* to_tokudb,
    const uchar* from_desc,
    uint32_t key_part_length, // maximum number of data bytes; also decides whether 1 or 2 length bytes are written
    uint32_t field_length // length of the field's data in from_desc
    )
{
    const uint32_t prefix_bytes = get_length_bytes_from_max(key_part_length);
    // never store more than the key part allows
    uint32_t length = field_length;
    set_if_smaller(length, key_part_length);
    //
    // length prefix, little endian
    //
    to_tokudb[0] = (uchar)(length & 255);
    if (prefix_bytes > 1) {
        to_tokudb[1] = (uchar) (length >> 8);
    }
    //
    // the data itself follows the prefix
    //
    uchar* const data_dest = to_tokudb + prefix_bytes;
    memcpy(data_dest, from_desc, length);
    return data_dest + length;
}
//
// Packs a variable-length binary value from a MySQL buffer into a
// tokudb key buffer as <length prefix><data>, truncating the data to
// max_num_bytes. Returns the tokudb position just past the packed value.
//
static inline uchar* pack_toku_varbinary(
    uchar* to_tokudb,
    uchar* from_mysql,
    uint32_t length_bytes_in_mysql, //number of bytes used to encode the length in from_mysql
    uint32_t max_num_bytes
    )
{
    //
    // read the length out of the MySQL buffer; zero length bytes means
    // the value occupies the full max_num_bytes. Any width other than
    // 0..4 leaves the length at 0.
    //
    uint32_t length = 0;
    if (length_bytes_in_mysql == 0) {
        length = max_num_bytes;
    }
    else if (length_bytes_in_mysql == 1) {
        length = (uint32_t)(*from_mysql);
    }
    else if (length_bytes_in_mysql == 2) {
        length = uint2korr(from_mysql);
    }
    else if (length_bytes_in_mysql == 3) {
        length = tokudb_uint3korr(from_mysql);
    }
    else if (length_bytes_in_mysql == 4) {
        length = uint4korr(from_mysql);
    }
    //
    // from this point on, functionality equivalent to pack_toku_varbinary_from_desc
    //
    set_if_smaller(length,max_num_bytes);
    const uint32_t prefix_bytes = get_length_bytes_from_max(max_num_bytes);
    //
    // length prefix, little endian
    //
    to_tokudb[0] = (uchar)(length & 255);
    if (prefix_bytes > 1) {
        to_tokudb[1] = (uchar) (length >> 8);
    }
    //
    // the data itself follows the prefix
    //
    uchar* const data_dest = to_tokudb + prefix_bytes;
    memcpy(data_dest, from_mysql + length_bytes_in_mysql, length);
    return data_dest + length;
}
//
// Unpacks a <length prefix><data> value from a tokudb buffer into a
// MySQL buffer, re-encoding the length with length_bytes_in_mysql bytes.
// Returns the tokudb position just past the value that was read.
//
static inline uchar* unpack_toku_varbinary(
    uchar* to_mysql,
    uchar* from_tokudb,
    uint32_t length_bytes_in_tokudb, // number of bytes used to encode length in from_tokudb
    uint32_t length_bytes_in_mysql // number of bytes used to encode length in to_mysql
    )
{
    uint32_t length = get_length_from_var_tokudata(from_tokudb, length_bytes_in_tokudb);
    uchar* const data_src = from_tokudb + length_bytes_in_tokudb;
    //
    // write the length in the width MySQL expects (nothing for width 0)
    //
    switch (length_bytes_in_mysql) {
    case 0:
        break;
    case 1:
        to_mysql[0] = (uchar) length;
        break;
    case 2:
        int2store(to_mysql, length);
        break;
    case 3:
        int3store(to_mysql, length);
        break;
    case 4:
        int4store(to_mysql, length);
        break;
    default:
        assert_unreachable();
    }
    //
    // then the data, right after the MySQL length bytes
    //
    memcpy(to_mysql + length_bytes_in_mysql, data_src, length);
    return data_src + length;
}
//
// Compares two length-prefixed binary values bytewise and reports how
// many bytes (prefix plus data) each one occupies in its buffer.
//
static inline int cmp_toku_varbinary(
    uchar* a_buf,
    uchar* b_buf,
    uint32_t length_bytes, //number of bytes used to encode length in a_buf and b_buf
    uint32_t* a_bytes_read,
    uint32_t* b_bytes_read
    )
{
    const uint32_t a_data_len = get_length_from_var_tokudata(a_buf, length_bytes);
    const uint32_t b_data_len = get_length_from_var_tokudata(b_buf, length_bytes);
    uchar* const a_data = a_buf + length_bytes;
    uchar* const b_data = b_buf + length_bytes;
    const int result = cmp_toku_binary(a_data, a_data_len, b_data, b_data_len);
    *a_bytes_read = a_data_len + length_bytes;
    *b_bytes_read = b_data_len + length_bytes;
    return result;
}
//
// Packs a blob key part into a tokudb key buffer as <length prefix><data>.
// The MySQL buffer holds the blob's length followed by a pointer to the
// blob data. The data is truncated to max_num_bytes and, when it is
// still longer than the per-character budget, to the byte position
// reported by my_charpos for that budget.
// Returns the tokudb position just past the packed value.
//
static inline uchar* pack_toku_blob(
    uchar* to_tokudb,
    uchar* from_mysql,
    uint32_t length_bytes_in_tokudb, //number of bytes to use to encode the length in to_tokudb
    uint32_t length_bytes_in_mysql, //number of bytes used to encode the length in from_mysql
    uint32_t max_num_bytes,
#if MYSQL_VERSION_ID >= 50600
    const CHARSET_INFO* charset
#else
    CHARSET_INFO* charset
#endif
    )
{
    //
    // read the length out of the MySQL buffer; zero length bytes means
    // max_num_bytes. Any width other than 0..4 leaves the length at 0.
    //
    uint32_t length = 0;
    if (length_bytes_in_mysql == 0) {
        length = max_num_bytes;
    }
    else if (length_bytes_in_mysql == 1) {
        length = (uint32_t)(*from_mysql);
    }
    else if (length_bytes_in_mysql == 2) {
        length = uint2korr(from_mysql);
    }
    else if (length_bytes_in_mysql == 3) {
        length = tokudb_uint3korr(from_mysql);
    }
    else if (length_bytes_in_mysql == 4) {
        length = uint4korr(from_mysql);
    }
    set_if_smaller(length,max_num_bytes);
    //
    // the pointer to the blob data is stored right after the length bytes
    //
    uchar* blob_data = NULL;
    memcpy(&blob_data, from_mysql + length_bytes_in_mysql, sizeof(uchar *));
    //
    // limit the value to the number of characters that fit in max_num_bytes
    //
    uint32_t char_limit = max_num_bytes;
    if (charset->mbmaxlen > 1) {
        char_limit = max_num_bytes/charset->mbmaxlen;
    }
    if (length > char_limit) {
        char_limit = my_charpos(
            charset,
            blob_data,
            blob_data+length,
            char_limit
            );
        set_if_smaller(length, char_limit);
    }
    //
    // length prefix, little endian
    //
    to_tokudb[0] = (uchar)(length & 255);
    if (length_bytes_in_tokudb > 1) {
        to_tokudb[1] = (uchar) (length >> 8);
    }
    //
    // the data itself follows the prefix
    //
    uchar* const data_dest = to_tokudb + length_bytes_in_tokudb;
    memcpy(data_dest, blob_data, length);
    return data_dest + length;
}
//
// Unpacks a blob key part from a tokudb buffer into a MySQL buffer.
// The length is written with length_bytes_in_mysql bytes, followed by a
// pointer to the blob data; the data itself is NOT copied, the pointer
// refers into from_tokudb.
// Returns the tokudb position just past the value that was read.
//
static inline uchar* unpack_toku_blob(
    uchar* to_mysql,
    uchar* from_tokudb,
    uint32_t length_bytes_in_tokudb, // number of bytes used to encode length in from_tokudb
    uint32_t length_bytes_in_mysql // number of bytes used to encode length in to_mysql
    )
{
    uint32_t length = get_length_from_var_tokudata(from_tokudb, length_bytes_in_tokudb);
    //
    // write the length in the width MySQL expects (nothing for width 0)
    //
    switch (length_bytes_in_mysql) {
    case 0:
        break;
    case 1:
        to_mysql[0] = (uchar) length;
        break;
    case 2:
        int2store(to_mysql, length);
        break;
    case 3:
        int3store(to_mysql, length);
        break;
    case 4:
        int4store(to_mysql, length);
        break;
    default:
        assert_unreachable();
    }
    //
    // store the address of the data, not the data
    //
    uchar* blob_data = from_tokudb + length_bytes_in_tokudb;
    memcpy(to_mysql + length_bytes_in_mysql, &blob_data, sizeof(uchar *));
    return blob_data + length;
}
//
// partially copied from below
//
static uchar* pack_toku_varstring_from_desc(
    uchar* to_tokudb,
    const uchar* from_desc,
    uint32_t key_part_length, // maximum number of data bytes; also decides whether 1 or 2 length bytes are written
    uint32_t field_length, // length of the field's data in from_desc
    uint32_t charset_num // number of the charset the data is encoded in
    )
{
    const uint32_t prefix_bytes = get_length_bytes_from_max(key_part_length);
    // never store more than the key part allows
    uint32_t length = field_length;
    set_if_smaller(length, key_part_length);
    CHARSET_INFO* charset = get_charset_from_num(charset_num);
    //
    // limit the value to the number of characters that fit in key_part_length
    //
    uint32_t char_limit = key_part_length;
    if (charset->mbmaxlen > 1) {
        char_limit = key_part_length/charset->mbmaxlen;
    }
    if (length > char_limit) {
        char_limit = my_charpos(
            charset,
            from_desc,
            from_desc+length,
            char_limit
            );
        set_if_smaller(length, char_limit);
    }
    //
    // length prefix, little endian
    //
    to_tokudb[0] = (uchar)(length & 255);
    if (prefix_bytes > 1) {
        to_tokudb[1] = (uchar) (length >> 8);
    }
    //
    // the data itself follows the prefix
    //
    uchar* const data_dest = to_tokudb + prefix_bytes;
    memcpy(data_dest, from_desc, length);
    return data_dest + length;
}
//
// Packs a character string from a MySQL buffer into a tokudb key buffer
// as <length prefix><data>. The data is truncated to max_num_bytes and,
// when it is still longer than the per-character budget, to the byte
// position reported by my_charpos for that budget.
// Returns the tokudb position just past the packed value.
//
static inline uchar* pack_toku_varstring(
    uchar* to_tokudb,
    uchar* from_mysql,
    uint32_t length_bytes_in_tokudb, //number of bytes to use to encode the length in to_tokudb
    uint32_t length_bytes_in_mysql, //number of bytes used to encode the length in from_mysql
    uint32_t max_num_bytes,
#if MYSQL_VERSION_ID >= 50600
    const CHARSET_INFO *charset
#else
    CHARSET_INFO* charset
#endif
    )
{
    //
    // read the length out of the MySQL buffer; zero length bytes means
    // the value occupies the full max_num_bytes. Any width other than
    // 0..4 leaves the length at 0.
    //
    uint32_t length = 0;
    if (length_bytes_in_mysql == 0) {
        length = max_num_bytes;
    }
    else if (length_bytes_in_mysql == 1) {
        length = (uint32_t)(*from_mysql);
    }
    else if (length_bytes_in_mysql == 2) {
        length = uint2korr(from_mysql);
    }
    else if (length_bytes_in_mysql == 3) {
        length = tokudb_uint3korr(from_mysql);
    }
    else if (length_bytes_in_mysql == 4) {
        length = uint4korr(from_mysql);
    }
    set_if_smaller(length,max_num_bytes);
    //
    // limit the value to the number of characters that fit in max_num_bytes
    //
    uint32_t char_limit = max_num_bytes;
    if (charset->mbmaxlen > 1) {
        char_limit = max_num_bytes/charset->mbmaxlen;
    }
    if (length > char_limit) {
        char_limit = my_charpos(
            charset,
            from_mysql+length_bytes_in_mysql,
            from_mysql+length_bytes_in_mysql+length,
            char_limit
            );
        set_if_smaller(length, char_limit);
    }
    //
    // length prefix, little endian
    //
    to_tokudb[0] = (uchar)(length & 255);
    if (length_bytes_in_tokudb > 1) {
        to_tokudb[1] = (uchar) (length >> 8);
    }
    //
    // the data itself follows the prefix
    //
    uchar* const data_dest = to_tokudb + length_bytes_in_tokudb;
    memcpy(data_dest, from_mysql + length_bytes_in_mysql, length);
    return data_dest + length;
}
//
// Compares two strings using the strnncollsp collation routine of the
// charset identified by charset_number, and returns its result.
//
static inline int cmp_toku_string(
    uchar* a_buf,
    uint32_t a_num_bytes,
    uchar* b_buf,
    uint32_t b_num_bytes,
    uint32_t charset_number
    )
{
    CHARSET_INFO* cs = get_charset_from_num(charset_number);
    return cs->coll->strnncollsp(cs, a_buf, a_num_bytes, b_buf, b_num_bytes, 0);
}
//
// Compares two length-prefixed strings with the collation of charset_num
// and reports how many bytes (prefix plus data) each one occupies in
// its buffer.
//
static inline int cmp_toku_varstring(
    uchar* a_buf,
    uchar* b_buf,
    uint32_t length_bytes, //number of bytes used to encode length in a_buf and b_buf
    uint32_t charset_num,
    uint32_t* a_bytes_read,
    uint32_t* b_bytes_read
    )
{
    const uint32_t a_data_len = get_length_from_var_tokudata(a_buf, length_bytes);
    const uint32_t b_data_len = get_length_from_var_tokudata(b_buf, length_bytes);
    uchar* const a_data = a_buf + length_bytes;
    uchar* const b_data = b_buf + length_bytes;
    const int result = cmp_toku_string(a_data, a_data_len, b_data, b_data_len, charset_num);
    *a_bytes_read = a_data_len + length_bytes;
    *b_bytes_read = b_data_len + length_bytes;
    return result;
}
static inline int tokudb_compare_two_hidden_keys(
const void* new_key_data,
const uint32_t new_key_size,
const void* saved_key_data,
const uint32_t saved_key_size
) {
assert_always(
(new_key_size >= TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH) &&
(saved_key_size >= TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH));
ulonglong a = hpk_char_to_num((uchar *) new_key_data);
ulonglong b = hpk_char_to_num((uchar *) saved_key_data);
return a < b ? -1 : (a > b ? 1 : 0);
}
//
// Returns number of bytes used for a given TOKU_TYPE
// in a key descriptor. The number of bytes returned
// here MUST match the number of bytes used for the encoding
// in create_toku_key_descriptor_for_key
// Parameters:
// [in] row_desc - buffer that contains portion of descriptor
// created in create_toku_key_descriptor_for_key. The first
// byte points to the TOKU_TYPE.
//
static uint32_t skip_field_in_descriptor(uchar* row_desc) {
    // every entry starts with the one-byte TOKU_TYPE
    uint32_t entry_size = 1;
    const TOKU_TYPE toku_type = (TOKU_TYPE)row_desc[0];
    switch (toku_type) {
    // nothing follows the type
    case toku_type_hpk:
    case toku_type_double:
    case toku_type_float:
        break;
    // two bytes follow
    case toku_type_int:
        entry_size += 2;
        break;
    // one byte follows
    case toku_type_fixbinary:
    case toku_type_varbinary:
        entry_size += 1;
        break;
    // one byte plus a uint32_t follow
    case toku_type_fixstring:
    case toku_type_varstring:
    case toku_type_blob:
        entry_size += 1;
        entry_size += sizeof(uint32_t);
        break;
    default:
        assert_unreachable();
    }
    return entry_size;
}
//
// outputs a descriptor for key into buf. Returns number of bytes used in buf
// to store the descriptor. Number of bytes used MUST match number of bytes
// we would skip in skip_field_in_descriptor
//
static int create_toku_key_descriptor_for_key(KEY* key, uchar* buf) {
    uchar* pos = buf;
    for (uint part = 0; part < key->user_defined_key_parts; part++) {
        Field* field = key->key_part[part].field;
        const uint32_t part_length = key->key_part[part].length;
        //
        // First byte: whether the field has a null byte.
        // 0 means no null byte, non-zero means there is one.
        //
        *pos++ = field->null_bit;
        //
        // Second byte: the TOKU_TYPE of the field
        //
        TOKU_TYPE type = mysql_to_toku_type(field);
        assert_always((int)type < 256);
        *pos++ = (uchar)(type & 255);
        //
        // Type-specific data follows
        //
        switch (type) {
        //
        // ints: one byte with the width of the int (1, 2, 3, 4 or 8),
        // then one byte stating whether it is unsigned
        //
        case toku_type_int: {
            uint32_t num_bytes_in_field = field->pack_length();
            assert_always (num_bytes_in_field < 256);
            *pos++ = (uchar)(num_bytes_in_field & 255);
            *pos++ = (field->flags & UNSIGNED_FLAG) ? 1 : 0;
            break;
        }
        //
        // floats and doubles: nothing
        //
        case toku_type_double:
        case toku_type_float:
            break;
        //
        // fixed binary: one byte with the length of the field,
        // capped at the key part length
        //
        case toku_type_fixbinary: {
            uint32_t num_bytes_in_field = field->pack_length();
            set_if_smaller(num_bytes_in_field, part_length);
            assert_always(num_bytes_in_field < 256);
            *pos++ = (uchar)(num_bytes_in_field & 255);
            break;
        }
        //
        // var binary: one byte with the number of bytes used to encode the length
        //
        case toku_type_varbinary:
            *pos++ = (uchar)(get_length_bytes_from_max(part_length) & 255);
            break;
        //
        // strings and blobs: one byte with the number of bytes used to
        // encode the length, then the charset number as four
        // little-endian bytes
        //
        case toku_type_fixstring:
        case toku_type_varstring:
        case toku_type_blob: {
            *pos++ = (uchar)(get_length_bytes_from_max(part_length) & 255);
            const uint32_t charset_num = field->charset()->number;
            for (uint32_t byte_idx = 0; byte_idx < 4; byte_idx++) {
                pos[byte_idx] = (uchar)((charset_num >> (8 * byte_idx)) & 255);
            }
            pos += 4;
            break;
        }
        default:
            assert_unreachable();
        }
    }
    return pos - buf;
}
//
// Creates a descriptor for a DB. That contains all information necessary
// to do both key comparisons and data comparisons (for dup-sort databases).
//
// There are two types of descriptors we care about:
// 1) Primary key, (in a no-dup database)
// 2) secondary keys, which are a secondary key followed by a primary key,
// but in a no-dup database.
//
// I realize this may be confusing, but here is how it works.
// All DB's have a key compare.
// The format of the descriptor must be able to handle both.
//
// The first four bytes store an offset into the descriptor to the second piece
// used for data comparisons. So, if in the future we want to append something
// to the descriptor, we can.
//
//
static int create_toku_key_descriptor(
    uchar* buf,
    bool is_first_hpk,
    KEY* first_key,
    bool is_second_hpk,
    KEY* second_key
    )
{
    //
    // The first four bytes are a header that is filled in at the end
    // with the total number of bytes written, header included.
    //
    uchar* pos = buf + 4;
    if (is_first_hpk) {
        *pos++ = 0; // say there is NO infinity byte
        *pos++ = 0; // field cannot be NULL, stating it
        *pos++ = toku_type_hpk;
    }
    else {
        //
        // first key is NOT a hidden primary key, so pack first_key
        //
        *pos++ = 1; // say there is an infinity byte
        uint32_t num_bytes = create_toku_key_descriptor_for_key(first_key, pos);
        pos += num_bytes;
    }
    //
    // A second key is packed only when the first key is not a hidden
    // primary key and the second key is either a hidden primary key or
    // a non-NULL KEY.
    //
    const bool has_second_key =
        !is_first_hpk && (is_second_hpk || (second_key != NULL));
    if (has_second_key) {
        if (is_second_hpk) {
            *pos++ = 0; // field cannot be NULL, stating it
            *pos++ = toku_type_hpk;
        }
        else {
            //
            // second key is NOT a hidden primary key, so pack second_key
            //
            uint32_t num_bytes = create_toku_key_descriptor_for_key(second_key, pos);
            pos += num_bytes;
        }
    }
    //
    // fill in the header, little endian
    //
    const uint32_t offset = pos - buf;
    for (uint32_t byte_idx = 0; byte_idx < 4; byte_idx++) {
        buf[byte_idx] = (uchar)((offset >> (8 * byte_idx)) & 255);
    }
    return pos - buf;
}
//
// Compares one field of two packed tokudb keys, driven by one field
// entry of a row descriptor.
// Parameters:
//      [in]    a_buf, b_buf - start of the field in each key
//      [in]    row_desc - descriptor entry for the field. The first
//                  byte is the TOKU_TYPE; type-specific bytes follow.
//      [out]   a_bytes_read, b_bytes_read - number of bytes the field
//                  occupies in a_buf and b_buf respectively
//      [out]   row_desc_bytes_read - number of descriptor bytes consumed
//      [out]   read_string - set to true when a charset-based string
//                  comparison was performed; left untouched otherwise
// Returns:
//      the value produced by the type-specific comparison function
//
static inline int compare_toku_field(
    uchar* a_buf,
    uchar* b_buf,
    uchar* row_desc,
    uint32_t* a_bytes_read,
    uint32_t* b_bytes_read,
    uint32_t* row_desc_bytes_read,
    bool* read_string
    )
{
    int ret_val = 0;
    uchar* row_desc_pos = row_desc;
    uint32_t num_bytes = 0;
    uint32_t length_bytes = 0;
    uint32_t charset_num = 0;
    bool is_unsigned = false;
    TOKU_TYPE toku_type = (TOKU_TYPE)row_desc_pos[0];
    row_desc_pos++;
    switch (toku_type) {
    case (toku_type_hpk):
        ret_val = tokudb_compare_two_hidden_keys(
            a_buf,
            TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH,
            b_buf,
            TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH
            );
        *a_bytes_read = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
        *b_bytes_read = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
        break;
    case (toku_type_int):
        // descriptor holds the width of the int, then the unsigned flag
        num_bytes = row_desc_pos[0];
        is_unsigned = row_desc_pos[1];
        ret_val = cmp_toku_int(
            a_buf,
            b_buf,
            is_unsigned,
            num_bytes
            );
        *a_bytes_read = num_bytes;
        *b_bytes_read = num_bytes;
        row_desc_pos += 2;
        break;
    case (toku_type_double):
        ret_val = cmp_toku_double(a_buf, b_buf);
        *a_bytes_read = sizeof(double);
        *b_bytes_read = sizeof(double);
        break;
    case (toku_type_float):
        ret_val = cmp_toku_float(a_buf, b_buf);
        *a_bytes_read = sizeof(float);
        *b_bytes_read = sizeof(float);
        break;
    case (toku_type_fixbinary):
        // descriptor holds the fixed length of the field
        num_bytes = row_desc_pos[0];
        ret_val = cmp_toku_binary(a_buf, num_bytes, b_buf,num_bytes);
        *a_bytes_read = num_bytes;
        *b_bytes_read = num_bytes;
        row_desc_pos++;
        break;
    case (toku_type_varbinary):
        // descriptor holds the number of bytes used to encode the length
        length_bytes = row_desc_pos[0];
        ret_val = cmp_toku_varbinary(
            a_buf,
            b_buf,
            length_bytes,
            a_bytes_read,
            b_bytes_read
            );
        row_desc_pos++;
        break;
    case (toku_type_fixstring):
    case (toku_type_varstring):
    case (toku_type_blob):
        // descriptor holds the number of length bytes, then the charset number
        length_bytes = row_desc_pos[0];
        row_desc_pos++;
        //
        // The charset number is stored as four little-endian bytes (see
        // create_toku_key_descriptor_for_key) at a position that is not
        // aligned for a uint32_t, so decode it byte by byte instead of
        // dereferencing a casted pointer.
        //
        charset_num =
            ((uint32_t)row_desc_pos[0]) |
            (((uint32_t)row_desc_pos[1]) << 8) |
            (((uint32_t)row_desc_pos[2]) << 16) |
            (((uint32_t)row_desc_pos[3]) << 24);
        row_desc_pos += sizeof(uint32_t);
        ret_val = cmp_toku_varstring(
            a_buf,
            b_buf,
            length_bytes,
            charset_num,
            a_bytes_read,
            b_bytes_read
            );
        *read_string = true;
        break;
    default:
        assert_unreachable();
    }
    *row_desc_bytes_read = row_desc_pos - row_desc;
    return ret_val;
}
//
// packs a field from a MySQL buffer into a tokudb buffer.
// Used for inserts/updates
//
static uchar* pack_toku_key_field(
    uchar* to_tokudb,
    uchar* from_mysql,
    Field* field,
    uint32_t key_part_length // hopefully temporary, until the pack_cmp stuff is phased out
    )
{
    // Dispatch on the tokudb type of the field; every supported type
    // returns the position in to_tokudb just past the packed field.
    switch (mysql_to_toku_type(field)) {
    case toku_type_int:
        assert_always(key_part_length == field->pack_length());
        return pack_toku_int(to_tokudb, from_mysql, field->pack_length());
    case toku_type_double:
        assert_always(field->pack_length() == sizeof(double));
        assert_always(key_part_length == sizeof(double));
        return pack_toku_double(to_tokudb, from_mysql);
    case toku_type_float:
        assert_always(field->pack_length() == sizeof(float));
        assert_always(key_part_length == sizeof(float));
        return pack_toku_float(to_tokudb, from_mysql);
    case toku_type_fixbinary: {
        // pack at most key_part_length bytes of the field
        uint32_t num_bytes = field->pack_length();
        set_if_smaller(num_bytes, key_part_length);
        return pack_toku_binary(to_tokudb, from_mysql, num_bytes);
    }
    case toku_type_fixstring: {
        // fixed strings carry no length bytes in the MySQL buffer (0),
        // but are stored length-prefixed in tokudb
        uint32_t num_bytes = field->pack_length();
        set_if_smaller(num_bytes, key_part_length);
        return pack_toku_varstring(
            to_tokudb,
            from_mysql,
            get_length_bytes_from_max(key_part_length),
            0,
            num_bytes,
            field->charset()
            );
    }
    case toku_type_varbinary:
        return pack_toku_varbinary(
            to_tokudb,
            from_mysql,
            ((Field_varstring *)field)->length_bytes,
            key_part_length
            );
    case toku_type_varstring:
        return pack_toku_varstring(
            to_tokudb,
            from_mysql,
            get_length_bytes_from_max(key_part_length),
            ((Field_varstring *)field)->length_bytes,
            key_part_length,
            field->charset()
            );
    case toku_type_blob:
        return pack_toku_blob(
            to_tokudb,
            from_mysql,
            get_length_bytes_from_max(key_part_length),
            ((Field_blob *)field)->row_pack_length(), //only calling this because packlength is returned
            key_part_length,
            field->charset()
            );
    default:
        assert_unreachable();
    }
    assert_unreachable();
    return NULL;
}
//
// Packs one field from a MySQL key buffer into a TokuDB key buffer.
// Used for queries. Fixed-size types are handed straight to
// pack_toku_key_field. The only difference for variable-sized types
// (varbinary, varstring, blob) is that 2 bytes are always used to encode
// the length, regardless of the field: a varchar(4) still uses 2 bytes.
// Returns the pointer produced by the underlying pack routine.
//
static uchar* pack_key_toku_key_field(
    uchar* to_tokudb,
    uchar* from_mysql,
    Field* field,
    // hopefully temporary, until the pack_cmp code is phased out
    uint32_t key_part_length) {

    // number of length bytes always used here for variable-sized parts
    const int key_length_bytes = 2;
    uchar* packed_end = NULL;

    switch (mysql_to_toku_type(field)) {
    case toku_type_int:
    case toku_type_double:
    case toku_type_float:
    case toku_type_fixbinary:
    case toku_type_fixstring:
        // fixed-size types are packed exactly as in the row-based path
        packed_end = pack_toku_key_field(
            to_tokudb,
            from_mysql,
            field,
            key_part_length);
        break;
    case toku_type_varbinary:
        packed_end = pack_toku_varbinary(
            to_tokudb,
            from_mysql,
            key_length_bytes,
            key_part_length);
        break;
    case toku_type_varstring:
    case toku_type_blob:
        // blobs are treated like varstrings in this path
        packed_end = pack_toku_varstring(
            to_tokudb,
            from_mysql,
            get_length_bytes_from_max(key_part_length),
            key_length_bytes,
            key_part_length,
            field->charset());
        break;
    default:
        assert_unreachable();
    }
    return packed_end;
}
//
// Unpacks one key field from a TokuDB key buffer (from_tokudb) into a
// MySQL buffer (to_mysql), dispatching on the field's TokuDB type.
// key_part_length is the length of the key part; for the numeric types it
// must equal the field's pack length.
// Returns the pointer produced by the underlying unpack routine.
//
uchar* unpack_toku_key_field(
    uchar* to_mysql,
    uchar* from_tokudb,
    Field* field,
    uint32_t key_part_length) {

    uchar* next_pos = NULL;

    switch (mysql_to_toku_type(field)) {
    case toku_type_int:
        assert_always(key_part_length == field->pack_length());
        next_pos = unpack_toku_int(
            to_mysql,
            from_tokudb,
            field->pack_length());
        break;
    case toku_type_double:
        assert_always(field->pack_length() == sizeof(double));
        assert_always(key_part_length == sizeof(double));
        next_pos = unpack_toku_double(to_mysql, from_tokudb);
        break;
    case toku_type_float:
        assert_always(field->pack_length() == sizeof(float));
        assert_always(key_part_length == sizeof(float));
        next_pos = unpack_toku_float(to_mysql, from_tokudb);
        break;
    case toku_type_fixbinary: {
        // a prefix key may cover fewer bytes than the whole field
        uint32_t num_bytes = field->pack_length();
        set_if_smaller(num_bytes, key_part_length);
        next_pos = unpack_toku_binary(to_mysql, from_tokudb, num_bytes);
        break;
    }
    case toku_type_fixstring: {
        // unpacked through the varbinary routine with 0 MySQL length
        // bytes, then padded out to the full field width
        uint32_t num_bytes = field->pack_length();
        next_pos = unpack_toku_varbinary(
            to_mysql,
            from_tokudb,
            get_length_bytes_from_max(key_part_length),
            0);
        uint32_t num_bytes_copied =
            next_pos -
            (from_tokudb + get_length_bytes_from_max(key_part_length));
        assert_always(num_bytes_copied <= num_bytes);
        // fill the remainder of the field with the charset's pad character
        memset(
            to_mysql + num_bytes_copied,
            field->charset()->pad_char,
            num_bytes - num_bytes_copied);
        break;
    }
    case toku_type_varbinary:
    case toku_type_varstring:
        next_pos = unpack_toku_varbinary(
            to_mysql,
            from_tokudb,
            get_length_bytes_from_max(key_part_length),
            ((Field_varstring*)field)->length_bytes);
        break;
    case toku_type_blob:
        next_pos = unpack_toku_blob(
            to_mysql,
            from_tokudb,
            get_length_bytes_from_max(key_part_length),
            // row_pack_length() is called only because it returns packlength
            ((Field_blob *)field)->row_pack_length());
        break;
    default:
        assert_unreachable();
    }
    return next_pos;
}
//
// Compares two packed keys field by field, driven by the descriptor in
// row_desc.
// Parameters:
//      new_key_data/new_key_size - first key and its size in bytes
//      saved_key_data/saved_key_size - second key and its size in bytes
//      row_desc/row_desc_size - descriptor bytes; the first byte states
//          whether the keys start with an infinity byte, then each field
//          starts with a byte stating whether the keys carry a null
//          indicator byte for it, followed by data that
//          compare_toku_field / skip_field_in_descriptor consume
//      cmp_prefix - if true, keys that agree on all fields compared
//          are reported as equal even if one key is longer
//      read_string - passed through to compare_toku_field
//          (NOTE(review): presumably set when a string field was compared;
//          confirm against compare_toku_field)
// Returns:
//      the first non-zero per-field result, otherwise a value derived
//      from the leftover bytes and the infinity bytes (see the end of the
//      function).
//
static int tokudb_compare_two_keys(
const void* new_key_data,
const uint32_t new_key_size,
const void* saved_key_data,
const uint32_t saved_key_size,
const void* row_desc,
const uint32_t row_desc_size,
bool cmp_prefix,
bool* read_string) {
int ret_val = 0;
// default used when the descriptor says the keys have no infinity byte
int8_t new_key_inf_val = COL_NEG_INF;
int8_t saved_key_inf_val = COL_NEG_INF;
uchar* row_desc_ptr = (uchar *)row_desc;
uchar *new_key_ptr = (uchar *)new_key_data;
uchar *saved_key_ptr = (uchar *)saved_key_data;
uint32_t new_key_bytes_left = new_key_size;
uint32_t saved_key_bytes_left = saved_key_size;
//
// if the keys have an infinity byte, set it
//
if (row_desc_ptr[0]) {
new_key_inf_val = (int8_t)new_key_ptr[0];
saved_key_inf_val = (int8_t)saved_key_ptr[0];
new_key_ptr++;
saved_key_ptr++;
}
row_desc_ptr++;
// walk the fields while all three cursors are still inside their buffers
while ((uint32_t)(new_key_ptr - (uchar*)new_key_data) < new_key_size &&
(uint32_t)(saved_key_ptr - (uchar*)saved_key_data) < saved_key_size &&
(uint32_t)(row_desc_ptr - (uchar*)row_desc) < row_desc_size) {
uint32_t new_key_field_length;
uint32_t saved_key_field_length;
uint32_t row_desc_field_length;
//
// if there is a null byte at this point in the key
//
if (row_desc_ptr[0]) {
//
// compare null bytes. If different, return
//
if (new_key_ptr[0] != saved_key_ptr[0]) {
ret_val = ((int) *new_key_ptr - (int) *saved_key_ptr);
goto exit;
}
saved_key_ptr++;
//
// in case we just read the fact that new_key_ptr and saved_key_ptr
// have NULL as their next field
//
// the null bytes are equal here, so testing new_key_ptr covers both
// keys; the post-increment steps past the null byte either way
if (!*new_key_ptr++) {
//
// skip row_desc_ptr[0] read in if clause
//
row_desc_ptr++;
//
// skip data that describes rest of field
//
row_desc_ptr += skip_field_in_descriptor(row_desc_ptr);
continue;
}
}
// step past the nullable-flag byte of this field's descriptor
row_desc_ptr++;
ret_val = compare_toku_field(
new_key_ptr,
saved_key_ptr,
row_desc_ptr,
&new_key_field_length,
&saved_key_field_length,
&row_desc_field_length,
read_string);
// advance all three cursors by the lengths compare_toku_field reported
new_key_ptr += new_key_field_length;
saved_key_ptr += saved_key_field_length;
row_desc_ptr += row_desc_field_length;
if (ret_val) {
goto exit;
}
assert_always(
(uint32_t)(new_key_ptr - (uchar*)new_key_data) <= new_key_size);
assert_always(
(uint32_t)(saved_key_ptr - (uchar*)saved_key_data) <= saved_key_size);
assert_always(
(uint32_t)(row_desc_ptr - (uchar*)row_desc) <= row_desc_size);
}
// all fields compared so far are equal; decide using what is left over
new_key_bytes_left =
new_key_size - ((uint32_t)(new_key_ptr - (uchar*)new_key_data));
saved_key_bytes_left =
saved_key_size - ((uint32_t)(saved_key_ptr - (uchar*)saved_key_data));
if (cmp_prefix) {
// prefix comparison: leftover bytes and infinity bytes are ignored
ret_val = 0;
} else if (new_key_bytes_left== 0 && saved_key_bytes_left== 0) {
// in this case, read both keys to completion, now read infinity byte
ret_val = new_key_inf_val - saved_key_inf_val;
} else if (new_key_bytes_left == 0 && saved_key_bytes_left > 0) {
// at this point, one SHOULD be 0
// new key is the shorter one: it sorts after the saved key only if its
// infinity byte is COL_POS_INF
ret_val = (new_key_inf_val == COL_POS_INF ) ? 1 : -1;
} else if (new_key_bytes_left > 0 && saved_key_bytes_left == 0) {
// mirror image of the case above
ret_val = (saved_key_inf_val == COL_POS_INF ) ? -1 : 1;
} else {
// this should never happen, perhaps we should assert(false)
assert_unreachable();
ret_val = new_key_bytes_left - saved_key_bytes_left;
}
exit:
return ret_val;
}
//
// Compares two keys as raw byte strings.
// The bytes common to both keys are compared with memcmp; if they are
// identical, the shorter key orders before the longer one.
// Returns a negative value, 0, or a positive value.
//
static int simple_memcmp(const DBT *keya, const DBT *keyb) {
    // Keep the length unsigned: narrowing it to int would turn a size
    // above INT_MAX into a negative value, which memcmp would then
    // interpret as an enormous byte count.
    const size_t num_bytes_cmp =
        keya->size < keyb->size ? keya->size : keyb->size;
    int cmp = memcmp(keya->data, keyb->data, num_bytes_cmp);
    if (cmp == 0 && keya->size != keyb->size) {
        // equal common prefix: the shorter key sorts first
        cmp = keya->size < keyb->size ? -1 : 1;
    }
    return cmp;
}
//
// Comparison function to be used by the fractal trees.
// With an empty comparison descriptor the keys are ordered as raw bytes.
// Otherwise they are compared field by field using the descriptor, whose
// first four bytes hold a length that includes those four bytes.
//
static int tokudb_cmp_dbt_key(DB* file, const DBT *keya, const DBT *keyb) {
    if (file->cmp_descriptor->dbt.size == 0) {
        return simple_memcmp(keya, keyb);
    }

    void* desc_data = file->cmp_descriptor->dbt.data;
    bool saw_string = false;
    int result = tokudb_compare_two_keys(
        keya->data,
        keya->size,
        keyb->data,
        keyb->size,
        (uchar *)desc_data + 4,
        (*(uint32_t *)desc_data) - 4,
        false,
        &saw_string);

    // The field comparison may be case-insensitive, but the fractal tree
    // must distinguish between different data, so ties that involved a
    // string are broken by a raw byte comparison.
    if (result == 0 && saw_string) {
        result = simple_memcmp(keya, keyb);
    }
    return result;
}
//TODO: QQQ Only do one direction for prefix.
//
// Prefix comparison of two keys. Calls to this function are done by the
// handlerton and compare just the keys as MySQL would compare them: keys
// that agree on every field compared are reported as equal.
//
static int tokudb_prefix_cmp_dbt_key(DB *file, const DBT *keya, const DBT *keyb) {
    void* desc_data = file->cmp_descriptor->dbt.data;
    // required by tokudb_compare_two_keys, but the result is not used here
    bool saw_string = false;
    return tokudb_compare_two_keys(
        keya->data,
        keya->size,
        keyb->data,
        keyb->size,
        (uchar *)desc_data + 4,
        *(uint32_t *)desc_data - 4,
        true,
        &saw_string);
}
static int tokudb_compare_two_key_parts(
const void* new_key_data,
const uint32_t new_key_size,
const void* saved_key_data,
const uint32_t saved_key_size,
const void* row_desc,
const uint32_t row_desc_size,
uint max_parts
)
{
int ret_val = 0;
uchar* row_desc_ptr = (uchar *)row_desc;
uchar *new_key_ptr = (uchar *)new_key_data;
uchar *saved_key_ptr = (uchar *)saved_key_data;
//
// if the keys have an infinity byte, set it
//
if (row_desc_ptr[0]) {
// new_key_inf_val = (int8_t)new_key_ptr[0];
// saved_key_inf_val = (int8_t)saved_key_ptr[0];
new_key_ptr++;
saved_key_ptr++;
}
row_desc_ptr++;
for (uint i = 0; i < max_parts; i++) {
if (!((uint32_t)(new_key_ptr - (uchar *)new_key_data) < new_key_size &&
(uint32_t)(saved_key_ptr - (uchar *)saved_key_data) < saved_key_size &&
(uint32_t)(row_desc_ptr - (uchar *)row_desc) < row_desc_size))
break;
uint32_t new_key_field_length;
uint32_t saved_key_field_length;
uint32_t row_desc_field_length;
//
// if there is a null byte at this point in the key
//
if (row_desc_ptr[0]) {
//
// compare null bytes. If different, return
//
if (new_key_ptr[0] != saved_key_ptr[0]) {
ret_val = ((int) *new_key_ptr - (int) *saved_key_ptr);
goto exit;
}
saved_key_ptr++;
//
// in case we just read the fact that new_key_ptr and saved_key_ptr
// have NULL as their next field
//
if (!*new_key_ptr++) {
//
// skip row_desc_ptr[0] read in if clause
//
row_desc_ptr++;
//
// skip data that describes rest of field
//
row_desc_ptr += skip_field_in_descriptor(row_desc_ptr);
continue;
}
}
row_desc_ptr++;
bool read_string = false;
ret_val = compare_toku_field(
new_key_ptr,
saved_key_ptr,
row_desc_ptr,
&new_key_field_length,
&saved_key_field_length,
&row_desc_field_length,
&read_string
);
new_key_ptr += new_key_field_length;
saved_key_ptr += saved_key_field_length;
row_desc_ptr += row_desc_field_length;
if (ret_val) {
goto exit;
}
assert_always((uint32_t)(new_key_ptr - (uchar *)new_key_data) <= new_key_size);
assert_always((uint32_t)(saved_key_ptr - (uchar *)saved_key_data) <= saved_key_size);
assert_always((uint32_t)(row_desc_ptr - (uchar *)row_desc) <= row_desc_size);
}
ret_val = 0;
exit:
return ret_val;
}
//
// Compares at most max_parts leading key parts of keya and keyb using the
// comparison descriptor attached to file. The descriptor must be non-empty;
// its first four bytes hold a length that includes those four bytes.
//
static int tokudb_cmp_dbt_key_parts(DB *file, const DBT *keya, const DBT *keyb, uint max_parts) {
    assert_always(file->cmp_descriptor->dbt.size);

    void* desc_data = file->cmp_descriptor->dbt.data;
    uchar* key_desc = (uchar *)desc_data + 4;
    const uint32_t key_desc_size = (*(uint32_t *)desc_data) - 4;

    return tokudb_compare_two_key_parts(
        keya->data,
        keya->size,
        keyb->data,
        keyb->size,
        key_desc,
        key_desc_size,
        max_parts);
}
//
// Writes the key pack descriptor of the main dictionary into buf.
// Layout:
//      4 bytes - offset of where this descriptor ends, least significant
//                byte first
//      1 byte  - 1, stating that this is the main dictionary
// Returns the number of bytes written.
//
static uint32_t create_toku_main_key_pack_descriptor (
    uchar* buf
    )
{
    // leave room for the four length bytes, filled in below
    uchar* cursor = buf + 4;
    *cursor++ = 1;

    const uint32_t desc_len = cursor - buf;
    for (int byte = 0; byte < 4; byte++) {
        buf[byte] = (uchar)((desc_len >> (8 * byte)) & 255);
    }
    return desc_len;
}
// Tags written by pack_desc_char_info: the key part has no charset, or it
// has one and the tag is followed by a 4-byte charset number.
#define COL_HAS_NO_CHARSET 0x44
#define COL_HAS_CHARSET 0x55
// Tags written by pack_desc_pk_offset_info for a key part that lives in
// the primary key: followed either by a constant byte offset or by the
// index of the part within the primary key.
#define COL_FIX_PK_OFFSET 0x66
#define COL_VAR_PK_OFFSET 0x77
// Tags written by create_toku_clustering_val_pack_descriptor: each one
// introduces a range (4-byte start, 4-byte end) of fixed or variable
// fields.
#define CK_FIX_RANGE 0x88
#define CK_VAR_RANGE 0x99
// Copies the col_pack_val of cp_info[pk_index][field_index] to pos and
// advances pos by four bytes.
// NOTE: expands to two statements (not wrapped in do/while) and relies on
// locals named pos, kc_info, pk_index and field_index at the point of use.
#define COPY_OFFSET_TO_BUF memcpy ( \
pos, \
&kc_info->cp_info[pk_index][field_index].col_pack_val, \
sizeof(uint32_t) \
); \
pos += sizeof(uint32_t);
//
// Writes the two descriptor bytes that describe one primary key part:
//      byte 0 - COL_FIX_FIELD or COL_VAR_FIELD
//      byte 1 - for a fixed field, the number of bytes of the field;
//               for a var field, the number of length bytes (1 or 2)
// Returns the number of bytes written to buf.
//
static uint32_t pack_desc_pk_info(uchar* buf, KEY_AND_COL_INFO* kc_info, TABLE_SHARE* table_share, KEY_PART_INFO* key_part) {
    const uint16 field_index = key_part->field->field_index;
    Field* field = table_share->field[field_index];
    uint32_t key_part_length = key_part->length;
    uchar field_kind = 0;
    uchar field_info = 0;

    switch (mysql_to_toku_type(field)) {
    case toku_type_int:
    case toku_type_double:
    case toku_type_float:
        // the length must fit in the single byte reserved for it
        assert_always(kc_info->field_lengths[field_index] < 256);
        field_kind = COL_FIX_FIELD;
        field_info = kc_info->field_lengths[field_index];
        break;
    case toku_type_fixbinary: {
        // a prefix key part may be shorter than the field itself
        uint32_t field_length = field->pack_length();
        set_if_smaller(key_part_length, field_length);
        assert_always(key_part_length < 256);
        field_kind = COL_FIX_FIELD;
        field_info = (uchar)key_part_length;
        break;
    }
    case toku_type_fixstring:
    case toku_type_varbinary:
    case toku_type_varstring:
    case toku_type_blob:
        field_kind = COL_VAR_FIELD;
        field_info = (key_part_length > 255) ? 2 : 1;
        break;
    default:
        assert_unreachable();
        return 0;
    }

    buf[0] = field_kind;
    buf[1] = field_info;
    return 2;
}
//
// Writes where a key part that is also part of the primary key can be
// found inside the primary key.
// Parameters:
//      buf - destination for the descriptor bytes
//      key_part - the key part being described
//      prim_key - the primary key; must contain key_part's field
//      pk_info - two bytes per primary key part, as written by
//          pack_desc_pk_info (kind byte, then length byte)
// Output is one of:
//      COL_FIX_PK_OFFSET + 4-byte offset, when no part up to and including
//          the wanted one is a var field
//      COL_VAR_PK_OFFSET + 4-byte index of the part within the primary key
// Returns the number of bytes written to buf.
//
static uint32_t pack_desc_pk_offset_info(uchar* buf,
                                         KEY_PART_INFO* key_part,
                                         KEY* prim_key,
                                         uchar* pk_info) {
    uchar* pos = buf;
    const uint16 field_index = key_part->field->field_index;
    bool found_col_in_pk = false;
    // initialized so that no path can copy an indeterminate value
    uint32_t index_in_pk = 0;
    bool is_constant_offset = true;
    uint32_t offset = 0;

    for (uint i = 0; i < prim_key->user_defined_key_parts; i++) {
        // refer to the key part in place instead of copying the struct
        const KEY_PART_INFO* curr = &prim_key->key_part[i];
        if (pk_info[2*i] == COL_VAR_FIELD) {
            is_constant_offset = false;
        }
        if (curr->field->field_index == field_index) {
            found_col_in_pk = true;
            index_in_pk = i;
            break;
        }
        // accumulate the second info byte of every part that precedes
        // the wanted one
        offset += pk_info[2*i + 1];
    }
    assert_always(found_col_in_pk);

    if (is_constant_offset) {
        pos[0] = COL_FIX_PK_OFFSET;
        pos++;
        memcpy(pos, &offset, sizeof(offset));
        pos += sizeof(offset);
    }
    else {
        pos[0] = COL_VAR_PK_OFFSET;
        pos++;
        memcpy(pos, &index_in_pk, sizeof(index_in_pk));
        pos += sizeof(index_in_pk);
    }
    return pos - buf;
}
//
// Writes where a key part that is NOT in the primary key can be found
// inside the row:
//      COL_FIX_FIELD  + 4-byte col_pack_val of the field
//      COL_VAR_FIELD  + 4-byte col_pack_val of the field
//      COL_BLOB_FIELD + 4-byte position of the field in kc_info->blob_fields
// Returns the number of bytes written to buf.
//
static uint32_t pack_desc_offset_info(uchar* buf, KEY_AND_COL_INFO* kc_info, uint pk_index, TABLE_SHARE* table_share, KEY_PART_INFO* key_part) {
    uchar* out = buf;
    const uint16 field_index = key_part->field->field_index;
    Field* field = table_share->field[field_index];

    switch (mysql_to_toku_type(field)) {
    case toku_type_int:
    case toku_type_double:
    case toku_type_float:
    case toku_type_fixbinary:
    case toku_type_fixstring:
        *out++ = COL_FIX_FIELD;
        // copy the offset
        memcpy(
            out,
            &kc_info->cp_info[pk_index][field_index].col_pack_val,
            sizeof(uint32_t));
        out += sizeof(uint32_t);
        break;
    case toku_type_varbinary:
    case toku_type_varstring:
        *out++ = COL_VAR_FIELD;
        // copy the offset
        memcpy(
            out,
            &kc_info->cp_info[pk_index][field_index].col_pack_val,
            sizeof(uint32_t));
        out += sizeof(uint32_t);
        break;
    case toku_type_blob: {
        *out++ = COL_BLOB_FIELD;
        // look the field up in the list of blob fields and store its slot
        bool found_index = false;
        for (uint32_t slot = 0; slot < kc_info->num_blobs; slot++) {
            const uint32_t blob_index = kc_info->blob_fields[slot];
            if (blob_index != field_index) {
                continue;
            }
            memcpy(out, &slot, sizeof(uint32_t));
            out += sizeof(uint32_t);
            found_index = true;
            break;
        }
        assert_always(found_index);
        break;
    }
    default:
        assert_unreachable();
    }
    return out - buf;
}
//
// Writes the 4-byte length of a key part:
//      int/double/float     - the field length recorded in kc_info
//      fixbinary/fixstring  - the key part length, capped at the field's
//                             pack length
//      var types and blobs  - the key part length as is
// Returns the number of bytes written to buf.
//
static uint32_t pack_desc_key_length_info(uchar* buf, KEY_AND_COL_INFO* kc_info, TABLE_SHARE* table_share, KEY_PART_INFO* key_part) {
    const uint16 field_index = key_part->field->field_index;
    Field* field = table_share->field[field_index];
    uint32_t stored_length = key_part->length;

    switch (mysql_to_toku_type(field)) {
    case toku_type_int:
    case toku_type_double:
    case toku_type_float:
        stored_length = kc_info->field_lengths[field_index];
        break;
    case toku_type_fixbinary:
    case toku_type_fixstring: {
        uint32_t field_length = field->pack_length();
        set_if_smaller(stored_length, field_length);
        break;
    }
    case toku_type_varbinary:
    case toku_type_varstring:
    case toku_type_blob:
        // the key part length is stored unchanged
        break;
    default:
        assert_unreachable();
        return 0;
    }

    memcpy(buf, &stored_length, sizeof(stored_length));
    return sizeof(stored_length);
}
//
// Writes the charset information of a key part:
//      COL_HAS_NO_CHARSET                      - numeric and binary types
//      COL_HAS_CHARSET + 4-byte charset number - string types and blobs,
//                                                least significant byte
//                                                first
// Returns the number of bytes written to buf.
//
static uint32_t pack_desc_char_info(uchar* buf,
                                    TABLE_SHARE* table_share,
                                    KEY_PART_INFO* key_part) {
    Field* field = table_share->field[key_part->field->field_index];

    switch (mysql_to_toku_type(field)) {
    case toku_type_int:
    case toku_type_double:
    case toku_type_float:
    case toku_type_fixbinary:
    case toku_type_varbinary:
        buf[0] = COL_HAS_NO_CHARSET;
        return 1;
    case toku_type_fixstring:
    case toku_type_varstring:
    case toku_type_blob: {
        buf[0] = COL_HAS_CHARSET;
        // copy the charset
        const uint32_t charset_num = field->charset()->number;
        for (int byte = 0; byte < 4; byte++) {
            buf[1 + byte] = (uchar)((charset_num >> (8 * byte)) & 255);
        }
        return 5;
    }
    default:
        assert_unreachable();
    }
    return 0;
}
//
// Writes the row information shared by several descriptors:
//      4 bytes - number of null bytes in a row
//      sizeof(MULTI_COL_PACK_INFO) bytes - mcp_info of the index pk_index
//      1 byte  - number of offset bytes
// Returns the number of bytes written to buf.
//
static uint32_t pack_some_row_info (
    uchar* buf,
    uint pk_index,
    TABLE_SHARE* table_share,
    KEY_AND_COL_INFO* kc_info
    )
{
    uchar* out = buf;

    const uint32_t num_null_bytes = table_share->null_bytes;
    memcpy(out, &num_null_bytes, sizeof(num_null_bytes));
    out += sizeof(num_null_bytes);

    memcpy(out, &kc_info->mcp_info[pk_index], sizeof(MULTI_COL_PACK_INFO));
    out += sizeof(MULTI_COL_PACK_INFO);

    *out++ = (uchar)kc_info->num_offset_bytes;

    return out - buf;
}
//
// Returns an upper bound on the number of bytes that
// create_toku_clustering_val_pack_descriptor may write for this table.
//
static uint32_t get_max_clustering_val_pack_desc_size(
    TABLE_SHARE* table_share
    )
{
    // four bytes storing the length of this portion
    const uint32_t length_prefix_size = 4;
    // what pack_some_row_info writes
    const uint32_t row_info_size =
        sizeof(uint32_t) + sizeof(MULTI_COL_PACK_INFO) + 1;
    // a second mcp_info, the one of the clustering key
    const uint32_t clustering_mcp_size = sizeof(MULTI_COL_PACK_INFO);
    // a byte that states if blobs exist
    const uint32_t blob_flag_size = 1;
    // upper bound per field: a byte stating if the range is fixed or var,
    // followed by 8 bytes for its endpoints
    const uint32_t per_field_size = 1 + 2*sizeof(uint32_t);

    return row_info_size +
        clustering_mcp_size +
        blob_flag_size +
        (table_share->fields) * per_field_size +
        length_prefix_size;
}
//
// Writes the descriptor that tells how to build the value of a clustering
// key from the value of the primary dictionary.
// Layout:
//      4 bytes - total length of this descriptor, least significant byte
//                first
//      (nothing more when the key is not clustering)
//      row info, as written by pack_some_row_info
//      mcp_info of the clustering key
//      1 byte stating if blobs exist
//      zero or more CK_FIX_RANGE entries, then zero or more CK_VAR_RANGE
//      entries; each entry is the tag, a 4-byte start and a 4-byte end and
//      covers a run of consecutive columns that are not filtered out of
//      the clustering key
// Returns the number of bytes written to buf.
//
static uint32_t create_toku_clustering_val_pack_descriptor (
    uchar* buf,
    uint pk_index,
    TABLE_SHARE* table_share,
    KEY_AND_COL_INFO* kc_info,
    uint32_t keynr,
    bool is_clustering
    )
{
    // the first four bytes are filled in at the end
    uchar* out = buf + 4;

    // nothing but the length is written if the key is not clustering
    if (is_clustering) {
        out += pack_some_row_info(out, pk_index, table_share, kc_info);

        // mcp_info of the clustering key
        memcpy(out, &kc_info->mcp_info[keynr], sizeof(MULTI_COL_PACK_INFO));
        out += sizeof(MULTI_COL_PACK_INFO);

        // byte that states if blobs exist
        *out++ = (kc_info->num_blobs) ? 1 : 0;

        // The descriptor assumes that every field filtered from the pk is
        // also filtered from the clustering key val. Check it here so that
        // nothing unexpected happens.
        for (uint i = 0; i < table_share->fields; i++) {
            const bool col_filtered =
                bitmap_is_set(&kc_info->key_filters[keynr], i);
            if (bitmap_is_set(&kc_info->key_filters[pk_index], i)) {
                assert_always(col_filtered);
            }
        }

        //
        // ranges of fixed fields
        //
        bool range_open = false;
        uint32_t last_col = 0;
        for (uint i = 0; i < table_share->fields; i++) {
            const bool col_filtered =
                bitmap_is_set(&kc_info->key_filters[keynr], i);
            if (!is_fixed_field(kc_info, i)) {
                continue;
            }
            if (col_filtered) {
                if (range_open) {
                    // a filtered column ends the open range just past the
                    // last column that was kept
                    range_open = false;
                    uint32_t end_offset =
                        kc_info->cp_info[pk_index][last_col].col_pack_val +
                        kc_info->field_lengths[last_col];
                    memcpy(out, &end_offset, sizeof(end_offset));
                    out += sizeof(end_offset);
                }
                continue;
            }
            if (!range_open) {
                // first kept column of a new range
                *out++ = CK_FIX_RANGE;
                range_open = true;
                uint32_t start_offset =
                    kc_info->cp_info[pk_index][i].col_pack_val;
                memcpy(out, &start_offset, sizeof(start_offset));
                out += sizeof(start_offset);
            }
            last_col = i;
        }
        if (range_open) {
            // close the range that runs to the last fixed field
            uint32_t end_offset =
                kc_info->cp_info[pk_index][last_col].col_pack_val +
                kc_info->field_lengths[last_col];
            memcpy(out, &end_offset, sizeof(end_offset));
            out += sizeof(end_offset);
        }

        //
        // ranges of var fields; here the end of a range is the
        // col_pack_val of the last kept column itself
        //
        range_open = false;
        last_col = 0;
        for (uint i = 0; i < table_share->fields; i++) {
            const bool col_filtered =
                bitmap_is_set(&kc_info->key_filters[keynr], i);
            if (!is_variable_field(kc_info, i)) {
                continue;
            }
            if (col_filtered) {
                if (range_open) {
                    range_open = false;
                    uint32_t end_offset =
                        kc_info->cp_info[pk_index][last_col].col_pack_val;
                    memcpy(out, &end_offset, sizeof(end_offset));
                    out += sizeof(end_offset);
                }
                continue;
            }
            if (!range_open) {
                *out++ = CK_VAR_RANGE;
                range_open = true;
                uint32_t start_offset =
                    kc_info->cp_info[pk_index][i].col_pack_val;
                memcpy(out, &start_offset, sizeof(start_offset));
                out += sizeof(start_offset);
            }
            last_col = i;
        }
        if (range_open) {
            uint32_t end_offset =
                kc_info->cp_info[pk_index][last_col].col_pack_val;
            memcpy(out, &end_offset, sizeof(end_offset));
            out += sizeof(end_offset);
        }
    }

    // store the total length in the first four bytes
    const uint32_t offset = out - buf;
    buf[0] = (uchar)(offset & 255);
    buf[1] = (uchar)((offset >> 8) & 255);
    buf[2] = (uchar)((offset >> 16) & 255);
    buf[3] = (uchar)((offset >> 24) & 255);
    return out - buf;
}
//
// Builds the value of a clustering key in buf from the value of the
// primary dictionary (pk_val), following a descriptor of the form written
// by create_toku_clustering_val_pack_descriptor (without its four leading
// length bytes).
// Parameters:
//      buf - destination buffer
//      row_desc/row_desc_size - descriptor bytes: number of null bytes,
//          source mcp_info, number of offset bytes, destination mcp_info,
//          has-blobs byte, then CK_FIX_RANGE / CK_VAR_RANGE entries
//      pk_val - the source row
// Returns:
//      the number of bytes written to buf
//
static uint32_t pack_clustering_val_from_desc(
uchar* buf,
void* row_desc,
uint32_t row_desc_size,
const DBT* pk_val
)
{
uchar* null_bytes_src_ptr = NULL;
uchar* fixed_src_ptr = NULL;
uchar* var_src_offset_ptr = NULL;
uchar* var_src_data_ptr = NULL;
uchar* fixed_dest_ptr = NULL;
uchar* var_dest_offset_ptr = NULL;
uchar* var_dest_data_ptr = NULL;
uchar* orig_var_dest_data_ptr = NULL;
uchar* desc_pos = (uchar *)row_desc;
uint32_t num_null_bytes = 0;
uint32_t num_offset_bytes;
MULTI_COL_PACK_INFO src_mcp_info, dest_mcp_info;
uchar has_blobs;
// read the fixed-size header of the descriptor
memcpy(&num_null_bytes, desc_pos, sizeof(num_null_bytes));
desc_pos += sizeof(num_null_bytes);
memcpy(&src_mcp_info, desc_pos, sizeof(src_mcp_info));
desc_pos += sizeof(src_mcp_info);
num_offset_bytes = desc_pos[0];
desc_pos++;
memcpy(&dest_mcp_info, desc_pos, sizeof(dest_mcp_info));
desc_pos += sizeof(dest_mcp_info);
has_blobs = desc_pos[0];
desc_pos++;
//
//set the variables
//
// both rows are laid out as: null bytes, fixed fields, var field offsets,
// var field data
null_bytes_src_ptr = (uchar *)pk_val->data;
fixed_src_ptr = null_bytes_src_ptr + num_null_bytes;
var_src_offset_ptr = fixed_src_ptr + src_mcp_info.fixed_field_size;
var_src_data_ptr = var_src_offset_ptr + src_mcp_info.len_of_offsets;
fixed_dest_ptr = buf + num_null_bytes;
var_dest_offset_ptr = fixed_dest_ptr + dest_mcp_info.fixed_field_size;
var_dest_data_ptr = var_dest_offset_ptr + dest_mcp_info.len_of_offsets;
orig_var_dest_data_ptr = var_dest_data_ptr;
//
// copy the null bytes
//
memcpy(buf, null_bytes_src_ptr, num_null_bytes);
// process the range entries: one tag byte, 4-byte start, 4-byte end each
while ( (uint32_t)(desc_pos - (uchar *)row_desc) < row_desc_size) {
uint32_t start, end, length;
uchar curr = desc_pos[0];
desc_pos++;
memcpy(&start, desc_pos, sizeof(start));
desc_pos += sizeof(start);
memcpy(&end, desc_pos, sizeof(end));
desc_pos += sizeof(end);
assert_always (start <= end);
if (curr == CK_FIX_RANGE) {
// start and end are byte offsets into the fixed fields of the source
length = end - start;
memcpy(fixed_dest_ptr, fixed_src_ptr + start, length);
fixed_dest_ptr += length;
}
else if (curr == CK_VAR_RANGE) {
// start and end index var fields; both ends are included (see the
// i <= end loop below)
uint32_t start_data_size;
uint32_t start_data_offset;
uint32_t end_data_size;
uint32_t end_data_offset;
uint32_t offset_diffs;
// NOTE(review): get_var_field_info presumably yields the size and the
// data offset of the var field with the given index - confirm
get_var_field_info(
&start_data_size,
&start_data_offset,
start,
var_src_offset_ptr,
num_offset_bytes
);
get_var_field_info(
&end_data_size,
&end_data_offset,
end,
var_src_offset_ptr,
num_offset_bytes
);
// bytes from the beginning of the first field to the end of the last
length = end_data_offset + end_data_size - start_data_offset;
//
// copy the data
//
memcpy(
var_dest_data_ptr,
var_src_data_ptr + start_data_offset,
length
);
var_dest_data_ptr += length;
//
// put in offset info
//
// difference between where the range ends in the source data and where
// it ends in the destination data; subtracting it rebases a source
// offset onto the destination
offset_diffs = (end_data_offset + end_data_size) - (uint32_t)(var_dest_data_ptr - orig_var_dest_data_ptr);
for (uint32_t i = start; i <= end; i++) {
if ( num_offset_bytes == 1 ) {
assert_always(offset_diffs < 256);
var_dest_offset_ptr[0] = var_src_offset_ptr[i] - (uchar)offset_diffs;
var_dest_offset_ptr++;
} else if ( num_offset_bytes == 2 ) {
uint32_t tmp = uint2korr(var_src_offset_ptr + 2*i);
uint32_t new_offset = tmp - offset_diffs;
assert_always(new_offset < 1<<16);
int2store(var_dest_offset_ptr,new_offset);
var_dest_offset_ptr += 2;
} else {
assert_unreachable();
}
}
} else {
assert_unreachable();
}
}
//
// copy blobs
// at this point, var_dest_data_ptr is pointing to the end, where blobs should be located
// so, we put the blobs at var_dest_data_ptr
//
if (has_blobs) {
uint32_t num_blob_bytes;
uint32_t start_offset;
uchar* src_blob_ptr = NULL;
// NOTE(review): get_blob_field_info presumably yields the offset, within
// the var data, at which the blobs begin - confirm
get_blob_field_info(
&start_offset,
src_mcp_info.len_of_offsets,
var_src_data_ptr,
num_offset_bytes
);
src_blob_ptr = var_src_data_ptr + start_offset;
// everything from src_blob_ptr to the end of the source row is copied
num_blob_bytes = pk_val->size - (start_offset + (var_src_data_ptr - null_bytes_src_ptr));
memcpy(var_dest_data_ptr, src_blob_ptr, num_blob_bytes);
var_dest_data_ptr += num_blob_bytes;
}
return var_dest_data_ptr - buf;
}
//
// Returns an upper bound on the number of bytes that
// create_toku_secondary_key_pack_descriptor may write.
//
static uint32_t get_max_secondary_key_pack_desc_size(
    KEY_AND_COL_INFO* kc_info
    )
{
    // four bytes storing the length of this portion
    const uint32_t length_prefix_size = 4;
    // byte that states if main dictionary, byte that states if hpk
    const uint32_t flags_size = 2;
    // the things in pack_some_row_info
    const uint32_t row_info_size =
        sizeof(uint32_t) + sizeof(MULTI_COL_PACK_INFO) + 1;
    // the number of blobs, then one byte per blob
    const uint32_t blob_info_size =
        sizeof(kc_info->num_blobs) + kc_info->num_blobs;
    // the pk: one byte for num key parts, two bytes for each key part
    const uint32_t pk_info_size = 1 + MAX_REF_PARTS*2;
    // each key part: null bit, then null byte offset, then 1 byte stating
    // what it is, then 4 for offset, 4 for key length and 4 for charset,
    // and 1 for if charset exists
    const uint32_t key_info_size =
        MAX_REF_PARTS*(1 + sizeof(uint32_t) + 1 + 3*sizeof(uint32_t) + 1);

    return flags_size +
        row_info_size +
        blob_info_size +
        pk_info_size +
        key_info_size +
        length_prefix_size;
}
//
// Writes the descriptor that tells how to build a secondary key from the
// key and the value of the primary dictionary.
// Layout:
//      4 bytes - offset of where this descriptor ends, least significant
//                byte first
//      1 byte  - 0, stating that this is NOT the main dictionary
//      1 byte  - 1 if the main dictionary has an hpk, else 0
//      row info, as written by pack_some_row_info
//      number of blobs, then one length byte per blob
//      pk info: one byte holding twice the number of pk parts (0 with an
//               hpk), then two bytes per pk part (pack_desc_pk_info)
//      per key part: null bit, null offset (only if nullable), offset
//               info, key length info and charset info
// Returns the number of bytes written to buf.
//
static uint32_t create_toku_secondary_key_pack_descriptor (
    uchar* buf,
    bool has_hpk,
    uint pk_index,
    TABLE_SHARE* table_share,
    TABLE* table,
    KEY_AND_COL_INFO* kc_info,
    KEY* key_info,
    KEY* prim_key
    )
{
    uchar* pk_info = NULL;
    // the first four bytes are filled in at the end
    uchar* pos = buf + 4;

    // NOT the main dictionary
    *pos++ = 0;
    // does the main dictionary have an hpk?
    *pos++ = has_hpk ? 1 : 0;

    pos += pack_some_row_info(pos, pk_index, table_share, kc_info);

    //
    // blob information: the count, then the length bytes of each blob
    //
    memcpy(pos, &kc_info->num_blobs, sizeof(kc_info->num_blobs));
    pos += sizeof(uint32_t);
    for (uint32_t i = 0; i < kc_info->num_blobs; i++) {
        Field* blob_field = table_share->field[kc_info->blob_fields[i]];
        *pos++ = (uchar)blob_field->row_pack_length();
    }

    //
    // pk information
    //
    if (has_hpk) {
        *pos++ = 0;
    }
    else {
        // the byte stores twice the number of parts
        assert_always(prim_key->user_defined_key_parts < 128);
        *pos++ = 2 * prim_key->user_defined_key_parts;
        // For each part, store if it is a fixed field or var field; if
        // fixed, the number of bytes, if var, the number of length bytes.
        // That is two bytes per key part.
        pk_info = pos;
        uchar* tmp = pos;
        for (uint i = 0; i < prim_key->user_defined_key_parts; i++) {
            tmp += pack_desc_pk_info(
                tmp,
                kc_info,
                table_share,
                &prim_key->key_part[i]);
        }
        // make sure we moved forward as much as we think we have
        assert_always(tmp - pos == (2 * prim_key->user_defined_key_parts));
        pos = tmp;
    }

    //
    // one entry per part of the secondary key
    //
    for (uint i = 0; i < key_info->user_defined_key_parts; i++) {
        KEY_PART_INFO* curr_kpi = &key_info->key_part[i];
        const uint16 field_index = curr_kpi->field->field_index;
        Field* field = table_share->field[field_index];
        const bool is_col_in_pk =
            bitmap_is_set(&kc_info->key_filters[pk_index], field_index);
        if (is_col_in_pk) {
            assert_always(!has_hpk && prim_key != NULL);
        }

        *pos++ = field->null_bit;
        if (is_col_in_pk) {
            // pk columns cannot be null in MySQL, so they must not have
            // a null bit
            assert_always(!field->null_bit);
        }
        if (field->null_bit) {
            uint32_t null_offset =
                get_null_offset(table, table->field[field_index]);
            memcpy(pos, &null_offset, sizeof(uint32_t));
            pos += sizeof(uint32_t);
        }

        // where the column lives: in the pk key or in the pk val
        if (is_col_in_pk) {
            pos += pack_desc_pk_offset_info(pos, curr_kpi, prim_key, pk_info);
        }
        else {
            pos += pack_desc_offset_info(
                pos,
                kc_info,
                pk_index,
                table_share,
                curr_kpi);
        }
        pos += pack_desc_key_length_info(pos, kc_info, table_share, curr_kpi);
        pos += pack_desc_char_info(pos, table_share, curr_kpi);
    }

    // store the total length in the first four bytes
    const uint32_t offset = pos - buf;
    buf[0] = (uchar)(offset & 255);
    buf[1] = (uchar)((offset >> 8) & 255);
    buf[2] = (uchar)((offset >> 16) & 255);
    buf[3] = (uchar)((offset >> 24) & 255);
    return pos - buf;
}
//
// Returns the number of descriptor bytes taken by one key part, starting
// at the byte that states if it is a fix field or var field: that byte,
// the 4-byte offset information, the 4-byte key_part_length, the charset
// tag and, unless the tag is COL_HAS_NO_CHARSET, the 4-byte charset.
//
static uint32_t skip_key_in_desc(
    uchar* row_desc
    )
{
    // fix/var byte, offset information, key_part_length info
    uint32_t skipped = 1 + sizeof(uint32_t) + sizeof(uint32_t);

    const uchar col_bin_or_char = row_desc[skipped];
    skipped++;
    if (col_bin_or_char != COL_HAS_NO_CHARSET) {
        // skip the charset info
        skipped += 4;
    }
    return skipped;
}
//
// Returns an upper bound on the size of a key built from the given
// secondary key descriptor: one infinity byte, plus for every key part a
// null byte (if the part is nullable), its key length and two bytes for
// potential length bytes. The descriptor must not be the one of the main
// dictionary.
//
static uint32_t max_key_size_from_desc(
    void* row_desc,
    uint32_t row_desc_size
    )
{
    uchar* desc_pos = (uchar *)row_desc;
    // start at 1 for the infinity byte
    uint32_t max_size = 1;

    bool is_main_dictionary = desc_pos[0];
    assert_always(!is_main_dictionary);

    // skip the main dictionary byte, the hpk byte, num_null_bytes,
    // mcp_info and offset_bytes
    desc_pos += 1 + 1 + sizeof(uint32_t) + sizeof(MULTI_COL_PACK_INFO) + 1;

    // skip over blobs: the count and one byte per blob
    uint32_t num_blobs;
    memcpy(&num_blobs, desc_pos, sizeof(num_blobs));
    desc_pos += sizeof(num_blobs) + num_blobs;

    // skip over pk info: the count byte and two bytes per pk column
    const uint32_t num_pk_columns = desc_pos[0]/2;
    desc_pos += 1 + 2*num_pk_columns;

    while ( (uint32_t)(desc_pos - (uchar *)row_desc) < row_desc_size) {
        const uchar null_bit = *desc_pos++;
        if (null_bit) {
            // column is NULLable: add a null byte and skip null_offset
            max_size++;
            desc_pos += sizeof(uint32_t);
        }

        // skip the byte that states if fix or var, and the offset
        desc_pos += 1 + sizeof(uint32_t);

        // get the key length and add it to the return value
        uint32_t key_length = 0;
        memcpy(&key_length, desc_pos, sizeof(key_length));
        desc_pos += sizeof(key_length);
        // 2 bytes for potential length bytes; this is an upper bound and
        // does not need to be super tight
        max_size += key_length + 2;

        const uchar has_charset = *desc_pos++;
        if (has_charset == COL_HAS_CHARSET) {
            // skip over the charset number
            desc_pos += sizeof(uint32_t);
        }
        else {
            assert_always(has_charset == COL_HAS_NO_CHARSET);
        }
    }
    return max_size;
}
//
// Packs a key into buf by walking a row descriptor and pulling each key
// column either out of the primary dictionary's value (pk_val) or out of
// the primary dictionary's key (pk_key).
//
// Parameters:
//   buf           - destination buffer for the packed key. No bounds check
//                   is done in this function, so the caller must supply a
//                   buffer that is large enough.
//   row_desc      - descriptor bytes that drive the packing (layout below)
//   row_desc_size - size of row_desc in bytes; the per-column loop runs
//                   until exactly this many bytes have been consumed
//   pk_key        - key of the primary dictionary row
//   pk_val        - value of the primary dictionary row
//
// Descriptor layout, in the order it is read here:
//   1 byte   is_main_dictionary flag (asserted to be false)
//   1 byte   hpk flag (presumably "hidden primary key" - TODO confirm)
//   uint32   num_null_bytes
//   MULTI_COL_PACK_INFO
//   1 byte   num_offset_bytes
//   uint32   num_blobs, followed by one length-size byte per blob
//   1 byte   twice the number of pk columns, followed by 2 bytes per pk column
//   then one entry per key column, until row_desc_size is reached
//
// Output layout: one COL_ZERO "infinity" byte, then each key column
// (nullable columns are preceded by a NULL_COL_VAL/NONNULL_COL_VAL byte),
// then the primary key bytes appended at the end.
//
// Returns the number of bytes written to buf.
//
static uint32_t pack_key_from_desc(
uchar* buf,
void* row_desc,
uint32_t row_desc_size,
const DBT* pk_key,
const DBT* pk_val) {
MULTI_COL_PACK_INFO mcp_info;
uint32_t num_null_bytes;
uint32_t num_blobs;
uint32_t num_pk_columns;
uchar* blob_lengths = NULL;
uchar* pk_info = NULL;
uchar* pk_data_ptr = NULL;
uchar* null_bytes_ptr = NULL;
uchar* fixed_field_ptr = NULL;
uchar* var_field_offset_ptr = NULL;
const uchar* var_field_data_ptr = NULL;
uint32_t num_offset_bytes;
uchar* packed_key_pos = buf;
uchar* desc_pos = (uchar *)row_desc;
// first descriptor byte says whether this descriptor belongs to the main
// dictionary; this function only handles descriptors where it is false
bool is_main_dictionary = desc_pos[0];
desc_pos++;
assert_always(!is_main_dictionary);
//
// get the constant info out of descriptor
//
bool hpk = desc_pos[0];
desc_pos++;
// multi-byte values are read with memcpy, so no alignment of the
// descriptor is required
memcpy(&num_null_bytes, desc_pos, sizeof(num_null_bytes));
desc_pos += sizeof(num_null_bytes);
memcpy(&mcp_info, desc_pos, sizeof(mcp_info));
desc_pos += sizeof(mcp_info);
num_offset_bytes = desc_pos[0];
desc_pos++;
memcpy(&num_blobs, desc_pos, sizeof(num_blobs));
desc_pos += sizeof(num_blobs);
// one byte per blob: the number of bytes used to store that blob's length
blob_lengths = desc_pos;
desc_pos += num_blobs;
// the stored byte is twice the pk column count, because each pk column
// occupies two bytes of pk_info: [2*i] is COL_FIX_FIELD/COL_VAR_FIELD and
// [2*i + 1] is the fixed length or the number of length bytes
num_pk_columns = desc_pos[0]/2;
desc_pos++;
pk_info = desc_pos;
desc_pos += 2*num_pk_columns;
//
// now start packing the key
//
//
// pack the infinity byte
//
packed_key_pos[0] = COL_ZERO;
packed_key_pos++;
//
// now start packing each column of the key, as described in descriptor
//
// NOTE(review): when hpk is set, pk_data_ptr stays NULL; the
// COL_FIX_PK_OFFSET / COL_VAR_PK_OFFSET branches below presumably never
// occur in that case - confirm against the code that builds the descriptor
if (!hpk) {
// +1 for the infinity byte
pk_data_ptr = (uchar *)pk_key->data + 1;
}
// pk_val is read as: null bytes, then fixed-size fields, then the
// variable-field offsets, then the variable-field (and blob) data
null_bytes_ptr = (uchar *)pk_val->data;
fixed_field_ptr = null_bytes_ptr + num_null_bytes;
var_field_offset_ptr = fixed_field_ptr + mcp_info.fixed_field_size;
var_field_data_ptr = var_field_offset_ptr + mcp_info.len_of_offsets;
while ((uint32_t)(desc_pos - (uchar*)row_desc) < row_desc_size) {
uchar col_fix_val;
uchar has_charset;
uint32_t col_pack_val = 0;
uint32_t key_length = 0;
// a zero null_bit means the column is not nullable; otherwise it is the
// bit mask to test within the null byte at null_offset
uchar null_bit = desc_pos[0];
desc_pos++;
if (null_bit) {
//
// column is NULLable, need to check the null bytes to see if it is NULL
//
uint32_t null_offset = 0;
bool is_field_null;
memcpy(&null_offset, desc_pos, sizeof(null_offset));
desc_pos += sizeof(null_offset);
is_field_null = (null_bytes_ptr[null_offset] & null_bit) ? true: false;
if (is_field_null) {
// a NULL column contributes only its marker byte; the rest of
// this column's descriptor entry is skipped
packed_key_pos[0] = NULL_COL_VAL;
packed_key_pos++;
desc_pos += skip_key_in_desc(desc_pos);
continue;
} else {
packed_key_pos[0] = NONNULL_COL_VAL;
packed_key_pos++;
}
}
//
// now pack the column (unless it was NULL, and we continued)
//
// per-column entry: col_fix_val (where/how the column is stored),
// col_pack_val (an offset or index, depending on col_fix_val),
// key_length, a charset flag and, if flagged, the charset number
col_fix_val = desc_pos[0];
desc_pos++;
memcpy(&col_pack_val, desc_pos, sizeof(col_pack_val));
desc_pos += sizeof(col_pack_val);
memcpy(&key_length, desc_pos, sizeof(key_length));
desc_pos += sizeof(key_length);
has_charset = desc_pos[0];
desc_pos++;
uint32_t charset_num = 0;
if (has_charset == COL_HAS_CHARSET) {
memcpy(&charset_num, desc_pos, sizeof(charset_num));
desc_pos += sizeof(charset_num);
} else {
assert_always(has_charset == COL_HAS_NO_CHARSET);
}
//
// case where column is in pk val
//
if (col_fix_val == COL_FIX_FIELD ||
col_fix_val == COL_VAR_FIELD ||
col_fix_val == COL_BLOB_FIELD) {
if (col_fix_val == COL_FIX_FIELD &&
has_charset == COL_HAS_NO_CHARSET) {
// fixed-size field without a charset: raw copy of key_length
// bytes from offset col_pack_val in the fixed-field area
memcpy(
packed_key_pos,
&fixed_field_ptr[col_pack_val],
key_length);
packed_key_pos += key_length;
} else if (col_fix_val == COL_VAR_FIELD &&
has_charset == COL_HAS_NO_CHARSET) {
// variable-size field without a charset: packed as varbinary
uint32_t data_start_offset = 0;
uint32_t data_size = 0;
get_var_field_info(
&data_size,
&data_start_offset,
col_pack_val,
var_field_offset_ptr,
num_offset_bytes);
//
// length of this field in this row is data_size
// data is located beginning at var_field_data_ptr + data_start_offset
//
packed_key_pos = pack_toku_varbinary_from_desc(
packed_key_pos,
var_field_data_ptr + data_start_offset,
//number of bytes to use to encode the length in to_tokudb
key_length,
//length of field
data_size);
} else {
// remaining cases: fixed or variable field with a charset, or any
// blob field (with or without a charset; charset_num is 0 when
// none was given). Locate the data, then pack it as a varstring.
const uchar* data_start = NULL;
uint32_t data_start_offset = 0;
uint32_t data_size = 0;
if (col_fix_val == COL_FIX_FIELD) {
data_start_offset = col_pack_val;
data_size = key_length;
data_start = fixed_field_ptr + data_start_offset;
} else if (col_fix_val == COL_VAR_FIELD){
get_var_field_info(
&data_size,
&data_start_offset,
col_pack_val,
var_field_offset_ptr,
num_offset_bytes);
data_start = var_field_data_ptr + data_start_offset;
} else if (col_fix_val == COL_BLOB_FIELD) {
// for blobs, col_pack_val is the index of the blob
uint32_t blob_index = col_pack_val;
uint32_t blob_offset;
const uchar* blob_ptr = NULL;
uint32_t field_len;
uint32_t field_len_bytes = blob_lengths[blob_index];
get_blob_field_info(
&blob_offset,
mcp_info.len_of_offsets,
var_field_data_ptr,
num_offset_bytes);
blob_ptr = var_field_data_ptr + blob_offset;
assert_always(num_blobs > 0);
// skip over other blobs to get to the one we want to
// make a key out of
for (uint32_t i = 0; i < blob_index; i++) {
blob_ptr = unpack_toku_field_blob(
NULL,
blob_ptr,
blob_lengths[i],
true);
}
// at this point, blob_ptr is pointing to the blob we
// want to make a key from
field_len = get_blob_field_len(blob_ptr, field_len_bytes);
// now we set the variables to make the key
data_start = blob_ptr + field_len_bytes;
data_size = field_len;
} else {
assert_unreachable();
}
packed_key_pos = pack_toku_varstring_from_desc(packed_key_pos,
data_start,
key_length,
data_size,
charset_num);
}
} else {
// case where column is in pk key
if (col_fix_val == COL_FIX_PK_OFFSET) {
// col_pack_val is a byte offset into the pk data
memcpy(packed_key_pos, &pk_data_ptr[col_pack_val], key_length);
packed_key_pos += key_length;
} else if (col_fix_val == COL_VAR_PK_OFFSET) {
// col_pack_val is the index of the column within the pk, so the
// preceding pk columns have to be walked to find where it starts
uchar* tmp_pk_data_ptr = pk_data_ptr;
uint32_t index_in_pk = col_pack_val;
//
// skip along in pk to the right column
//
for (uint32_t i = 0; i < index_in_pk; i++) {
if (pk_info[2*i] == COL_FIX_FIELD) {
// fixed pk column: pk_info[2*i + 1] is its length in bytes
tmp_pk_data_ptr += pk_info[2*i + 1];
} else if (pk_info[2*i] == COL_VAR_FIELD) {
// variable pk column: pk_info[2*i + 1] is the number of
// length bytes (1 or 2) stored in front of the data
uint32_t len_bytes = pk_info[2*i + 1];
uint32_t len;
if (len_bytes == 1) {
len = tmp_pk_data_ptr[0];
tmp_pk_data_ptr++;
} else if (len_bytes == 2) {
len = uint2korr(tmp_pk_data_ptr);
tmp_pk_data_ptr += 2;
} else {
assert_unreachable();
}
tmp_pk_data_ptr += len;
} else {
assert_unreachable();
}
}
//
// at this point, tmp_pk_data_ptr is pointing at the column
//
uint32_t is_fix_field = pk_info[2*index_in_pk];
if (is_fix_field == COL_FIX_FIELD) {
memcpy(packed_key_pos, tmp_pk_data_ptr, key_length);
packed_key_pos += key_length;
} else if (is_fix_field == COL_VAR_FIELD) {
const uchar* data_start = NULL;
uint32_t data_size = 0;
uint32_t len_bytes = pk_info[2*index_in_pk + 1];
// read the 1- or 2-byte length prefix, then step past it
if (len_bytes == 1) {
data_size = tmp_pk_data_ptr[0];
tmp_pk_data_ptr++;
} else if (len_bytes == 2) {
data_size = uint2korr(tmp_pk_data_ptr);
tmp_pk_data_ptr += 2;
} else {
assert_unreachable();
}
data_start = tmp_pk_data_ptr;
// charset columns are packed as varstring, others as varbinary
if (has_charset == COL_HAS_CHARSET) {
packed_key_pos = pack_toku_varstring_from_desc(
packed_key_pos,
data_start,
key_length,
data_size,
charset_num);
} else if (has_charset == COL_HAS_NO_CHARSET) {
packed_key_pos = pack_toku_varbinary_from_desc(
packed_key_pos,
data_start,
key_length,
data_size);
} else {
assert_unreachable();
}
} else {
assert_unreachable();
}
} else {
assert_unreachable();
}
}
}
// the column entries must consume the descriptor exactly
assert_always( (uint32_t)(desc_pos - (uchar *)row_desc) == row_desc_size);
//
// now append the primary key to the end of the key
//
if (hpk) {
// with hpk the whole pk_key is appended as-is
memcpy(packed_key_pos, pk_key->data, pk_key->size);
packed_key_pos += pk_key->size;
} else {
// otherwise the first byte of pk_key (its infinity byte) is dropped
memcpy(packed_key_pos, (uchar *)pk_key->data + 1, pk_key->size - 1);
packed_key_pos += (pk_key->size - 1);
}
return (uint32_t)(packed_key_pos - buf);
}
// Returns true when the two fields carry exactly the same name
// (byte-wise, case-sensitive comparison of field_name).
static bool fields_have_same_name(Field* a, Field* b) {
    const int name_order = strcmp(a->field_name, b->field_name);
    return !name_order;
}
// Returns true when fields a and b have equivalent column definitions.
//
// The checks are, in order:
//   - same MySQL real type
//   - same TokuDB type
//   - same nullability
//   - type-specific attributes (pack length, signedness, auto-increment,
//     enum/set definition, charset, ...) depending on the MySQL type
//
// Field names are NOT compared here; see fields_have_same_name for that.
static bool fields_are_same_type(Field* a, Field* b) {
    enum_field_types a_mysql_type = a->real_type();
    enum_field_types b_mysql_type = b->real_type();
    // both toku types are computed up front, before any early return
    TOKU_TYPE a_toku_type = mysql_to_toku_type(a);
    TOKU_TYPE b_toku_type = mysql_to_toku_type(b);

    // make sure have same types
    if (a_mysql_type != b_mysql_type) {
        return false;
    }
    // Thanks to MariaDB 5.5, we can have two fields
    // be the same MySQL type but not the same toku type,
    // This is an issue introduced with MariaDB's fractional time
    // implementation
    if (a_toku_type != b_toku_type) {
        return false;
    }
    // make sure that either both are nullable, or both not nullable
    if ((a->null_bit && !b->null_bit) || (!a->null_bit && b->null_bit)) {
        return false;
    }

    switch (a_mysql_type) {
    case MYSQL_TYPE_TINY:
    case MYSQL_TYPE_SHORT:
    case MYSQL_TYPE_INT24:
    case MYSQL_TYPE_LONG:
    case MYSQL_TYPE_LONGLONG:
    case MYSQL_TYPE_DOUBLE:
    case MYSQL_TYPE_FLOAT:
        // length, unsigned, auto increment
        if (a->pack_length() != b->pack_length() ||
            (a->flags & UNSIGNED_FLAG) != (b->flags & UNSIGNED_FLAG) ||
            (a->flags & AUTO_INCREMENT_FLAG) !=
                (b->flags & AUTO_INCREMENT_FLAG)) {
            return false;
        }
        break;
    case MYSQL_TYPE_NEWDECIMAL:
        // length, unsigned
        if (a->pack_length() != b->pack_length() ||
            (a->flags & UNSIGNED_FLAG) != (b->flags & UNSIGNED_FLAG)) {
            return false;
        }
        break;
    case MYSQL_TYPE_ENUM: {
        // the real type was checked above, so a is handled as a Field_enum
        Field_enum *a_enum = static_cast<Field_enum *>(a);
        if (!a_enum->eq_def(b)) {
            return false;
        }
        break;
    }
    case MYSQL_TYPE_SET: {
        // the real type was checked above, so a is handled as a Field_set
        Field_set *a_set = static_cast<Field_set *>(a);
        if (!a_set->eq_def(b)) {
            return false;
        }
        break;
    }
    case MYSQL_TYPE_BIT:
    case MYSQL_TYPE_DATE:
    case MYSQL_TYPE_DATETIME:
    case MYSQL_TYPE_YEAR:
    case MYSQL_TYPE_NEWDATE:
    case MYSQL_TYPE_TIME:
    case MYSQL_TYPE_TIMESTAMP:
#if (50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699) || \
    (50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799) || \
    (100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100199)
    case MYSQL_TYPE_DATETIME2:
    case MYSQL_TYPE_TIMESTAMP2:
    case MYSQL_TYPE_TIME2:
#endif
        // length
        if (a->pack_length() != b->pack_length()) {
            return false;
        }
        break;
    case MYSQL_TYPE_TINY_BLOB:
    case MYSQL_TYPE_MEDIUM_BLOB:
    case MYSQL_TYPE_BLOB:
    case MYSQL_TYPE_LONG_BLOB:
        // test the charset
        if (a->charset()->number != b->charset()->number) {
            return false;
        }
        if (a->row_pack_length() != b->row_pack_length()) {
            return false;
        }
        break;
    case MYSQL_TYPE_STRING:
        if (a->pack_length() != b->pack_length()) {
            return false;
        }
        // if both are binary, we know have same pack lengths,
        // so there is nothing more to check
        if (a->binary() && b->binary()) {
            // nothing to do, we are good
        } else if (!a->binary() && !b->binary()) {
            // test the charset
            if (a->charset()->number != b->charset()->number) {
                return false;
            }
        } else {
            // one is binary and the other is not, so not the same
            return false;
        }
        break;
    case MYSQL_TYPE_VARCHAR:
        if (a->field_length != b->field_length) {
            return false;
        }
        // if both are binary, we know have same field lengths,
        // so there is nothing more to check
        if (a->binary() && b->binary()) {
            // nothing to do, we are good
        } else if (!a->binary() && !b->binary()) {
            // test the charset
            if (a->charset()->number != b->charset()->number) {
                return false;
            }
        } else {
            // one is binary and the other is not, so not the same
            return false;
        }
        break;
    //
    // I believe these are old types that are no longer
    // in any 5.1 tables, so tokudb does not need
    // to worry about them
    // Putting in this assert in case I am wrong.
    // Do not support geometry yet.
    //
    case MYSQL_TYPE_GEOMETRY:
    case MYSQL_TYPE_DECIMAL:
    case MYSQL_TYPE_VAR_STRING:
    case MYSQL_TYPE_NULL:
        assert_unreachable();
    }
    // any type not listed in the switch is treated as matching, since the
    // general checks above already passed
    return true;
}
// Two fields are considered the same when their names match and their
// type definitions match. The type comparison is only evaluated when the
// names are equal.
static bool are_two_fields_same(Field* a, Field* b) {
    if (!fields_have_same_name(a, b)) {
        return false;
    }
    return fields_are_same_type(a, b);
}