summaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
Diffstat (limited to 'sql')
-rw-r--r--sql/sql_load.cc189
-rw-r--r--sql/sql_string.h1
2 files changed, 145 insertions, 45 deletions
diff --git a/sql/sql_load.cc b/sql/sql_load.cc
index af4b25185d0..51a284964e1 100644
--- a/sql/sql_load.cc
+++ b/sql/sql_load.cc
@@ -79,6 +79,81 @@ class READ_INFO {
NET *io_net;
int level; /* for load xml */
+
+#if MYSQL_VERSION_ID >= 100200
+#error This 10.0 and 10.1 specific fix should be removed in 10.2.
+#error Fix read_mbtail() to use my_charlen() instead of my_charlen_tmp()
+#else
+ int my_charlen_tmp(CHARSET_INFO *cs, const char *str, const char *end) // Temporary helper (see the #error note above: to be replaced by my_charlen() in 10.2).
+ {
+ my_wc_t wc; // Receives the decoded code point; only the return code of mb_wc() is used.
+ return cs->cset->mb_wc(cs, &wc, (const uchar *) str, (const uchar *) end); // Raw mb_wc() result for the bytes in [str, end); read_mbtail() tests it against 1, MY_CS_IS_TOOSMALL() and MY_CS_ILSEQ.
+ }
+
+ /**
+ Read a tail of a multi-byte character.
+ The first byte of the character is assumed to be already
+ read from the file and appended to "str".
+
+ @returns true - if EOF happened unexpectedly
+ @returns false - no EOF happened: found a good multi-byte character,
+ or a bad byte sequence
+
+ Note:
+ The return value depends only on EOF:
+ - read_mbtail() returns "false" if a good character was read, but also
+ - read_mbtail() returns "false" if an incomplete byte sequence was found
+ and no EOF happened.
+
+ For example, suppose we have an ujis file with bytes 0x8FA10A, where:
+ - 0x8FA1 is an incomplete prefix of a 3-byte character
+ (it should be [8F][A1-FE][A1-FE] to make a full 3-byte character)
+ - 0x0A is a line delimiter
+ This file has some broken data, the trailing [A1-FE] is missing.
+
+ In this example it works as follows:
+ - 0x8F is read from the file and put into "str" before the call
+ to read_mbtail()
+ - 0xA1 is read from the file and put into "str" by read_mbtail()
+ - 0x0A is kept in the read queue, so the next read iteration after
+ the current read_mbtail() call will normally find it and recognize it as
+ a line delimiter
+ - the current call for read_mbtail() returns "false",
+ because no EOF happened
+ */
+ bool read_mbtail(String *str)
+ {
+ int chlen; // Latest my_charlen_tmp() result for the character being assembled.
+ if ((chlen= my_charlen_tmp(read_charset, str->end() - 1, str->end())) == 1) // Examine only the last byte of "str" (the already-read head byte).
+ return false; // Single byte character found
+ for (uint32 length0= str->length() - 1 ; MY_CS_IS_TOOSMALL(chlen); ) // length0: offset in "str" of the head byte; keep reading while the sequence is reported as too short.
+ {
+ int chr= GET; // Next input byte (the GET macro is defined outside this hunk).
+ if (chr == my_b_EOF)
+ {
+ DBUG_PRINT("info", ("read_mbtail: chlen=%d; unexpected EOF", chlen));
+ return true; // EOF
+ }
+ str->append(chr); // Tentatively add the byte to the sequence.
+ chlen= my_charlen_tmp(read_charset, str->ptr() + length0, str->end()); // Re-check the whole sequence from the head byte, not just the new byte.
+ if (chlen == MY_CS_ILSEQ)
+ {
+ /**
+ It has been an incomplete (but a valid) sequence so far,
+ but the last byte turned it into a bad byte sequence.
+ Unget the very last byte.
+ */
+ str->length(str->length() - 1); // Remove the just-appended byte from "str".
+ PUSH(chr); // Hand the byte back so that a later GET sees it again (PUSH is defined outside this hunk).
+ DBUG_PRINT("info", ("read_mbtail: ILSEQ"));
+ return false; // Bad byte sequence
+ }
+ }
+ DBUG_PRINT("info", ("read_mbtail: chlen=%d", chlen)); // Also reached without reading anything when the initial check was neither 1 nor "too small".
+ return false; // Good multi-byte character
+ }
+#endif
+
public:
bool error,line_cuted,found_null,enclosed;
uchar *row_start, /* Found row starts here */
@@ -1474,6 +1549,54 @@ inline int READ_INFO::terminator(const uchar *ptr,uint length)
}
+/**
+ Read a field.
+
+ The data in the loaded file was presumably escaped using
+ - either select_export::send_data() OUTFILE
+ - or mysql_real_escape_string()
+ using the same character set with the one specified in the current
+ "LOAD DATA INFILE ... CHARACTER SET ..." (or the default LOAD character set).
+
+ Note, non-escaped multi-byte characters are scanned as a single entity.
+ This is needed to correctly distinguish between:
+ - 0x5C as an escape character versus
+ - 0x5C as the second byte in a multi-byte sequence (big5, cp932, gbk, sjis)
+
+ Parts of escaped multi-byte characters are scanned on different loop
+ iterations. See the comment about 0x5C handling in select_export::send_data()
+ in sql_class.cc.
+
+ READ_INFO::read_field() does not check wellformedness.
+ Raising wellformedness errors or warnings in READ_INFO::read_field()
+ would be wrong, as the data after unescaping can go into a BLOB field,
+ or into a TEXT/VARCHAR field of a different character set.
+ The loop below only makes sure to revert escaping made by
+ select_export::send_data() or mysql_real_escape_string().
+ Wellformedness is checked later, during Field::store(str,length,cs) time.
+
+ Note, in some cases users can supply data which did not go through
+ escaping properly. For example, utf8 "\<C3><A4>"
+ (backslash followed by LATIN SMALL LETTER A WITH DIAERESIS)
+ is improperly escaped data that could not be generated by
+ select_export::send_data() / mysql_real_escape_string():
+ - either there should be two backslashes: "\\<C3><A4>"
+ - or there should be no backslashes at all: "<C3><A4>"
+ "\<C3>" and "<A4>" are scanned on two different loop iterations and
+ store "<C3><A4>" into the field.
+
+ Note, adding useless escapes before multi-byte characters like in the
+ example above is safe in case of utf8, but is not safe in case of
+ character sets that have escape_with_backslash_is_dangerous==TRUE,
+ such as big5, cp932, gbk, sjis. This can lead to mis-interpretation of the
+ data. Suppose we have a big5 character "<EE><5C>" followed by <30> (digit 0).
+ If we add an extra escape before this sequence, then we'll get
+ <5C><EE><5C><30>. The first loop iteration will turn <5C><EE> into <EE>.
+ The second loop iteration will turn <5C><30> into <30>.
+ So the program that generates a dump file for further use with LOAD DATA
+ must make sure to use escapes properly.
+*/
+
int READ_INFO::read_field()
{
int chr,found_enclosed_char;
@@ -1510,7 +1633,8 @@ int READ_INFO::read_field()
for (;;)
{
- while ( to < end_of_buff)
+ // Make sure we have enough space for the longest multi-byte character.
+ while ( to + read_charset->mbmaxlen < end_of_buff)
{
chr = GET;
if (chr == my_b_EOF)
@@ -1598,52 +1722,27 @@ int READ_INFO::read_field()
}
}
#ifdef USE_MB
- uint ml= my_mbcharlen(read_charset, chr);
- if (ml == 0)
- {
- *to= '\0';
- my_error(ER_INVALID_CHARACTER_STRING, MYF(0),
- read_charset->csname, buffer);
- error= true;
- return 1;
- }
-
- if (ml > 1 &&
- to + ml <= end_of_buff)
- {
- uchar* p= to;
- *to++ = chr;
-
- for (uint i= 1; i < ml; i++)
- {
- chr= GET;
- if (chr == my_b_EOF)
- {
- /*
- Need to back up the bytes already ready from illformed
- multi-byte char
- */
- to-= i;
- goto found_eof;
- }
- *to++ = chr;
- }
- if (my_ismbchar(read_charset,
- (const char *)p,
- (const char *)to))
- continue;
- for (uint i= 0; i < ml; i++)
- PUSH(*--to);
- chr= GET;
- }
- else if (ml > 1)
- {
- // Buffer is too small, exit while loop, and reallocate.
- PUSH(chr);
- break;
- }
#endif
*to++ = (uchar) chr;
+#if MYSQL_VERSION_ID >= 100200
+#error This 10.0 and 10.1 specific fix should be removed in 10.2
+#else
+ if (my_mbcharlen(read_charset, (uchar) chr) > 1)
+ {
+ /*
+ A known MBHEAD found. Try to scan the full multi-byte character.
+ Otherwise, a possible following second byte 0x5C would be
+ mis-interpreted as an escape on the next iteration.
+ (Important for big5, gbk, sjis, cp932).
+ */
+ String tmp((char *) to - 1, read_charset->mbmaxlen, read_charset);
+ tmp.length(1);
+ bool eof= read_mbtail(&tmp);
+ to+= tmp.length() - 1;
+ if (eof)
+ goto found_eof;
+ }
+#endif
}
/*
** We come here if buffer is too small. Enlarge it and continue
diff --git a/sql/sql_string.h b/sql/sql_string.h
index c287f051d98..557d14a79f8 100644
--- a/sql/sql_string.h
+++ b/sql/sql_string.h
@@ -136,6 +136,7 @@ public:
inline bool is_empty() const { return (str_length == 0); }
inline void mark_as_const() { Alloced_length= 0;}
inline const char *ptr() const { return Ptr; }
+ inline const char *end() const { return Ptr + str_length; } // Pointer one past the last byte of the string data (Ptr + str_length).
inline char *c_ptr()
{
DBUG_ASSERT(!alloced || !Ptr || !Alloced_length ||