diff options
Diffstat (limited to 'sql/lex_state.h')
-rw-r--r-- | sql/lex_state.h | 718 |
1 files changed, 718 insertions, 0 deletions
diff --git a/sql/lex_state.h b/sql/lex_state.h new file mode 100644 index 00000000000..bde574f3530 --- /dev/null +++ b/sql/lex_state.h @@ -0,0 +1,718 @@ +/* + Copyright (c) 2009, 2022, MariaDB Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#ifndef SQL_YYSTYPE_INCLUDED +#define SQL_YYSTYPE_INCLUDED + +#include <my_global.h> +#include <m_ctype.h> +#include "mysqld.h" +#include "lock.h" +#include "mdl.h" +#include "sql_signal.h" + + +/* + The following hack is needed because yy_*.cc do not define + YYSTYPE before including this file +*/ +#ifdef MYSQL_YACC +#define LEX_YYSTYPE void * +#else +#include "lex_symbol.h" +#ifdef MYSQL_LEX +#include "item_func.h" /* Cast_target used in yy_mariadb.hh */ +#include "sql_get_diagnostics.h" /* Types used in yy_mariadb.hh */ +#include "sp_pcontext.h" +#include "yy_mariadb.hh" +#define LEX_YYSTYPE YYSTYPE * +#else +#define LEX_YYSTYPE void * +#endif +#endif + +class Lex_string_with_metadata_st; +class Lex_ident_cli_st; +struct sql_digest_state; +/** + The state of the lexical parser, when parsing comments. +*/ +enum enum_comment_state +{ + /** + Not parsing comments. + */ + NO_COMMENT, + /** + Parsing comments that need to be preserved. + Typically, these are user comments '/' '*' ... '*' '/'. + */ + PRESERVE_COMMENT, + /** + Parsing comments that need to be discarded. 
+ Typically, these are special comments '/' '*' '!' ... '*' '/', + or '/' '*' '!' 'M' 'M' 'm' 'm' 'm' ... '*' '/', where the comment + markers should not be expanded. + */ + DISCARD_COMMENT +}; + + +/** + @brief This class represents the character input stream consumed during + lexical analysis. + + In addition to consuming the input stream, this class performs some + comment pre-processing, by filtering out out-of-bound special text + from the query input stream. + Two buffers, with pointers inside each buffer, are maintained in + parallel. The 'raw' buffer is the original query text, which may + contain out-of-bound comments. The 'cpp' (for comments pre-processor) + is the pre-processed buffer that contains only the query text that + should be seen once out-of-bound data is removed. +*/ + +class Lex_input_stream +{ + size_t unescape(CHARSET_INFO *cs, char *to, + const char *str, const char *end, int sep); + my_charset_conv_wc_mb get_escape_func(THD *thd, my_wc_t sep) const; +public: + Lex_input_stream() + { + } + + ~Lex_input_stream() + { + } + + /** + Object initializer. Must be called before usage. + + @retval FALSE OK + @retval TRUE Error + */ + bool init(THD *thd, char *buff, size_t length); + + void reset(char *buff, size_t length); + + /** + The main method to scan the next token, with token contraction processing + for LALR(2) resolution, e.g. translate "WITH" followed by "ROLLUP" + to a single token WITH_ROLLUP_SYM. + */ + int lex_token(union YYSTYPE *yylval, THD *thd); + + void reduce_digest_token(uint token_left, uint token_right); + +private: + /** + Set the echo mode. + + When echo is true, characters parsed from the raw input stream are + preserved. When false, characters parsed are silently ignored. + @param echo the echo mode. 
+ */ + void set_echo(bool echo) + { + m_echo= echo; + } + + /** Save the echo mode and the comment state, for restore_in_comment_state(). */ void save_in_comment_state() + { + m_echo_saved= m_echo; + in_comment_saved= in_comment; + } + + /** Restore the echo mode and the comment state saved by save_in_comment_state(). */ void restore_in_comment_state() + { + m_echo= m_echo_saved; + in_comment= in_comment_saved; + } + + /** + Skip binary data from the input stream, copying it to the pre-processed buffer when echo is on. + @param n number of bytes to accept. + */ + void skip_binary(int n) + { + if (m_echo) + { + memcpy(m_cpp_ptr, m_ptr, n); + m_cpp_ptr += n; + } + m_ptr += n; + } + + /** + Get a character, and advance in the stream. + @return the next character to parse. + */ + unsigned char yyGet() + { + char c= *m_ptr++; + if (m_echo) + *m_cpp_ptr++ = c; + return c; + } + + /** + Get the last character accepted. + @return the last character accepted. + */ + unsigned char yyGetLast() + { + return m_ptr[-1]; + } + + /** + Look at the next character to parse, but do not accept it. + */ + unsigned char yyPeek() + { + return m_ptr[0]; + } + + /** + Look ahead at some character to parse. + @param n offset of the character to look up + */ + unsigned char yyPeekn(int n) + { + return m_ptr[n]; + } + + /** + Cancel the effect of the last yyGet() or yySkip(). + Note that the echo mode should not change between calls to yyGet / yySkip + and yyUnget. The caller is responsible for ensuring that. + */ + void yyUnget() + { + m_ptr--; + if (m_echo) + m_cpp_ptr--; + } + + /** + Accept a character, by advancing the input stream. + */ + void yySkip() + { + if (m_echo) + *m_cpp_ptr++ = *m_ptr++; + else + m_ptr++; + } + + /** + Accept multiple characters at once. + @param n the number of characters to accept. 
+ */ + void yySkipn(int n) + { + if (m_echo) + { + memcpy(m_cpp_ptr, m_ptr, n); + m_cpp_ptr += n; + } + m_ptr += n; + } + + /** + Puts a character back into the stream, canceling + the effect of the last yyGet() or yySkip(). + Note that the echo mode should not change between calls + to unput, get, or skip from the stream. 
+ */ + char *yyUnput(char ch) + { + *--m_ptr= ch; + if (m_echo) + m_cpp_ptr--; + return m_ptr; + } + + /** + End of file indicator for the query text to parse. + @param n number of characters expected + @return true if no more than n characters are left to parse + */ + bool eof(int n) + { + return ((m_ptr + n) >= m_end_of_query); + } + + /** Mark the stream position as the start of a new token, remembering the start of the previous one. */ + void start_token() + { + m_tok_start_prev= m_tok_start; + m_tok_start= m_ptr; + m_tok_end= m_ptr; + + m_cpp_tok_start_prev= m_cpp_tok_start; + m_cpp_tok_start= m_cpp_ptr; + m_cpp_tok_end= m_cpp_ptr; + } + + /** + Adjust the starting position of the current token. + This is used to compensate for starting whitespace. + */ + void restart_token() + { + m_tok_start= m_ptr; + m_cpp_tok_start= m_cpp_ptr; + } + + /** + Get the maximum length of the utf8-body buffer. + The utf8 body can grow because of the character set conversion and escaping. + */ + size_t get_body_utf8_maximum_length(THD *thd); + + /** Get the length of the current token, in the raw buffer. */ + uint yyLength() + { + /* + The assumption is that the lexical analyser is always 1 character ahead, + which the -1 accounts for. + */ + DBUG_ASSERT(m_ptr > m_tok_start); + return (uint) ((m_ptr - m_tok_start) - 1); + } + + /** + Test if a lookahead token was already scanned by lex_token(), + for LALR(2) resolution. + */ + bool has_lookahead() const + { + return lookahead_token >= 0; + } + +public: + + /** + End of file indicator for the query text to parse. + @return true if there are no more characters to parse + */ + bool eof() + { + return (m_ptr >= m_end_of_query); + } + + /** Get the raw query buffer. */ + const char *get_buf() + { + return m_buf; + } + + /** Get the pre-processed query buffer. */ + const char *get_cpp_buf() + { + return m_cpp_buf; + } + + /** Get the end of the raw query buffer. 
*/ + const char *get_end_of_query() + { + return m_end_of_query; + } + + /** Get the token start position, in the raw buffer (the previous token's start while a lookahead token is pending). */ + const char *get_tok_start() + { + return has_lookahead() ? m_tok_start_prev : m_tok_start; + } + + /** Override the token start position, in the pre-processed buffer. */ void set_cpp_tok_start(const char *pos) + { + m_cpp_tok_start= pos; + } + + /** Get the token end position, in the raw buffer. */ + const char *get_tok_end() + { + return m_tok_end; + } + + /** Get the current stream pointer, in the raw buffer. */ + const char *get_ptr() + { + return m_ptr; + } + + /** Get the token start position, in the pre-processed buffer (the previous token's start while a lookahead token is pending). */ + const char *get_cpp_tok_start() + { + return has_lookahead() ? m_cpp_tok_start_prev : m_cpp_tok_start; + } + + /** Get the token end position, in the pre-processed buffer. */ + const char *get_cpp_tok_end() + { + return m_cpp_tok_end; + } + + /** + Get the token end position in the pre-processed buffer, + with trailing spaces removed. + */ + const char *get_cpp_tok_end_rtrim() + { + const char *p; + for (p= m_cpp_tok_end; + p > m_cpp_buf && my_isspace(system_charset_info, p[-1]); + p--) + { } + return p; + } + + /** Get the current stream pointer, in the pre-processed buffer. */ + const char *get_cpp_ptr() + { + return m_cpp_ptr; + } + + /** + Get the current stream pointer, in the pre-processed buffer, + with trailing spaces removed. + */ + const char *get_cpp_ptr_rtrim() + { + const char *p; + for (p= m_cpp_ptr; + p > m_cpp_buf && my_isspace(system_charset_info, p[-1]); + p--) + { } + return p; + } + /** Get the utf8-body string. */ + const char *get_body_utf8_str() + { + return m_body_utf8; + } + + /** Get the utf8-body length. 
*/ + size_t get_body_utf8_length() + { + return (size_t) (m_body_utf8_ptr - m_body_utf8); + } + + void body_utf8_start(THD *thd, const char *begin_ptr); + void body_utf8_append(const char *ptr); + void body_utf8_append(const char *ptr, const char *end_ptr); + void body_utf8_append_ident(THD *thd, + const Lex_string_with_metadata_st *txt, + const char *end_ptr); + void body_utf8_append_escape(THD *thd, + const LEX_CSTRING *txt, + CHARSET_INFO *txt_cs, + const char *end_ptr, + my_wc_t sep); + +private: + /** + LALR(2) resolution, look ahead token. + Value of the next token to return, if any, + or -1, if no token was parsed in advance. + Note: 0 is a legal token, and represents YYEOF. + */ + int lookahead_token; + + /** LALR(2) resolution, value of the look ahead token. */ + LEX_YYSTYPE lookahead_yylval; + + bool get_text(Lex_string_with_metadata_st *to, + uint sep, int pre_skip, int post_skip); + + void add_digest_token(uint token, LEX_YYSTYPE yylval); + + bool consume_comment(int remaining_recursions_permitted); + int lex_one_token(union YYSTYPE *yylval, THD *thd); + int find_keyword(Lex_ident_cli_st *str, uint len, bool function); + LEX_CSTRING get_token(uint skip, uint length); + int scan_ident_sysvar(THD *thd, Lex_ident_cli_st *str); + int scan_ident_start(THD *thd, Lex_ident_cli_st *str); + int scan_ident_middle(THD *thd, Lex_ident_cli_st *str, + CHARSET_INFO **cs, my_lex_states *); + int scan_ident_delimited(THD *thd, Lex_ident_cli_st *str, uchar quote_char); + bool get_7bit_or_8bit_ident(THD *thd, uchar *last_char); + + /** Current thread. */ + THD *m_thd; + + /** Pointer to the current position in the raw input stream. */ + char *m_ptr; + + /** Starting position of the last token parsed, in the raw buffer. */ + const char *m_tok_start; + + /** Ending position of the previous token parsed, in the raw buffer. */ + const char *m_tok_end; + + /** End of the query text in the input stream, in the raw buffer. 
*/ + const char *m_end_of_query; + + /** Starting position of the previous token parsed, in the raw buffer. */ + const char *m_tok_start_prev; + + /** Beginning of the query text in the input stream, in the raw buffer. */ + const char *m_buf; + + /** Length of the raw buffer. */ + size_t m_buf_length; + + /** Echo the parsed stream to the pre-processed buffer. */ + bool m_echo:1; + /** Echo mode saved by save_in_comment_state(). */ bool m_echo_saved:1; + + /** Pre-processed buffer. */ + char *m_cpp_buf; + + /** Pointer to the current position in the pre-processed input stream. */ + char *m_cpp_ptr; + + /** + Starting position of the last token parsed, + in the pre-processed buffer. + */ + const char *m_cpp_tok_start; + + /** + Starting position of the previous token parsed, + in the pre-processed buffer. + */ + const char *m_cpp_tok_start_prev; + + /** + Ending position of the previous token parsed, + in the pre-processed buffer. + */ + const char *m_cpp_tok_end; + + /** UTF8-body buffer created during parsing. */ + char *m_body_utf8; + + /** Pointer to the current position in the UTF8-body buffer. */ + char *m_body_utf8_ptr; + + /** + Position in the pre-processed buffer. The query from m_cpp_buf to + m_cpp_utf8_processed_ptr is converted to UTF8-body. + */ + const char *m_cpp_utf8_processed_ptr; + +public: + + /** Current state of the lexical analyser. */ + enum my_lex_states next_state; + + /** + Position of ';' in the stream, to delimit multiple queries. + This delimiter is in the raw buffer. + */ + const char *found_semicolon; + + /** SQL_MODE = IGNORE_SPACE. */ + bool ignore_space:1; + + /** + TRUE if we're parsing a prepared statement: in this mode + we should allow placeholders. + */ + bool stmt_prepare_mode:1; + /** + TRUE if we should allow multi-statements. + */ + bool multi_statements:1; + + /** Current line number. */ + uint yylineno; + + /** + Current statement digest instrumentation. 
+ */ + sql_digest_state* m_digest; + +private: + /** State of the lexical analyser for comments. 
*/ + enum_comment_state in_comment; + /** Comment state saved by save_in_comment_state(). */ enum_comment_state in_comment_saved; + + /** + Starting position of the TEXT_STRING or IDENT in the pre-processed + buffer. + + NOTE: this member must be used within MYSQLlex() function only. + */ + const char *m_cpp_text_start; + + /** + Ending position of the TEXT_STRING or IDENT in the pre-processed + buffer. + + NOTE: this member must be used within MYSQLlex() function only. + */ + const char *m_cpp_text_end; + + /** + Character set specified by the character-set-introducer. + + NOTE: this member must be used within MYSQLlex() function only. + */ + CHARSET_INFO *m_underscore_cs; +}; + + + +/** + The internal state of the syntax parser. + This object is only available during parsing, + and is private to the syntax parser implementation (sql_yacc.yy). +*/ +class Yacc_state +{ +public: + Yacc_state() : yacc_yyss(NULL), yacc_yyvs(NULL) { reset(); } + + /** Free the dynamically allocated Bison stacks, if any, clear m_set_signal_info and restore the default lock types. */ void reset() + { + if (yacc_yyss != NULL) { + my_free(yacc_yyss); + yacc_yyss = NULL; + } + if (yacc_yyvs != NULL) { + my_free(yacc_yyvs); + yacc_yyvs = NULL; + } + m_set_signal_info.clear(); + m_lock_type= TL_READ_DEFAULT; + m_mdl_type= MDL_SHARED_READ; + } + + ~Yacc_state(); + + /** + Reset part of the state which needs resetting before parsing + a substatement. + */ + void reset_before_substatement() + { + m_lock_type= TL_READ_DEFAULT; + m_mdl_type= MDL_SHARED_READ; + } + + /** + Bison internal state stack, yyss, when dynamically allocated using + my_yyoverflow(). + */ + uchar *yacc_yyss; + + /** + Bison internal semantic value stack, yyvs, when dynamically allocated using + my_yyoverflow(). + */ + uchar *yacc_yyvs; + + /** + Fragments of parsed tree, + used during the parsing of SIGNAL and RESIGNAL. 
+ */ + Set_signal_information m_set_signal_info; + + /** + Type of lock to be used for tables being added to the statement's + table list in table_factor, table_alias_ref, single_multi and + table_wild_one rules. 
+ Statements which use these rules but require a lock type different + from the one specified by this member have to override it by using + st_select_lex::set_lock_for_tables() method. + + The default value of this member is TL_READ_DEFAULT. The only two + cases in which we change it are: + - When parsing SELECT HIGH_PRIORITY. + - Rule for DELETE. In which we use this member to pass information + about type of lock from delete to single_multi part of rule. + + We should try to avoid introducing new use cases as we would like + to get rid of this member eventually. + */ + thr_lock_type m_lock_type; + + /** + The type of requested metadata lock for tables added to + the statement table list. + */ + enum_mdl_type m_mdl_type; + + /* + TODO: move more attributes from the LEX structure here. + */ +}; + + +/** + Internal state of the parser. + The complete state consists of: + - state data used during lexical parsing, + - state data used during syntactic parsing. +*/ +class Parser_state +{ +public: + Parser_state() + : m_yacc() + {} + + /** + Object initializer. Must be called before usage. + + @retval FALSE OK + @retval TRUE Error + */ + bool init(THD *thd, char *buff, size_t length) + { + return m_lip.init(thd, buff, length); + } + + ~Parser_state() + {} + + Lex_input_stream m_lip; + Yacc_state m_yacc; + + /** + Current performance digest instrumentation. + NOTE(review): neither the constructor nor init() assigns this member. + */ + PSI_digest_locker* m_digest_psi; + + /** Reset the lexer onto the buffer that starts at found_semicolon, with the given length, and reset the parser state. 
*/ void reset(char *found_semicolon, unsigned int length) + { + m_lip.reset(found_semicolon, length); + m_yacc.reset(); + } +}; + + +extern sql_digest_state * +digest_add_token(sql_digest_state *state, uint token, LEX_YYSTYPE yylval); + +extern sql_digest_state * +digest_reduce_token(sql_digest_state *state, uint token_left, uint token_right); + +#endif // SQL_YYSTYPE_INCLUDED |