summaryrefslogtreecommitdiff
path: root/sql/lex_state.h
diff options
context:
space:
mode:
Diffstat (limited to 'sql/lex_state.h')
-rw-r--r--sql/lex_state.h718
1 files changed, 718 insertions, 0 deletions
diff --git a/sql/lex_state.h b/sql/lex_state.h
new file mode 100644
index 00000000000..bde574f3530
--- /dev/null
+++ b/sql/lex_state.h
@@ -0,0 +1,718 @@
+/*
+ Copyright (c) 2009, 2022, MariaDB Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
+
+#ifndef SQL_YYSTYPE_INCLUDED
+#define SQL_YYSTYPE_INCLUDED
+
+#include <my_global.h>
+#include <m_ctype.h>
+#include "mysqld.h"
+#include "lock.h"
+#include "mdl.h"
+#include "sql_signal.h"
+
+
+/*
+ The following hack is needed because yy_*.cc do not define
+ YYSTYPE before including this file
+*/
+#ifdef MYSQL_YACC
+#define LEX_YYSTYPE void *
+#else
+#include "lex_symbol.h"
+#ifdef MYSQL_LEX
+#include "item_func.h" /* Cast_target used in yy_mariadb.hh */
+#include "sql_get_diagnostics.h" /* Types used in yy_mariadb.hh */
+#include "sp_pcontext.h"
+#include "yy_mariadb.hh"
+#define LEX_YYSTYPE YYSTYPE *
+#else
+#define LEX_YYSTYPE void *
+#endif
+#endif
+
+class Lex_string_with_metadata_st;
+class Lex_ident_cli_st;
+struct sql_digest_state;
+/**
+ Comment-parsing state of the lexer (kept in Lex_input_stream::in_comment).
+*/
+enum enum_comment_state
+{
+ /**
+ Not parsing a comment.
+ */
+ NO_COMMENT,
+ /**
+ Parsing a comment that needs to be preserved.
+ Typically, these are user comments '/' '*' ... '*' '/'.
+ */
+ PRESERVE_COMMENT,
+ /**
+ Parsing a comment that needs to be discarded.
+ Typically, these are special comments '/' '*' '!' ... '*' '/',
+ or '/' '*' '!' 'M' 'M' 'm' 'm' 'm' ... '*' '/', where the comment
+ markers should not be expanded.
+ */
+ DISCARD_COMMENT
+};
+
+
+/**
+ @brief This class represents the character input stream consumed during
+ lexical analysis.
+
+ In addition to consuming the input stream, this class performs some
+ comment pre-processing, by filtering out out-of-band special text
+ from the query input stream.
+ Two buffers, with pointers inside each buffer, are maintained in
+ parallel. The 'raw' buffer is the original query text, which may
+ contain out-of-band comments. The 'cpp' (for comments pre-processor)
+ is the pre-processed buffer that contains only the query text that
+ should be seen once out-of-band data is removed.
+*/
+
+class Lex_input_stream
+{
+ size_t unescape(CHARSET_INFO *cs, char *to,
+ const char *str, const char *end, int sep);
+ my_charset_conv_wc_mb get_escape_func(THD *thd, my_wc_t sep) const;
+public:
+ Lex_input_stream() /* Initializes no member; call init() before usage. */
+ {
+ }
+
+ ~Lex_input_stream()
+ {
+ }
+
+ /**
+ Object initializer. Must be called before usage.
+
+ @retval FALSE OK
+ @retval TRUE Error
+ */
+ bool init(THD *thd, char *buff, size_t length);
+
+ void reset(char *buff, size_t length);
+
+ /**
+ The main method to scan the next token, with token contraction processing
+ for LALR(2) resolution, e.g. translate "WITH" followed by "ROLLUP"
+ to a single token WITH_ROLLUP_SYM.
+ */
+ int lex_token(union YYSTYPE *yylval, THD *thd);
+
+ void reduce_digest_token(uint token_left, uint token_right);
+
+private:
+ /**
+ Set the echo mode.
+
+ When echo is true, characters parsed from the raw input stream are
+ copied to the pre-processed buffer. When false, they are silently ignored.
+ @param echo the echo mode.
+ */
+ void set_echo(bool echo)
+ {
+ m_echo= echo;
+ }
+
+ void save_in_comment_state() /* Remember the echo mode and comment state. */
+ {
+ m_echo_saved= m_echo;
+ in_comment_saved= in_comment;
+ }
+
+ void restore_in_comment_state() /* Restore what save_in_comment_state() stored. */
+ {
+ m_echo= m_echo_saved;
+ in_comment= in_comment_saved;
+ }
+
+ /**
+ Accept binary data from the input stream, echoing it when echo is on.
+ @param n number of bytes to accept.
+ */
+ void skip_binary(int n)
+ {
+ if (m_echo)
+ {
+ memcpy(m_cpp_ptr, m_ptr, n);
+ m_cpp_ptr += n;
+ }
+ m_ptr += n;
+ }
+
+ /**
+ Get a character, and advance in the stream.
+ @return the character that was just consumed.
+ */
+ unsigned char yyGet()
+ {
+ char c= *m_ptr++;
+ if (m_echo)
+ *m_cpp_ptr++ = c;
+ return c;
+ }
+
+ /**
+ Get the last character accepted.
+ @return the last character accepted.
+ */
+ unsigned char yyGetLast()
+ {
+ return m_ptr[-1];
+ }
+
+ /**
+ Look at the next character to parse, but do not accept it.
+ */
+ unsigned char yyPeek()
+ {
+ return m_ptr[0];
+ }
+
+ /**
+ Look ahead at some character to parse.
+ @param n offset of the character to look up
+ */
+ unsigned char yyPeekn(int n)
+ {
+ return m_ptr[n];
+ }
+
+ /**
+ Cancel the effect of the last yyGet() or yySkip().
+ Note that the echo mode should not change between calls to yyGet / yySkip
+ and yyUnget. The caller is responsible for ensuring that.
+ */
+ void yyUnget()
+ {
+ m_ptr--;
+ if (m_echo)
+ m_cpp_ptr--;
+ }
+
+ /**
+ Accept a character, by advancing the input stream.
+ */
+ void yySkip()
+ {
+ if (m_echo)
+ *m_cpp_ptr++ = *m_ptr++;
+ else
+ m_ptr++;
+ }
+
+ /**
+ Accept multiple characters at once.
+ @param n the number of characters to accept.
+ */
+ void yySkipn(int n)
+ {
+ if (m_echo)
+ {
+ memcpy(m_cpp_ptr, m_ptr, n);
+ m_cpp_ptr += n;
+ }
+ m_ptr += n;
+ }
+
+ /**
+ Puts ch back into the raw buffer, canceling the effect of the last
+ yyGet() or yySkip(), and returns the new raw stream position.
+ Note that the echo mode should not change between calls
+ to unput, get, or skip from the stream.
+ */
+ char *yyUnput(char ch)
+ {
+ *--m_ptr= ch;
+ if (m_echo)
+ m_cpp_ptr--;
+ return m_ptr;
+ }
+
+ /**
+ End of file indicator for the query text to parse.
+ @param n number of characters expected
+ @return true if at most n characters remain (m_ptr + n >= m_end_of_query)
+ */
+ bool eof(int n)
+ {
+ return ((m_ptr + n) >= m_end_of_query);
+ }
+
+ /** Mark the stream position as the start of a new token. */
+ void start_token()
+ {
+ m_tok_start_prev= m_tok_start;
+ m_tok_start= m_ptr;
+ m_tok_end= m_ptr;
+
+ m_cpp_tok_start_prev= m_cpp_tok_start;
+ m_cpp_tok_start= m_cpp_ptr;
+ m_cpp_tok_end= m_cpp_ptr;
+ }
+
+ /**
+ Adjust the starting position of the current token.
+ This is used to compensate for starting whitespace.
+ */
+ void restart_token()
+ {
+ m_tok_start= m_ptr;
+ m_cpp_tok_start= m_cpp_ptr;
+ }
+
+ /**
+ Get the maximum length of the utf8-body buffer.
+ The utf8 body can grow because of the character set conversion and escaping.
+ */
+ size_t get_body_utf8_maximum_length(THD *thd);
+
+ /** Get the length of the current token, in the raw buffer. */
+ uint yyLength()
+ {
+ /*
+ The assumption is that the lexical analyser is always 1 character ahead,
+ which the -1 accounts for.
+ */
+ DBUG_ASSERT(m_ptr > m_tok_start);
+ return (uint) ((m_ptr - m_tok_start) - 1);
+ }
+
+ /**
+ Test if a lookahead token was already scanned by lex_token(),
+ for LALR(2) resolution.
+ */
+ bool has_lookahead() const
+ {
+ return lookahead_token >= 0;
+ }
+
+public:
+
+ /**
+ End of file indicator for the query text to parse.
+ @return true if there are no more characters to parse
+ */
+ bool eof()
+ {
+ return (m_ptr >= m_end_of_query);
+ }
+
+ /** Get the raw query buffer. */
+ const char *get_buf()
+ {
+ return m_buf;
+ }
+
+ /** Get the pre-processed query buffer. */
+ const char *get_cpp_buf()
+ {
+ return m_cpp_buf;
+ }
+
+ /** Get the end of the raw query buffer. */
+ const char *get_end_of_query()
+ {
+ return m_end_of_query;
+ }
+
+ /** Get the raw-buffer token start; the previous one if a lookahead was scanned. */
+ const char *get_tok_start()
+ {
+ return has_lookahead() ? m_tok_start_prev : m_tok_start;
+ }
+
+ void set_cpp_tok_start(const char *pos) /* Override the pre-processed token start. */
+ {
+ m_cpp_tok_start= pos;
+ }
+
+ /** Get the token end position, in the raw buffer. */
+ const char *get_tok_end()
+ {
+ return m_tok_end;
+ }
+
+ /** Get the current stream pointer, in the raw buffer. */
+ const char *get_ptr()
+ {
+ return m_ptr;
+ }
+
+ /** Get the pre-processed token start; the previous one if a lookahead was scanned. */
+ const char *get_cpp_tok_start()
+ {
+ return has_lookahead() ? m_cpp_tok_start_prev : m_cpp_tok_start;
+ }
+
+ /** Get the token end position, in the pre-processed buffer. */
+ const char *get_cpp_tok_end()
+ {
+ return m_cpp_tok_end;
+ }
+
+ /**
+ Get the token end position in the pre-processed buffer,
+ with trailing spaces removed.
+ */
+ const char *get_cpp_tok_end_rtrim()
+ {
+ const char *p;
+ for (p= m_cpp_tok_end;
+ p > m_cpp_buf && my_isspace(system_charset_info, p[-1]);
+ p--)
+ { }
+ return p;
+ }
+
+ /** Get the current stream pointer, in the pre-processed buffer. */
+ const char *get_cpp_ptr()
+ {
+ return m_cpp_ptr;
+ }
+
+ /**
+ Get the current stream pointer, in the pre-processed buffer,
+ with trailing spaces removed.
+ */
+ const char *get_cpp_ptr_rtrim()
+ {
+ const char *p;
+ for (p= m_cpp_ptr;
+ p > m_cpp_buf && my_isspace(system_charset_info, p[-1]);
+ p--)
+ { }
+ return p;
+ }
+ /** Get the utf8-body string. */
+ const char *get_body_utf8_str()
+ {
+ return m_body_utf8;
+ }
+
+ /** Get the utf8-body length. */
+ size_t get_body_utf8_length()
+ {
+ return (size_t) (m_body_utf8_ptr - m_body_utf8);
+ }
+
+ void body_utf8_start(THD *thd, const char *begin_ptr);
+ void body_utf8_append(const char *ptr);
+ void body_utf8_append(const char *ptr, const char *end_ptr);
+ void body_utf8_append_ident(THD *thd,
+ const Lex_string_with_metadata_st *txt,
+ const char *end_ptr);
+ void body_utf8_append_escape(THD *thd,
+ const LEX_CSTRING *txt,
+ CHARSET_INFO *txt_cs,
+ const char *end_ptr,
+ my_wc_t sep);
+
+private:
+ /**
+ LALR(2) resolution, look ahead token.
+ Value of the next token to return, if any,
+ or -1, if no token was parsed in advance.
+ Note: 0 is a legal token, and represents YYEOF.
+ */
+ int lookahead_token;
+
+ /** LALR(2) resolution, value of the look ahead token.*/
+ LEX_YYSTYPE lookahead_yylval;
+
+ bool get_text(Lex_string_with_metadata_st *to,
+ uint sep, int pre_skip, int post_skip);
+
+ void add_digest_token(uint token, LEX_YYSTYPE yylval);
+
+ bool consume_comment(int remaining_recursions_permitted);
+ int lex_one_token(union YYSTYPE *yylval, THD *thd);
+ int find_keyword(Lex_ident_cli_st *str, uint len, bool function);
+ LEX_CSTRING get_token(uint skip, uint length);
+ int scan_ident_sysvar(THD *thd, Lex_ident_cli_st *str);
+ int scan_ident_start(THD *thd, Lex_ident_cli_st *str);
+ int scan_ident_middle(THD *thd, Lex_ident_cli_st *str,
+ CHARSET_INFO **cs, my_lex_states *);
+ int scan_ident_delimited(THD *thd, Lex_ident_cli_st *str, uchar quote_char);
+ bool get_7bit_or_8bit_ident(THD *thd, uchar *last_char);
+
+ /** Current thread. */
+ THD *m_thd;
+
+ /** Pointer to the current position in the raw input stream. */
+ char *m_ptr;
+
+ /** Starting position of the last token parsed, in the raw buffer. */
+ const char *m_tok_start;
+
+ /** Ending position of the previous token parsed, in the raw buffer. */
+ const char *m_tok_end;
+
+ /** End of the query text in the input stream, in the raw buffer. */
+ const char *m_end_of_query;
+
+ /** Starting position of the previous token parsed, in the raw buffer. */
+ const char *m_tok_start_prev;
+
+ /** Beginning of the query text in the input stream, in the raw buffer. */
+ const char *m_buf;
+
+ /** Length of the raw buffer. */
+ size_t m_buf_length;
+
+ /** Echo the parsed stream to the pre-processed buffer (and its saved copy). */
+ bool m_echo:1;
+ bool m_echo_saved:1;
+
+ /** Pre-processed buffer. */
+ char *m_cpp_buf;
+
+ /** Pointer to the current position in the pre-processed input stream. */
+ char *m_cpp_ptr;
+
+ /**
+ Starting position of the last token parsed,
+ in the pre-processed buffer.
+ */
+ const char *m_cpp_tok_start;
+
+ /**
+ Starting position of the previous token parsed,
+ in the pre-processed buffer.
+ */
+ const char *m_cpp_tok_start_prev;
+
+ /**
+ Ending position of the previous token parsed,
+ in the pre-processed buffer.
+ */
+ const char *m_cpp_tok_end;
+
+ /** UTF8-body buffer created during parsing. */
+ char *m_body_utf8;
+
+ /** Pointer to the current position in the UTF8-body buffer. */
+ char *m_body_utf8_ptr;
+
+ /**
+ Position in the pre-processed buffer. The query from m_cpp_buf to
+ m_cpp_utf8_processed_ptr is converted to UTF8-body.
+ */
+ const char *m_cpp_utf8_processed_ptr;
+
+public:
+
+ /** Current state of the lexical analyser. */
+ enum my_lex_states next_state;
+
+ /**
+ Position of ';' in the stream, to delimit multiple queries.
+ This delimiter is in the raw buffer.
+ */
+ const char *found_semicolon;
+
+ /** SQL_MODE = IGNORE_SPACE. */
+ bool ignore_space:1;
+
+ /**
+ TRUE if we're parsing a prepared statement: in this mode
+ we should allow placeholders.
+ */
+ bool stmt_prepare_mode:1;
+ /**
+ TRUE if we should allow multi-statements.
+ */
+ bool multi_statements:1;
+
+ /** Current line number. */
+ uint yylineno;
+
+ /**
+ Current statement digest instrumentation.
+ */
+ sql_digest_state* m_digest;
+
+private:
+ /** State of the lexical analyser for comments (and its saved copy). */
+ enum_comment_state in_comment;
+ enum_comment_state in_comment_saved;
+
+ /**
+ Starting position of the TEXT_STRING or IDENT in the pre-processed
+ buffer.
+
+ NOTE: this member must be used within MYSQLlex() function only.
+ */
+ const char *m_cpp_text_start;
+
+ /**
+ Ending position of the TEXT_STRING or IDENT in the pre-processed
+ buffer.
+
+ NOTE: this member must be used within MYSQLlex() function only.
+ */
+ const char *m_cpp_text_end;
+
+ /**
+ Character set specified by the character-set-introducer.
+
+ NOTE: this member must be used within MYSQLlex() function only.
+ */
+ CHARSET_INFO *m_underscore_cs;
+};
+
+
+
+/**
+  State owned by the syntax parser.
+  Instances exist only while a statement is being parsed and are meant
+  to be used solely by the syntax parser implementation (sql_yacc.yy).
+*/
+class Yacc_state
+{
+public:
+  Yacc_state() : yacc_yyss(NULL), yacc_yyvs(NULL)
+  {
+    reset();
+  }
+
+  /**
+    Free both Bison stacks if they were allocated, clear the
+    SIGNAL/RESIGNAL information and restore the default lock types.
+  */
+  void reset()
+  {
+    if (yacc_yyss)
+    {
+      my_free(yacc_yyss);
+      yacc_yyss= NULL;
+    }
+    if (yacc_yyvs)
+    {
+      my_free(yacc_yyvs);
+      yacc_yyvs= NULL;
+    }
+    m_set_signal_info.clear();
+    reset_before_substatement();
+  }
+
+  ~Yacc_state();
+
+  /**
+    Restore the subset of the state that has to be reset before
+    a substatement is parsed: the default lock types.
+  */
+  void reset_before_substatement()
+  {
+    m_lock_type= TL_READ_DEFAULT;
+    m_mdl_type= MDL_SHARED_READ;
+  }
+
+  /**
+    State stack of Bison (yyss), for the case where it was allocated
+    dynamically by my_yyoverflow().
+  */
+  uchar *yacc_yyss;
+
+  /**
+    Semantic value stack of Bison (yyvs), for the case where it was
+    allocated dynamically by my_yyoverflow().
+  */
+  uchar *yacc_yyvs;
+
+  /** Parse tree fragments collected while parsing SIGNAL and RESIGNAL. */
+  Set_signal_information m_set_signal_info;
+
+  /**
+    Lock type applied to tables that the table_factor, table_alias_ref,
+    single_multi and table_wild_one rules add to the statement's table list.
+    A statement that uses those rules but needs another lock type must
+    override this value through st_select_lex::set_lock_for_tables().
+
+    Defaults to TL_READ_DEFAULT and is changed in two cases only:
+    - while parsing SELECT HIGH_PRIORITY;
+    - in the DELETE rule, which uses it to hand the lock type over from
+      delete to the single_multi part of the rule.
+
+    New uses should be avoided: the intention is to eventually remove
+    this member.
+  */
+  thr_lock_type m_lock_type;
+
+  /**
+    Metadata lock type requested for tables that are added to
+    the statement table list.
+  */
+  enum_mdl_type m_mdl_type;
+
+  /* TODO: move more attributes from the LEX structure here. */
+};
+
+
+/**
+  Internal state of the parser.
+  The complete state consists of:
+  - state data used during lexical parsing (m_lip),
+  - state data used during syntactic parsing (m_yacc).
+*/
+class Parser_state
+{
+public:
+  Parser_state()
+    : m_yacc(), m_digest_psi(NULL)
+  {}
+
+  /**
+    Object initializer. Must be called before usage.
+
+    @retval FALSE OK
+    @retval TRUE  Error
+  */
+  bool init(THD *thd, char *buff, size_t length)
+  {
+    return m_lip.init(thd, buff, length);
+  }
+
+  ~Parser_state() {}
+
+  Lex_input_stream m_lip;
+  Yacc_state m_yacc;
+
+  /** Current performance digest instrumentation; NULL until assigned. */
+  PSI_digest_locker* m_digest_psi;
+
+  /**
+    Reset the lexer on the buffer (found_semicolon, length) and reset m_yacc.
+  */
+  void reset(char *found_semicolon, size_t length)
+  {
+    m_lip.reset(found_semicolon, length);  /* size_t: length is not truncated */
+    m_yacc.reset();
+  }
+};
+
+
+extern sql_digest_state *
+digest_add_token(sql_digest_state *state, uint token, LEX_YYSTYPE yylval);
+
+extern sql_digest_state *
+digest_reduce_token(sql_digest_state *state, uint token_left, uint token_right);
+
+#endif // SQL_YYSTYPE_INCLUDED