/**************************************************************************** ** ** Copyright (C) 2016 The Qt Company Ltd. ** Contact: https://www.qt.io/licensing/ ** ** This file is part of the QtXmlPatterns module of the Qt Toolkit. ** ** $QT_BEGIN_LICENSE:LGPL$ ** Commercial License Usage ** Licensees holding valid commercial Qt licenses may use this file in ** accordance with the commercial license agreement provided with the ** Software or, alternatively, in accordance with the terms contained in ** a written agreement between you and The Qt Company. For licensing terms ** and conditions see https://www.qt.io/terms-conditions. For further ** information use the contact form at https://www.qt.io/contact-us. ** ** GNU Lesser General Public License Usage ** Alternatively, this file may be used under the terms of the GNU Lesser ** General Public License version 3 as published by the Free Software ** Foundation and appearing in the file LICENSE.LGPL3 included in the ** packaging of this file. Please review the following information to ** ensure the GNU Lesser General Public License version 3 requirements ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. ** ** GNU General Public License Usage ** Alternatively, this file may be used under the terms of the GNU ** General Public License version 2.0 or (at your option) the GNU General ** Public license version 3 or any later version approved by the KDE Free ** Qt Foundation. The licenses are as published by the Free Software ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 ** included in the packaging of this file. Please review the following ** information to ensure the GNU General Public License requirements will ** be met: https://www.gnu.org/licenses/gpl-2.0.html and ** https://www.gnu.org/licenses/gpl-3.0.html. ** ** $QT_END_LICENSE$ ** ****************************************************************************/ // // W A R N I N G // ------------- // // This file is not part of the Qt API. It exists purely as an // implementation detail. This header file may change from version to // version without notice, or even be removed. // // We mean it. #ifndef Patternist_XQueryTokenizer_H #define Patternist_XQueryTokenizer_H #include #include #include #include #include #include QT_BEGIN_NAMESPACE namespace QPatternist { struct TokenMap; /** * @short A hand-written tokenizer which tokenizes XQuery 1.0 & XPath 2.0, * and delivers tokens to the Bison generated parser. * * @author Frans Englich */ class XQueryTokenizer : public Tokenizer { public: /** * Tokenizer states. Organized alphabetically. */ enum State { AfterAxisSeparator, AposAttributeContent, Axis, Default, ElementContent, EndTag, ItemType, KindTest, KindTestForPI, NamespaceDecl, NamespaceKeyword, OccurrenceIndicator, Operator, Pragma, PragmaContent, ProcessingInstructionContent, ProcessingInstructionName, QuotAttributeContent, StartTag, VarName, XMLComment, XMLSpaceDecl, XQueryVersion }; XQueryTokenizer(const QString &query, const QUrl &location, const State startingState = Default); virtual Token nextToken(YYLTYPE *const sourceLocator); virtual int commenceScanOnly(); virtual void resumeTokenizationFrom(const int position); /** * Does nothing. */ virtual void setParserContext(const ParserContext::Ptr &parseInfo); private: /** * Returns the character corresponding to the builtin reference @p * reference. For instance, passing @c gt will give you '>' in return. * * If @p reference is an invalid character reference, a null QChar is * returned. * * @see QChar::isNull() */ QChar charForReference(const QString &reference); inline Token tokenAndChangeState(const TokenType code, const State state, const int advance = 1); inline Token tokenAndChangeState(const TokenType code, const QString &value, const State state); inline Token tokenAndAdvance(const TokenType code, const int advance = 1); QString tokenizeCharacterReference(); inline Token tokenizeStringLiteral(); inline Token tokenizeNumberLiteral(); /** * @returns the character @p length characters from the current * position. */ inline char peekAhead(const int length = 1) const; /** * @returns whether the stream, starting from @p offset from the * current position, matches @p chs. The length of @p chs is @p len. */ inline bool aheadEquals(const char *const chs, const int len, const int offset = 1) const; inline Token tokenizeNCName(); static inline bool isOperatorKeyword(const TokenType); static inline bool isDigit(const char ch); static inline Token error(); inline TokenType consumeWhitespace(); /** * @short Returns the character at the current position, converted to * @c ASCII. * * Equivalent to calling: * * @code * current().toLatin1(); * @endcode */ inline char peekCurrent() const; /** * Disregarding encoding conversion, equivalent to calling: * * @code * peekAhead(0); * @endcode */ inline const QChar current() const; /** * @p hadWhitespace is always set to a proper value. * * @returns the length of whitespace scanned before reaching "::", or * -1 if something else was found. */ int peekForColonColon() const; static inline bool isNCNameStart(const QChar ch); static inline bool isNCNameBody(const QChar ch); static inline const TokenMap *lookupKeyword(const QString &keyword); inline void popState(); inline void pushState(const State state); inline State state() const; inline void setState(const State s); static bool isTypeToken(const TokenType t); inline Token tokenizeNCNameOrQName(); /** * Advances m_pos until content is encountered. * * Returned is the length stretching from m_pos when starting, until * @p content is encountered. @p content is not included in the length. */ int scanUntil(const char *const content); /** * Same as calling: * @code * pushState(currentState()); * @endcode */ inline void pushState(); /** * Consumes only whitespace, in the traditional sense. The function exits * if non-whitespace is encountered, such as the start of a comment. * * @returns @c true if the end was reached, otherwise @c false */ inline bool consumeRawWhitespace(); /** * @short Parses comments: (: comment content :). It recurses for * parsing nested comments. * * It is assumed that the start token for the comment, "(:", has * already been parsed. * * Typically, don't call this function, but ignoreWhitespace(). * * @see XML Path Language (XPath) * 2.0, 2.6 Comments * @returns * - SUCCESS if everything went ok * - ERROR if there was an error in parsing one or more comments * - END_OF_FILE if the end was reached */ Tokenizer::TokenType consumeComment(); /** * Determines whether @p code is a keyword * that is followed by a second keyword. For instance declare * function. */ static inline bool isPhraseKeyword(const TokenType code); /** * A set of indexes into a QString, the one being passed to * normalizeEOL() whose characters shouldn't be normalized. */ typedef QSet CharacterSkips; /** * Returns @p input, normalized according to * XQuery 1.0: * An XML Query Language, A.2.3 End-of-Line Handling */ static QString normalizeEOL(const QString &input, const CharacterSkips &characterSkips); inline bool atEnd() const { return m_pos == m_length; } Token nextToken(); /** * Instead of recognizing and tokenizing embedded expressions in * direct attriute constructors, this function is essentially a mini * recursive-descent parser that has the necessary logic to recognize * embedded expressions and their potentially interfering string literals, in * order to scan to the very end of the attribute value, and return the * whole as a string. * * There is of course syntax errors this function will not detect, but * that is ok since the attributes will be parsed once more. * * An inelegant solution, but which gets the job done. * * @see commenceScanOnly(), resumeTokenizationFrom() */ Token attributeAsRaw(const QChar separator, int &stack, const int startPos, const bool inLiteral, QString &result); const QString m_data; const int m_length; State m_state; QStack m_stateStack; int m_pos; /** * The current line number. * * The line number and column number both starts at 1. */ int m_line; /** * The offset into m_length for where * the current column starts. So m_length - m_columnOffset * is the current column. * * The line number and column number both starts at 1. */ int m_columnOffset; const NamePool::Ptr m_namePool; QStack m_tokenStack; QHash m_charRefs; bool m_scanOnly; Q_DISABLE_COPY(XQueryTokenizer) }; } QT_END_NAMESPACE #endif