diff options
author | Qt by Nokia <qt-info@nokia.com> | 2011-04-27 12:05:43 +0200 |
---|---|---|
committer | axis <qt-info@nokia.com> | 2011-04-27 12:05:43 +0200 |
commit | e1b2c9deb5943faae2b29be6a5c006f75bb73f06 (patch) | |
tree | fc79e45367c0a8fc71185e9afc33f7503a58653c /src/xmlpatterns/parser/qxquerytokenizer.cpp | |
download | qtxmlpatterns-e1b2c9deb5943faae2b29be6a5c006f75bb73f06.tar.gz |
Initial import from the monolithic Qt.
This is the beginning of revision history for this module. If you
want to look at revision history older than this, please refer to the
Qt Git wiki for how to use Git history grafting. At the time of
writing, this wiki is located here:
http://qt.gitorious.org/qt/pages/GitIntroductionWithQt
If you have already performed the grafting and you don't see any
history beyond this commit, try running "git log" with the "--follow"
argument.
Branched from the monolithic repo, Qt master branch, at commit
896db169ea224deb96c59ce8af800d019de63f12
Diffstat (limited to 'src/xmlpatterns/parser/qxquerytokenizer.cpp')
-rw-r--r-- | src/xmlpatterns/parser/qxquerytokenizer.cpp | 2249 |
1 files changed, 2249 insertions, 0 deletions
diff --git a/src/xmlpatterns/parser/qxquerytokenizer.cpp b/src/xmlpatterns/parser/qxquerytokenizer.cpp new file mode 100644 index 0000000..13f5a6e --- /dev/null +++ b/src/xmlpatterns/parser/qxquerytokenizer.cpp @@ -0,0 +1,2249 @@ +/**************************************************************************** +** +** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies). +** All rights reserved. +** Contact: Nokia Corporation (qt-info@nokia.com) +** +** This file is part of the QtXmlPatterns module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the Technology Preview License Agreement accompanying +** this package. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain additional +** rights. These rights are described in the Nokia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** If you have questions regarding the use of this file, please contact +** Nokia at qt-info@nokia.com. +** +** +** +** +** +** +** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include <QByteArray> + +#include "qquerytransformparser_p.h" + +#include "qxquerytokenizer_p.h" + +#include "qtokenlookup.cpp" + +QT_BEGIN_NAMESPACE + +namespace QPatternist +{ + +#define handleWhitespace() \ +{ \ + const TokenType t = consumeWhitespace(); \ + if(t != SUCCESS) \ + return Token(t); \ +} + +XQueryTokenizer::XQueryTokenizer(const QString &query, + const QUrl &location, + const State startingState) : Tokenizer(location) + , m_data(query) + , m_length(query.length()) + , m_state(startingState) + , m_pos(0) + , m_line(1) + , m_columnOffset(0) + , m_scanOnly(false) +{ + Q_ASSERT(location.isValid() || location.isEmpty()); +} + +const QChar XQueryTokenizer::current() const +{ + if(m_pos < m_length) + return m_data.at(m_pos); + else + return QChar(); +} + +char XQueryTokenizer::peekCurrent() const +{ + return current().toAscii(); +} + +int XQueryTokenizer::peekForColonColon() const +{ + /* Note, we don't modify m_pos in this function, so we need to do offset + * calculations. */ + int pos = m_pos; + + while(pos < m_length) + { + switch(m_data.at(pos).toAscii()) + { + /* Fallthrough these four. */ + case ' ': + case '\t': + case '\n': + case '\r': + break; + case ':': + { + if(peekAhead((pos - m_pos) + 1) == ':') + return pos - m_pos; + /* Fallthrough. */ + } + default: + return -1; + } + ++pos; + } + + return -1; +} + +Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code, + const State s, + const int advance) +{ + Q_ASSERT(advance >= 0); + m_pos += advance; + setState(s); + return Token(code); +} + +Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code, + const QString &value, + const State s) +{ + setState(s); + return Token(code, value); +} + +Tokenizer::Token XQueryTokenizer::tokenAndAdvance(const TokenType code, + const int advance) +{ + Q_ASSERT(advance >= 0); + m_pos += advance; + return Token(code); +} + +QString XQueryTokenizer::normalizeEOL(const QString &input, + const CharacterSkips &characterSkips) +{ + const int len = input.count(); + QString result; + + /* The likely hood is rather high it'll be the same content. */ + result.reserve(len); + + for(int i = 0; i < len; ++i) + { + const QChar &at = input.at(i); + + if(characterSkips.contains(i)) + { + result.append(at); + continue; + } + switch(input.at(i).unicode()) + { + case '\r': + { + if(i + 1 < len && input.at(i + 1) == QLatin1Char('\n')) + ++i; + + /* Else, fallthrough. */ + } + case '\n': + { + result.append(QLatin1Char('\n')); + continue; + } + default: + { + result.append(at); + } + } + } + + return result; +} + +Tokenizer::TokenType XQueryTokenizer::consumeComment() +{ + /* Below, we return ERROR instead of END_OF_FILE such that the parser + * sees an invalid comment. */ + while(m_pos < m_length) + { + switch(peekCurrent()) + { + case ':': + { + ++m_pos; /* Consume ':' */ + if(atEnd()) + return ERROR; + + if(peekCurrent() == ')') + { + ++m_pos; /* Consume ')' */ + return SUCCESS; /* The comment closed nicely. */ + } + continue; /* We don't want to increment m_pos twice. */ + } + case '(': + { /* It looks like the start of a comment. */ + ++m_pos; + + if(atEnd()) + return END_OF_FILE; + else if(peekCurrent() == ':') + { + /* And it is a nested comment -- parse it. */ + const TokenType retval = consumeComment(); + if(retval == SUCCESS) + continue; /* Continue with our "own" comment. */ + else + return retval; /* Return the error in the nested comment. */ + } + break; + } + case '\n': + /* Fallthrough. */ + case '\r': + { + /* We want to count \r\n as a single line break. */ + if(peekAhead() == '\n') + ++m_pos; + + m_columnOffset = m_pos; + ++m_line; + + break; + } + } + ++m_pos; + } + + return ERROR; /* Error: we reached the end while inside a comment. */ +} + +bool XQueryTokenizer::consumeRawWhitespace() +{ + while(m_pos < m_length) + { + switch(peekCurrent()) + { + case ' ': + case '\t': + break; + case '\n': + case '\r': + { + if(peekAhead() == '\n') + ++m_pos; + + m_columnOffset = m_pos; + ++m_line; + + break; + } + default: + return false; + } + ++m_pos; + } + return true; +} + +Tokenizer::TokenType XQueryTokenizer::consumeWhitespace() +{ + while(m_pos < m_length) + { + switch(peekCurrent()) + { + case ' ': + case '\t': + break; + case '\n': + case '\r': + { + /* We want to count \r\n as a single line break. */ + if(peekAhead() == '\n') + ++m_pos; + + m_columnOffset = m_pos; + ++m_line; + + break; + } + case '(': + { + if(peekAhead() == ':') + { + m_pos += 2; /* Consume "(:" */ + + const TokenType comment = consumeComment(); + if(comment == SUCCESS) + continue; + else + return comment; + } + } + default: + return SUCCESS; + } + ++m_pos; + } + + return END_OF_FILE; +} + +char XQueryTokenizer::peekAhead(const int length) const +{ + if(m_pos + length < m_length) + return m_data.at(m_pos + length).toAscii(); + else + return 0; +} + +Tokenizer::Token XQueryTokenizer::error() +{ + return Token(ERROR); +} + +bool XQueryTokenizer::isDigit(const char ch) +{ + return ch >= '0' && ch <= '9'; +} + +/* Replace with function in QXmlUtils. Write test cases for this. */ +bool XQueryTokenizer::isNCNameStart(const QChar ch) +{ + if(ch == QLatin1Char('_')) + return true; + + switch(ch.category()) + { + case QChar::Letter_Lowercase: + case QChar::Letter_Uppercase: + case QChar::Letter_Other: + case QChar::Letter_Titlecase: + case QChar::Number_Letter: + return true; + default: + return false; + } +} + +bool XQueryTokenizer::isNCNameBody(const QChar ch) +{ + switch(ch.unicode()) + { + case '.': + case '_': + case '-': + return true; + } + + switch(ch.category()) + { + case QChar::Letter_Lowercase: + case QChar::Letter_Uppercase: + case QChar::Letter_Other: + case QChar::Letter_Titlecase: + case QChar::Number_Letter: + case QChar::Mark_SpacingCombining: + case QChar::Mark_Enclosing: + case QChar::Mark_NonSpacing: + case QChar::Letter_Modifier: + case QChar::Number_DecimalDigit: + return true; + default: + return false; + } +} + +bool XQueryTokenizer::isPhraseKeyword(const TokenType code) +{ + switch(code) + { + /* Fallthrough all these. */ + case CASTABLE: + case CAST: + case COPY_NAMESPACES: + case DECLARE: + case EMPTY: + case MODULE: + case IMPORT: + case INSTANCE: + case ORDER: + case ORDERING: + case XQUERY: + case STABLE: + case TREAT: + return true; + default: + return false; + } +} + +bool XQueryTokenizer::isOperatorKeyword(const TokenType code) +{ + switch(code) + { + /* Fallthrough all these. */ + case AS: + case ASCENDING: + case AT: + case CASE: + case CAST: + case CASTABLE: + case EQ: + case EXTERNAL: + case GE: + case G_EQ: + case G_GT: + case G_LT: + case G_NE: + case GT: + case IN: + case INHERIT: + case INSTANCE: + case IS: + case ITEM: + case LE: + case LT: + case NE: + case NO_INHERIT: + case NO_PRESERVE: + case OF: + case PRESERVE: + case RETURN: + case STABLE: + case TO: + case TREAT: + return true; + default: + return false; + }; +} + +bool XQueryTokenizer::isTypeToken(const TokenType t) +{ + switch(t) + { + /* Fallthrough all these. */ + case ATTRIBUTE: + case COMMENT: + case DOCUMENT: + case DOCUMENT_NODE: + case ELEMENT: + case ITEM: + case NODE: + case PROCESSING_INSTRUCTION: + case SCHEMA_ATTRIBUTE: + case SCHEMA_ELEMENT: + case TEXT: + return true; + default: + return false; + } +} + +Tokenizer::Token XQueryTokenizer::tokenizeNCNameOrQName() +{ + const int start = m_pos; + + const Token t1 = tokenizeNCName(); + if(t1.hasError()) + return t1; + + if(peekCurrent() != ':' || peekAhead() == '=') + return t1; + + ++m_pos; + + const Token t2 = tokenizeNCName(); + if(t2.hasError()) + return t2; + else + return Token(QNAME, m_data.mid(start, m_pos - start)); +} + +Tokenizer::Token XQueryTokenizer::tokenizeNumberLiteral() +{ + setState(Operator); + const int startPos = m_pos; + bool hasDot = false; + bool isXPath20 = false; + + for(; m_pos < m_length; ++m_pos) + { + QChar ch(current()); + + char cell = ch.cell(); + + if(cell == 'e' || cell == 'E') + { + isXPath20 = true; + ++m_pos; + ch = current(); + + if(ch.row() != 0) + break; + + cell = ch.cell(); + + if(cell == '+' || cell == '-') + continue; + } + + if(isNCNameStart(ch)) + return error(); + + if(cell < '0' || cell > '9') + { + if(cell == '.' && !hasDot) + hasDot = true; + else + break; + } + } + + return Token(isXPath20 ? XPATH2_NUMBER : NUMBER, m_data.mid(startPos, m_pos - startPos)); +} + +QString XQueryTokenizer::tokenizeCharacterReference() +{ + Q_ASSERT(peekCurrent() == '&'); + + const int theEnd = m_data.indexOf(QLatin1Char(';'), m_pos + 1); + + if(theEnd == -1) /* No ';' found, a syntax error. i18n. */ + return QString(); + + QString content(m_data.mid(m_pos + 1, (theEnd - m_pos) - 1)); + m_pos = theEnd; + + const QChar charRef(charForReference(content)); + + if(!charRef.isNull()) + return charRef; + else if(content.startsWith(QLatin1Char('#'))) + { + int base; + + /* It is only '#' or '#x'. */ + if(content.length() < 2) + return QString(); + + /* We got a hex number if it starts with 'x', otherwise it's a decimal. */ + if(content.at(1) == QLatin1Char('x')) + { + base = 16; + content = content.mid(2); /* Remove "#x". */ + } + else + { + base = 10; + content = content.mid(1); /* Remove "#". */ + } + + bool conversionOK = false; + const int codepoint = content.toInt(&conversionOK, base); + + if(conversionOK) + { + const QChar ch(codepoint); + + if(ch.isNull()) + { + /* We likely have something which require surrogate pairs. */ + QString result; + result += QChar(QChar::highSurrogate(codepoint)); + result += QChar(QChar::lowSurrogate(codepoint)); + return result; + } + else + return ch; + } + else + return QString(); + } + else + return QString(); +} + +int XQueryTokenizer::scanUntil(const char *const content) +{ + const int end = m_data.indexOf(QString::fromLatin1(content), m_pos); + + if(end == -1) + return -1; + else + { + const int len = end - m_pos; + m_pos += len; + return len; + } +} + +QChar XQueryTokenizer::charForReference(const QString &reference) +{ + if(m_charRefs.isEmpty()) + { + /* Initialize. */ + m_charRefs.reserve(5); + m_charRefs.insert(QLatin1String("lt"), QLatin1Char('<')); + m_charRefs.insert(QLatin1String("gt"), QLatin1Char('>')); + m_charRefs.insert(QLatin1String("amp"), QLatin1Char('&')); + m_charRefs.insert(QLatin1String("quot"), QLatin1Char('"')); + m_charRefs.insert(QLatin1String("apos"), QLatin1Char('\'')); + } + + return m_charRefs.value(reference); +} + +Tokenizer::Token XQueryTokenizer::tokenizeStringLiteral() +{ + const QChar delimiter(current()); + /* We cannot unfortunately just scan and then do mid(), + * since we can encounter character references. */ + QString result; + + /* This is more likely than QString's default allocation. */ + result.reserve(8); + + CharacterSkips skipEOLNormalization; + + /* Advance over the initial quote character. */ + ++m_pos; + + for(; m_pos < m_length; ++m_pos) + { + const QChar c(current()); + + if(c == QLatin1Char('&')) + { + const QString charRef(tokenizeCharacterReference()); + + if(charRef.isNull()) + return error(); + else + { + skipEOLNormalization.insert(result.count()); + result.append(charRef); + } + + } + else if(c == delimiter) + { + /* Maybe the escaping mechanism is used. For instance, "s""s" + * has the value `s"s'. */ + ++m_pos; + + if(current() == delimiter) /* Double quote. */ + result += delimiter; + else + return Token(STRING_LITERAL, normalizeEOL(result, skipEOLNormalization)); + } + else + result += c; + } + + return error(); +} + +Tokenizer::Token XQueryTokenizer::tokenizeNCName() +{ + const int startPos = m_pos; + + if(m_pos < m_length && isNCNameStart(current())) + { + ++m_pos; + + for(; m_pos < m_length; ++m_pos) + { + if(!isNCNameBody(current())) + break; + } + + return Token(NCNAME, m_data.mid(startPos, m_pos - startPos)); + } + else + return error(); +} + +bool XQueryTokenizer::aheadEquals(const char *const chs, + const int len, + const int offset) const +{ + Q_ASSERT(len > 0); + Q_ASSERT(qstrlen(chs) == uint(len)); + + if(m_pos + len >= m_length) + return false; + + for(int i = offset; i < (len + offset); ++i) + { + if(m_data.at(m_pos + i).toAscii() != chs[i - offset]) + return false; + } + + return true; +} + +const TokenMap *XQueryTokenizer::lookupKeyword(const QString &keyword) +{ + return TokenLookup::value(keyword.toAscii().constData(), keyword.length()); +} + +XQueryTokenizer::State XQueryTokenizer::state() const +{ + return m_state; +} + +void XQueryTokenizer::setState(const State s) +{ + m_state = s; +} + +void XQueryTokenizer::pushState(const State s) +{ + m_stateStack.push(s); +} + +void XQueryTokenizer::pushState() +{ + m_stateStack.push(m_state); +} + +void XQueryTokenizer::popState() +{ + /* QStack::pop() asserts if it's empty, so we need to check + * it, since we might receive unbalanced curlies. */ + if(!m_stateStack.isEmpty()) + m_state = m_stateStack.pop(); +} + +Tokenizer::Token XQueryTokenizer::nextToken() +{ + switch(state()) + { + /* We want to skip or do special whitespace handling for these + * states. So fallthrough all of the following. */ + case AposAttributeContent: + case Axis: + case ElementContent: + case EndTag: + case Pragma: + case PragmaContent: + case ProcessingInstructionName: + case QuotAttributeContent: + case StartTag: + case XMLComment: + break; + default: + handleWhitespace(); + } + + switch(state()) + { + case XMLSpaceDecl: + /* Fallthrough. */ + case NamespaceKeyword: + { + switch(peekCurrent()) + { + case ',': + return tokenAndAdvance(COMMA); + case '"': + /* Fallthrough. */ + case '\'': + { + setState(NamespaceDecl); + return tokenizeStringLiteral(); + } + } + + const Token id(tokenizeNCName()); + + if(id.type != NCNAME) + return id; + + const TokenMap *const keyword = lookupKeyword(id.value); + if(keyword) + { + switch(keyword->token) + { + case INHERIT: + /* Fallthrough. */ + case NO_INHERIT: + { + setState(Default); + break; + } + case NAMESPACE: + { + setState(NamespaceDecl); + break; + } + case ORDERED: + /* Fallthrough. */ + case UNORDERED: + /* Fallthrough. */ + case STRIP: + { + setState(Default); + break; + } + case PRESERVE: + { + if(state() != NamespaceKeyword) + setState(Default); + } + default: + break; + } + + return Token(keyword->token); + } + else + return id; + + Q_ASSERT(false); + } + case NamespaceDecl: + { + switch(peekCurrent()) + { + case '=': + return tokenAndAdvance(G_EQ); + case ';': + return tokenAndChangeState(SEMI_COLON, Default); + case '\'': + /* Fallthrough. */ + case '\"': + return tokenizeStringLiteral(); + } + + const Token nc(tokenizeNCName()); + + handleWhitespace(); + + const char pc = peekCurrent(); + const TokenMap* const t = lookupKeyword(nc.value); + + if(pc == '\'' || (pc == '"' && t)) + return tokenAndChangeState(t->token, Default, 0); + else + return nc; + + Q_ASSERT(false); + } + case Axis: + { + if(peekCurrent() == ':') + { + Q_ASSERT(peekAhead() == ':'); + m_pos += 2; + setState(AfterAxisSeparator); + return Token(COLONCOLON); + } + /* Fallthrough. */ + } + case AfterAxisSeparator: + /* Fallthrough. */ + case Default: + /* State Operator and state Default have a lot of tokens in common except + * for minor differences. So we treat them the same way, and sprinkles logic + * here and there to handle the small differences. */ + /* Fallthrough. */ + case Operator: + { + switch(peekCurrent()) + { + case '=': + return tokenAndChangeState(G_EQ, Default); + case '-': + return tokenAndChangeState(MINUS, Default); + case '+': + return tokenAndChangeState(PLUS, Default); + case '[': + return tokenAndChangeState(LBRACKET, Default); + case ']': + return tokenAndChangeState(RBRACKET, Operator); + case ',': + return tokenAndChangeState(COMMA, Default); + case ';': + return tokenAndChangeState(SEMI_COLON, Default); + case '$': + return tokenAndChangeState(DOLLAR, VarName); + case '|': + return tokenAndChangeState(BAR, Default); + case '?': + return tokenAndChangeState(QUESTION, Operator); + case ')': + return tokenAndChangeState(RPAREN, Operator); + case '@': + return tokenAndChangeState(AT_SIGN, Default); + /* Fallthrough all these. */ + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '0': + return tokenizeNumberLiteral(); + case '.': + { + const char next = peekAhead(); + if(next == '.') + return tokenAndChangeState(DOTDOT, Operator, 2); + /* .5 is allowed, as short form for 0.5: + * <tt>[142] DecimalLiteral ::= ("." Digits) | (Digits "." [0-9]*)</tt> + */ + else if(isDigit(next)) + return tokenizeNumberLiteral(); + else + return tokenAndChangeState(DOT, Operator); + } + case '\'': + /* Fallthrough. */ + case '"': + { + setState(Operator); + return tokenizeStringLiteral(); + + } + case '(': + { + if(peekAhead() == '#') + return tokenAndChangeState(PRAGMA_START, Pragma, 2); + else + return tokenAndChangeState(LPAREN, Default); + } + case '*': + { + if(peekAhead() == ':') + { + m_pos += 2; /* Consume *:. */ + const Token nc = tokenizeNCName(); + + if(nc.hasError()) + return error(); + else + return tokenAndChangeState(ANY_PREFIX, nc.value, Operator); + } + else + return tokenAndChangeState(STAR, state() == Default ? Operator : Default); + } + case ':': + { + switch(peekAhead()) + { + case '=': + return tokenAndChangeState(ASSIGN, Default, 2); + case ':': + return tokenAndChangeState(COLONCOLON, Default, 2); + default: + return error(); + } + } + case '!': + { + if(peekAhead() == '=') + return tokenAndChangeState(G_NE, Default, 2); + else + return error(); + } + case '<': + { + switch(peekAhead()) + { + case '=': + return tokenAndChangeState(G_LE, Default, 2); + case '<': + return tokenAndChangeState(PRECEDES, Default, 2); + case '?': + { + pushState(Operator); + return tokenAndChangeState(PI_START, ProcessingInstructionName, 2); + } + case '!': + { + if(aheadEquals("!--", 3)) + { + m_pos += 3; /* Consume "!--". */ + pushState(Operator); + return tokenAndChangeState(COMMENT_START, XMLComment); + } + /* Fallthrough. It's a syntax error, and this is a good way to report it. */ + } + default: + { + if((m_pos + 1) < m_length && isNCNameStart(m_data.at(m_pos + 1))) + { + /* We assume it's an element constructor. */ + pushState(Operator); + } + + return tokenAndChangeState(G_LT, state() == Operator ? Default : StartTag); + } + } + } + case '>': + { + switch(peekAhead()) + { + case '=': + return tokenAndChangeState(G_GE, Default, 2); + case '>': + return tokenAndChangeState(FOLLOWS, Default, 2); + default: + return tokenAndChangeState(G_GT, Default); + } + } + case '/': + { + if(peekAhead() == '/') + return tokenAndChangeState(SLASHSLASH, Default, 2); + else + return tokenAndChangeState(SLASH, Default); + } + case '{': + { + pushState(Operator); + return tokenAndChangeState(CURLY_LBRACE, Default); + } + case '}': + { + popState(); + + return tokenAndAdvance(CURLY_RBRACE); + } + } + + /* Ok. We're in state Default or Operator, and it wasn't a simple + * character. */ + + const Token id(tokenizeNCName()); + + if(id.type != NCNAME) + return id; + + const TokenMap *const keyword = lookupKeyword(id.value); + + if(state() == Operator) + { + if(keyword) + { + if(keyword->token == DEFAULT || keyword->token == ASCENDING || keyword->token == DESCENDING) + setState(Operator); + else if(keyword->token == RETURN) + setState(Default); + else if(isPhraseKeyword(keyword->token)) + { + const TokenType ws = consumeWhitespace(); + if(ws == ERROR) + return error(); + + const Token id2(tokenizeNCName()); + const TokenMap *const keyword2 = lookupKeyword(id2.value); + + if(keyword2) + { + if(keyword->token == TREAT && keyword2->token == AS) + setState(ItemType); + else if (keyword->token == CAST || (keyword->token == CASTABLE && keyword2->token == AS) || keyword2->token == BY) + setState(Default); + + m_tokenStack.push(Token(keyword2->token)); + } + else + m_tokenStack.push(id2); + + return Token(keyword->token); + } + else + { + /* Such that we tokenize the second token in "empty greatest". */ + if(keyword->token != EMPTY) + setState(Default); + } + + if(keyword->token == AS || keyword->token == CASE) + setState(ItemType); + + return Token(keyword->token); + } + else + return id; + } + + Q_ASSERT(state() == Default || state() == Axis || state() == AfterAxisSeparator); + + /* + * This is hard. Consider this: + * + * Valid: child ::nameTest + * Valid: child:: nameTest + * Syntax Error: child :localName + * Syntax Error: child: localName + * + * Consider "child ::name". Right now, we're here: + * ^ + * We don't know whether "child" is a prefix and hence the whitespace is invalid, + * or whether it's an axis and hence skippable. */ + { + const int wsLength = peekForColonColon(); + /* We cannot call handleWhitespace() because it returns on + * END_OF_FILE, and we have parsed up keyword, and we need to + * deal with that. + * + * If we have a colon colon, which means the whitespace is + * allowed, we skip it. */ + if(wsLength != -1) + m_pos += wsLength; + } + + /* Handle name tests. */ + if(peekCurrent() == ':') + { + switch(peekAhead()) + { + case '=': + return id; + case '*': + { + m_pos += 2; + return tokenAndChangeState(ANY_LOCAL_NAME, id.value, Operator); + } + case ':': + { + /* We have an axis. */ + setState(Axis); + return keyword ? Token(keyword->token) : id; + } + default: + { + /* It's a QName. */ + ++m_pos; /* Consume the colon. */ + + const Token id2(tokenizeNCName()); + + if(id2.type != NCNAME) + { + --m_pos; + return id; + } + + setState(Operator); + const int qNameLen = id.value.length() + id2.value.length() + 1; + return Token(QNAME, m_data.mid(m_pos - qNameLen, qNameLen)); + } + } + } + + if(!keyword || isOperatorKeyword(keyword->token)) + { + setState(Operator); + return id; + } + + const TokenType ws = consumeWhitespace(); + if(ws == ERROR) // TODO this should test for success. Write test. + return Token(ERROR); + + if(atEnd()) + { + setState(Operator); + return id; + } + + /* Let the if-body apply for constructors, and node type tests. */ + if(isTypeToken(keyword->token) || + keyword->token == TYPESWITCH || + keyword->token == ORDERED || + keyword->token == UNORDERED || + keyword->token == IF) + { + switch(peekCurrent()) + { + case '(': + { + // TODO See if we can remove DOCUMENT from isTypeToken. + if(isTypeToken(keyword->token) && keyword->token != DOCUMENT) + { + m_tokenStack.push(Token(LPAREN)); + ++m_pos; /* Consume '('. */ + pushState(Operator); + + if(keyword->token == PROCESSING_INSTRUCTION) + setState(KindTestForPI); + else + setState(KindTest); + + return Token(keyword->token); + } + else if(keyword->token == TYPESWITCH || keyword->token == IF) + return Token(keyword->token); + else /* It's a function call. */ + return id; + } + case '{': + { + m_tokenStack.push(Token(CURLY_LBRACE)); + ++m_pos; /* Consume '{'. */ + pushState(Operator); + /* Stay in state Default. */ + return Token(keyword->token); + } + default: + { + /* We have read in a token which is for instance + * "return", and now it can be an element + * test("element") a node kind test("element()"), or a + * computed element constructor("element name {..."). + * We need to do a two-token lookahead here, because + * "element return" can be an element test followed by + * the return keyword, but it can also be an element + * constructor("element return {"). */ + if(isNCNameStart(current())) + { + const int currentPos = m_pos; + const Token token2 = tokenizeNCNameOrQName(); + + if(token2.hasError()) + return token2; + + handleWhitespace(); + + if(peekCurrent() == '{') + { + /* An element constructor. */ + m_tokenStack.push(token2); + return Token(keyword->token); + } + + /* We jump back in the stream, we need to tokenize token2 according + * to the state. */ + m_pos = currentPos; + setState(Operator); + return Token(NCNAME, QLatin1String(keyword->name)); + } + } + } + } + + if(peekCurrent() == '$') + { + setState(VarName); + return Token(keyword->token); + } + + /* It's not a node type, it's not the typeswitch expression, but it is a function callsite. */ + if(peekCurrent() == '(') + return id; + else if(peekCurrent() == '{' && keyword->token == VALIDATE) + return Token(keyword->token); + + if(!isNCNameStart(current())) + { + setState(Operator); + return id; + } + + const Token id2(tokenizeNCName()); + const TokenMap *const keyword2 = lookupKeyword(id2.value); + + if(!keyword2) + { + /* It's a syntax error. All cases of two subsequent ncnames are keywords(e.g, declarations). */ + setState(Operator); + return id; + } + + switch(keyword->token) + { + case DECLARE: + { + switch(keyword2->token) + { + case VARIABLE: + /* Fallthrough. */ + case FUNCTION: + { + m_tokenStack.push(Token(keyword2->token)); + setState(Default); + return Token(keyword->token); + } + case OPTION: + { + m_tokenStack.push(Token(keyword2->token)); + setState(Default); + return Token(keyword->token); + } + case COPY_NAMESPACES: + /* Fallthrough. */ + case ORDERING: + { + m_tokenStack.push(Token(keyword2->token)); + setState(NamespaceKeyword); + return Token(keyword->token); + } + case CONSTRUCTION: + { + // TODO identical to CONSTRUCTION? + m_tokenStack.push(Token(keyword2->token)); + setState(Operator); + return Token(keyword->token); + } + case NAMESPACE: + /* Fallthrough. */ + case BASEURI: + { + m_tokenStack.push(Token(keyword2->token)); + setState(NamespaceDecl); + return Token(keyword->token); + } + case BOUNDARY_SPACE: + { + m_tokenStack.push(Token(keyword2->token)); + setState(XMLSpaceDecl); + return Token(keyword->token); + } + case DEFAULT: + { + m_tokenStack.push(Token(keyword2->token)); + + const TokenType ws2 = consumeWhitespace(); + if(ws2 != SUCCESS) + { + m_tokenStack.prepend(Token(ws2)); + return Token(keyword->token); + } + + const Token id3(tokenizeNCName()); + + if(id3.type != NCNAME) + { + m_tokenStack.prepend(id3); + return Token(keyword->token); + } + + const TokenMap *const keyword3 = lookupKeyword(id3.value); + if(!keyword3) + { + m_tokenStack.prepend(id3); + return Token(keyword->token); + } + else + { + m_tokenStack.prepend(Token(keyword3->token)); + + if(keyword3->token == ORDER) + setState(Operator); + else + setState(NamespaceDecl); + } + + return Token(keyword->token); + } + default: + { + m_tokenStack.push(Token(keyword2->token)); + setState(Default); + return id; + } + } + } + case XQUERY: + { + m_tokenStack.push(Token(keyword2->token)); + + if(keyword2->token == VERSION) + { + setState(NamespaceDecl); + return Token(keyword->token); + } + else + { + setState(Operator); + return id; + } + } + case IMPORT: + { + m_tokenStack.push(Token(keyword2->token)); + + switch(keyword2->token) + { + case SCHEMA: + /* Fallthrough. */ + case MODULE: + { + setState(NamespaceKeyword); + return Token(keyword->token); + } + default: + { + setState(Operator); + return id; + } + } + } + case VALIDATE: + { + m_tokenStack.push(Token(keyword2->token)); + + switch(keyword2->token) + { + case LAX: + case STRICT: + { + pushState(Operator); + return Token(keyword->token); + } + default: + { + setState(Operator); + return id; + } + } + } + default: + { + m_tokenStack.push(Token(keyword2->token)); + setState(Operator); + return id; + } + } + + Q_ASSERT(false); + + } + case VarName: + { + if(peekCurrent() == '$') + return tokenAndAdvance(DOLLAR); + + setState(Operator); + return tokenizeNCNameOrQName(); + Q_ASSERT(false); + } + case ItemType: + { + switch(peekCurrent()) + { + case '(': + return tokenAndChangeState(LPAREN, KindTest); + case '$': + return tokenAndChangeState(DOLLAR, VarName); + } + + const Token name(tokenizeNCNameOrQName()); + + if(name.hasError()) + return error(); + + else if(name.type == QNAME) + { + setState(OccurrenceIndicator); + return name; + } + else + { + const TokenMap *const keyword = lookupKeyword(name.value); + + if(keyword) + { + pushState(OccurrenceIndicator); + return Token(keyword->token); + } + else + { + setState(Default); + return name; + } + } + Q_ASSERT(false); + } + case KindTest: + { + switch(peekCurrent()) + { + case ')': + { + popState(); + return tokenAndAdvance(RPAREN); + } + case '(': + return tokenAndAdvance(LPAREN); + case ',': + return tokenAndAdvance(COMMA); + case '*': + return tokenAndAdvance(STAR); + case '?': + return tokenAndAdvance(QUESTION); + case '\'': + /* Fallthrough. */ + case '"': + return tokenizeStringLiteral(); + } + + const Token nc(tokenizeNCNameOrQName()); + if(nc.hasError()) + return nc; + + const TokenType ws = consumeWhitespace(); + if(ws == ERROR) + return error(); + + if(peekCurrent() == '(') + { + const TokenMap *const keyword = lookupKeyword(nc.value); + if(keyword) + { + pushState(KindTest); + return Token(keyword->token); + } + else + return nc; + } + else + return nc; + Q_ASSERT(false); + } + case KindTestForPI: + { + switch(peekCurrent()) + { + case ')': + { + popState(); + return tokenAndAdvance(RPAREN); + } + case '\'': + /* Fallthrough. */ + case '"': + return tokenizeStringLiteral(); + default: + return tokenizeNCName(); + } + Q_ASSERT(false); + } + case OccurrenceIndicator: + { + switch(peekCurrent()) + { + case '?': + return tokenAndChangeState(QUESTION, Operator); + case '*': + return tokenAndChangeState(STAR, Operator); + case '+': + return tokenAndChangeState(PLUS, Operator); + default: + { + setState(Operator); + return nextToken(); + } + } + Q_ASSERT(false); + } + case XQueryVersion: + { + switch(peekCurrent()) + { + case '\'': + /* Fallthrough. */ + case '"': + return tokenizeStringLiteral(); + case ';': + return tokenAndChangeState(SEMI_COLON, Default); + } + + const Token id(tokenizeNCName()); + + if(id.type != NCNAME) + return id; + + const TokenMap *const keyword = lookupKeyword(id.value); + if(keyword) + return tokenAndChangeState(keyword->token, Default); + else + return id; + Q_ASSERT(false); + } + case StartTag: + { + if(peekAhead(-1) == '<') + { + if(current().isSpace()) + return Token(ERROR); + } + else + { + if(consumeRawWhitespace()) + return Token(END_OF_FILE); + } + + switch(peekCurrent()) + { + case '/': + { + if(peekAhead() == '>') + { + m_pos += 2; + + if(m_scanOnly) + return Token(POSITION_SET); + else + { + popState(); + return Token(QUICK_TAG_END); + } + } + else + return error(); + } + case '>': + { + if(m_scanOnly) + return tokenAndChangeState(POSITION_SET, StartTag); + else + return tokenAndChangeState(G_GT, ElementContent); + } + case '=': + return tokenAndAdvance(G_EQ); + case '\'': + return tokenAndChangeState(APOS, AposAttributeContent); + case '"': + return tokenAndChangeState(QUOTE, QuotAttributeContent); + default: + return tokenizeNCNameOrQName(); + } + Q_ASSERT(false); + } + case AposAttributeContent: + /* Fallthrough. */ + case QuotAttributeContent: + { + const QChar sep(state() == AposAttributeContent ? QLatin1Char('\'') : QLatin1Char('"')); + QString result; + result.reserve(20); + + if(m_scanOnly) + { + int stack = 0; + return attributeAsRaw(sep, stack, m_pos, true, result); + } + + Q_ASSERT(!m_scanOnly); + while(true) + { + if(atEnd()) + { + /* In the case that the XSL-T tokenizer invokes us with + * default state QuotAttributeContent, we need to be able + * to return a single string, in case that is all we have + * accumulated. */ + if(result.isEmpty()) + return Token(END_OF_FILE); + else + return Token(STRING_LITERAL, result); + } + + const QChar curr(current()); + + if(curr == sep) + { + if(m_pos + 1 == m_length) + return Token(END_OF_FILE); + + if(m_data.at(m_pos + 1) == sep) + { + /* The quoting mechanism was used. */ + m_pos += 2; + result.append(sep); + continue; + } + + const QChar next(m_data.at(m_pos + 1)); + if(!next.isSpace() && next != QLatin1Char('/') && next != QLatin1Char('>')) + return Token(ERROR); // i18n Space must separate attributes + else if(result.isEmpty()) + { + return tokenAndChangeState(state() == AposAttributeContent ? APOS : QUOTE, + StartTag, 1); + } + else + { + /* Don't consume the sep, but leave it so we next time return a token for it. */ + return Token(STRING_LITERAL, result); + } + + ++m_pos; + continue; + } + else if(curr == QLatin1Char('{')) + { + if(m_pos + 1 == m_length) + return Token(END_OF_FILE); + else if(peekAhead() == '{') + { + ++m_pos; + result.append(QLatin1Char('{')); + } + else + { + if(result.isEmpty()) + { + /* The Attribute Value Template appeared directly in the attribute. */ + pushState(); + return tokenAndChangeState(CURLY_LBRACE, Default); + } + else + { + /* We don't advance, keep '{' as next token. */ + return Token(STRING_LITERAL, result); + } + } + } + else if(curr == QLatin1Char('}')) + { + if(m_pos + 1 == m_length) + return Token(END_OF_FILE); + else if(peekAhead() == '}') + { + ++m_pos; + result.append(QLatin1Char('}')); + } + else + return Token(ERROR); + } + else if(curr == QLatin1Char('&')) + { + const QString ret(tokenizeCharacterReference()); + if(ret.isNull()) + return Token(ERROR); + else + result.append(ret); + } + else if(curr == QLatin1Char('<')) + return Token(STRING_LITERAL, result); + else + { + /* See Extensible Markup Language (XML) 1.0 (Fourth Edition), + * 3.3.3 Attribute-Value Normalization. + * + * However, it is complicated a bit by that AVN is defined on top of + * EOL normalization and we do those two in one go here. */ + switch(curr.unicode()) + { + case 0xD: + { + if(peekAhead() == '\n') + { + result.append(QLatin1Char(' ')); + ++m_pos; + break; + } + } + case 0xA: + /* Fallthrough. */ + case 0x9: + { + result.append(QLatin1Char(' ')); + break; + } + default: + result.append(curr); + } + } + + ++m_pos; + } + Q_ASSERT(false); + } + case ElementContent: + { + QString result; + result.reserve(20); + + /* Whether the text node, result, may be whitespace only. Character references + * and CDATA sections disables that. */ + bool mayBeWS = true; + + CharacterSkips skipEOLNormalization; + + while(true) + { + if(atEnd()) + return Token(END_OF_FILE); + + switch(peekCurrent()) + { + case '<': + { + if(!result.isEmpty() && peekAhead(2) != '[') + { + /* We encountered the end, and it was not a CDATA section. */ + /* We don't advance. Next time we'll handle the <... stuff. */ + return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization)); + } + + ++m_pos; + if(atEnd()) + return Token(END_OF_FILE); + + const QChar ahead(current()); + if(ahead.isSpace()) + return error(); + else if(ahead == QLatin1Char('/')) + { + if(m_pos + 1 == m_length) + return Token(END_OF_FILE); + else if(m_data.at(m_pos + 1).isSpace()) + return error(); + else + return tokenAndChangeState(BEGIN_END_TAG, EndTag); + } + else if(isNCNameStart(ahead)) + { + pushState(); + return tokenAndChangeState(G_LT, StartTag, 0); + } + else if(aheadEquals("!--", 3, 0)) + { + pushState(); + m_pos += 3; + return tokenAndChangeState(COMMENT_START, XMLComment, 0); + } + else if(aheadEquals("![CDATA[", 8, 0)) + { + mayBeWS = false; + m_pos += 8; + const int start = m_pos; + const int len = scanUntil("]]>"); + + if(len == -1) + return Token(END_OF_FILE); + + m_pos += 2; /* Consume "]]>". Note that m_pos is on '!'. */ + result.append(m_data.mid(start, len)); + break; + } + else if(ahead == QLatin1Char('?')) + { + pushState(); + return tokenAndChangeState(PI_START, ProcessingInstructionName); + } + else + return Token(G_LT); + } + case '&': + { + const QString ret(tokenizeCharacterReference()); + if(ret.isNull()) + return Token(ERROR); + else + { + skipEOLNormalization.insert(result.count()); + result.append(ret); + mayBeWS = false; + break; + } + } + case '{': + { + // TODO remove this check, also below. + if(m_pos + 1 == m_length) + return Token(END_OF_FILE); + else if(peekAhead() == '{') + { + ++m_pos; + result.append(QLatin1Char('{')); + } + else + { + if(result.isEmpty()) + { + pushState(); + return tokenAndChangeState(CURLY_LBRACE, Default); + } + else + { + /* We don't advance here. */ + return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization)); + } + } + break; + } + case '}': + { + if(m_pos + 1 == m_length) + return Token(END_OF_FILE); + else if(peekAhead() == '}') + { + ++m_pos; + result.append(QLatin1Char('}')); + } + else + { + /* This is a parse error, and the grammar won't be able + * to reduce this CURLY_RBRACE. */ + return tokenAndChangeState(CURLY_RBRACE, Default); + } + break; + } + case '\n': + { + /* We want to translate \r\n into \n. */ + if(peekAhead(-1) == '\r') + break; + /* else, fallthrough. */ + } + case '\r': + { + result.append(QLatin1Char('\n')); + break; + } + default: + { + result.append(current()); + break; + } + } + ++m_pos; + } + Q_ASSERT(false); + } + case ProcessingInstructionName: + { + const int start = m_pos; + + while(true) + { + ++m_pos; + if(m_pos >= m_length) + return Token(END_OF_FILE); + + const QChar next(current()); + if(next.isSpace() || next == QLatin1Char('?')) + { + return tokenAndChangeState(PI_TARGET, m_data.mid(start, m_pos - start), + ProcessingInstructionContent); + } + } + Q_ASSERT(false); + } + case ProcessingInstructionContent: + { + /* Consume whitespace between the name and the content. */ + if(consumeRawWhitespace()) + return Token(END_OF_FILE); + + const int start = m_pos; + const int len = scanUntil("?>"); + + if(len == -1) + return Token(END_OF_FILE); + else + { + m_pos += 2; /* Consume "?>" */ + popState(); + return Token(PI_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips())); + } + Q_ASSERT(false); + } + case EndTag: + { + if(consumeRawWhitespace()) + return END_OF_FILE; + + if(peekCurrent() == '>') + { + popState(); + return tokenAndAdvance(G_GT); + } + else + return tokenizeNCNameOrQName(); + Q_ASSERT(false); + } + case XMLComment: + { + const int start = m_pos; + const int len = scanUntil("--"); + + if(len == -1) + return END_OF_FILE; + else + { + m_pos += 2; /* Consume "--". */ + popState(); + + if(peekCurrent() == '>') + { + ++m_pos; + return Token(COMMENT_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips())); + } + else + return error(); + } + Q_ASSERT(false); + } + case Pragma: + { + /* Consume whitespace. */ + if(consumeRawWhitespace()) + return Token(END_OF_FILE); + + setState(PragmaContent); + return tokenizeNCNameOrQName(); + } + case PragmaContent: + { + QString result; + result.reserve(20); + + const bool hasWS = m_pos < m_length && current().isSpace(); + + /* Consume all whitespace up to the pragma content(if any). */ + if(consumeRawWhitespace()) + return Token(END_OF_FILE); + + if(peekCurrent() == '#' && peekAhead() == ')') + { + /* We reached the end, and there's no pragma content. */ + return tokenAndChangeState(PRAGMA_END, Default, 2); + } + else if(!hasWS) + { + /* A separating space is required if there's pragma content. */ + return error(); /* i18n */ + } + + const int start = m_pos; + const int len = scanUntil("#)"); + if(len == -1) + return Token(END_OF_FILE); + + return Token(STRING_LITERAL, m_data.mid(start, len)); + Q_ASSERT(false); + } + } + + Q_ASSERT(false); + return error(); +} + +Tokenizer::Token XQueryTokenizer::attributeAsRaw(const QChar sep, + int &sepStack, + const int startPos, + const bool aInLiteral, + QString &result) +{ + bool inLiteral = aInLiteral; + const char otherSep = (sep == QLatin1Char('"') ? '\'' : '"'); + + while(true) + { + if(atEnd()) + return END_OF_FILE; + + if(peekCurrent() == sep.unicode()) + { + if(inLiteral) + inLiteral = false; + else + inLiteral = true; + + if(peekAhead() == sep.unicode()) + { + /* The quoting mechanism was used. */ + result.append(current()); + m_pos += 2; + continue; + } + else + { + /* Don't consume the separator, such that we + * return a token for it next time. */ + if(m_pos == startPos) + { + ++m_pos; + setState(StartTag); + return Token(sep == QLatin1Char('"') ? QUOTE : APOS); + } + + + if(sepStack == 0) + { + return Token(STRING_LITERAL, result); + } + else + { + result.append(current()); + ++m_pos; + continue; + } + } + } + else if(peekCurrent() == '&') + { + const QString ret(tokenizeCharacterReference()); + if(ret.isNull()) + return Token(ERROR); + else + { + result.append(ret); + ++m_pos; + continue; + } + } + else if(peekCurrent() == otherSep) + { + result.append(current()); + ++m_pos; + + if(peekCurrent() == otherSep) + ++m_pos; + + if(inLiteral) + inLiteral = false; + else + inLiteral = true; + + continue; + } + else if(peekCurrent() == '{') + { + result.append(current()); + + if(peekAhead() == '{') + { + m_pos += 2; + continue; + } + else + { + ++m_pos; + ++sepStack; + const Token t(attributeAsRaw(sep, sepStack, startPos, false, result)); + if(t.type != SUCCESS) + return t; + } + + } + else if(peekCurrent() == '}') + { + if(inLiteral && peekAhead() == '}') + { + result.append(current()); + m_pos += 2; + continue; + } + else + { + ++m_pos; + --sepStack; + return Token(SUCCESS); /* The return value is arbitrary. */ + } + } + else + { + result.append(current()); + ++m_pos; + } + } +} + +Tokenizer::Token XQueryTokenizer::nextToken(YYLTYPE *const sourceLocator) +{ + sourceLocator->first_line = m_line; + sourceLocator->first_column = m_pos - m_columnOffset + 1; /* Plus 1, since m_pos is 0-based. */ + + if(m_tokenStack.isEmpty()) + return nextToken(); + else + { + const Token retval(m_tokenStack.pop()); + + switch(retval.type) + { + case MODULE: + /* Fallthrough.*/ + case SCHEMA: + /* Fallthrough.*/ + case COPY_NAMESPACES: + { + setState(NamespaceKeyword); + break; + } + case VERSION: + { + setState(XQueryVersion); + break; + } + case AS: + /* Fallthrough. */ + case OF: + { + setState(ItemType); + break; + } + default: + { + if(isOperatorKeyword(retval.type)) + setState(Default); + + break; + } + }; + + return retval; + } +} + +int XQueryTokenizer::commenceScanOnly() +{ + m_scanOnly = true; + return m_pos; +} + +void XQueryTokenizer::resumeTokenizationFrom(const int pos) +{ + m_scanOnly = false; + m_pos = pos; +} + +void XQueryTokenizer::setParserContext(const ParserContext::Ptr &) +{ +} + +#undef handleWhitespace + +} // namespace QPatternist + +QT_END_NAMESPACE |