diff options
Diffstat (limited to 'src/xmlpatterns/parser/qxquerytokenizer.cpp')
-rw-r--r-- | src/xmlpatterns/parser/qxquerytokenizer.cpp | 2249 |
1 files changed, 2249 insertions, 0 deletions
diff --git a/src/xmlpatterns/parser/qxquerytokenizer.cpp b/src/xmlpatterns/parser/qxquerytokenizer.cpp new file mode 100644 index 0000000..7e96f13 --- /dev/null +++ b/src/xmlpatterns/parser/qxquerytokenizer.cpp @@ -0,0 +1,2249 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** Contact: Qt Software Information (qt-info@nokia.com) +** +** This file is part of the QtXmlPatterns module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the either Technology Preview License Agreement or the +** Beta Release License Agreement. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain +** additional rights. These rights are described in the Nokia Qt LGPL +** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this +** package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. +** +** If you are unsure which license is appropriate for your use, please +** contact the sales department at qt-sales@nokia.com. +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include <QByteArray> + +#include "qquerytransformparser_p.h" + +#include "qxquerytokenizer_p.h" + +#include "qtokenlookup.cpp" + +QT_BEGIN_NAMESPACE + +namespace QPatternist +{ + +#define handleWhitespace() \ +{ \ + const TokenType t = consumeWhitespace(); \ + if(t != SUCCESS) \ + return Token(t); \ +} + +XQueryTokenizer::XQueryTokenizer(const QString &query, + const QUrl &location, + const State startingState) : Tokenizer(location) + , m_data(query) + , m_length(query.length()) + , m_state(startingState) + , m_pos(0) + , m_line(1) + , m_columnOffset(0) + , m_scanOnly(false) +{ + Q_ASSERT(location.isValid() || location.isEmpty()); +} + +const QChar XQueryTokenizer::current() const +{ + if(m_pos < m_length) + return m_data.at(m_pos); + else + return QChar(); +} + +char XQueryTokenizer::peekCurrent() const +{ + return current().toAscii(); +} + +int XQueryTokenizer::peekForColonColon() const +{ + /* Note, we don't modify m_pos in this function, so we need to do offset + * calculations. */ + int pos = m_pos; + + while(pos < m_length) + { + switch(m_data.at(pos).toAscii()) + { + /* Fallthrough these four. */ + case ' ': + case '\t': + case '\n': + case '\r': + break; + case ':': + { + if(peekAhead((pos - m_pos) + 1) == ':') + return pos - m_pos; + /* Fallthrough. */ + } + default: + return -1; + } + ++pos; + } + + return -1; +} + +Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code, + const State s, + const int advance) +{ + Q_ASSERT(advance >= 0); + m_pos += advance; + setState(s); + return Token(code); +} + +Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code, + const QString &value, + const State s) +{ + setState(s); + return Token(code, value); +} + +Tokenizer::Token XQueryTokenizer::tokenAndAdvance(const TokenType code, + const int advance) +{ + Q_ASSERT(advance >= 0); + m_pos += advance; + return Token(code); +} + +QString XQueryTokenizer::normalizeEOL(const QString &input, + const CharacterSkips &characterSkips) +{ + const int len = input.count(); + QString result; + + /* The likely hood is rather high it'll be the same content. */ + result.reserve(len); + + for(int i = 0; i < len; ++i) + { + const QChar &at = input.at(i); + + if(characterSkips.contains(i)) + { + result.append(at); + continue; + } + switch(input.at(i).unicode()) + { + case '\r': + { + if(i + 1 < len && input.at(i + 1) == QLatin1Char('\n')) + ++i; + + /* Else, fallthrough. */ + } + case '\n': + { + result.append(QLatin1Char('\n')); + continue; + } + default: + { + result.append(at); + } + } + } + + return result; +} + +Tokenizer::TokenType XQueryTokenizer::consumeComment() +{ + /* Below, we return ERROR instead of END_OF_FILE such that the parser + * sees an invalid comment. */ + while(m_pos < m_length) + { + switch(peekCurrent()) + { + case ':': + { + ++m_pos; /* Consume ':' */ + if(atEnd()) + return ERROR; + + if(peekCurrent() == ')') + { + ++m_pos; /* Consume ')' */ + return SUCCESS; /* The comment closed nicely. */ + } + continue; /* We don't want to increment m_pos twice. */ + } + case '(': + { /* It looks like the start of a comment. */ + ++m_pos; + + if(atEnd()) + return END_OF_FILE; + else if(peekCurrent() == ':') + { + /* And it is a nested comment -- parse it. */ + const TokenType retval = consumeComment(); + if(retval == SUCCESS) + continue; /* Continue with our "own" comment. */ + else + return retval; /* Return the error in the nested comment. */ + } + break; + } + case '\n': + /* Fallthrough. */ + case '\r': + { + /* We want to count \r\n as a single line break. */ + if(peekAhead() == '\n') + ++m_pos; + + m_columnOffset = m_pos; + ++m_line; + + break; + } + } + ++m_pos; + } + + return ERROR; /* Error: we reached the end while inside a comment. */ +} + +bool XQueryTokenizer::consumeRawWhitespace() +{ + while(m_pos < m_length) + { + switch(peekCurrent()) + { + case ' ': + case '\t': + break; + case '\n': + case '\r': + { + if(peekAhead() == '\n') + ++m_pos; + + m_columnOffset = m_pos; + ++m_line; + + break; + } + default: + return false; + } + ++m_pos; + } + return true; +} + +Tokenizer::TokenType XQueryTokenizer::consumeWhitespace() +{ + while(m_pos < m_length) + { + switch(peekCurrent()) + { + case ' ': + case '\t': + break; + case '\n': + case '\r': + { + /* We want to count \r\n as a single line break. */ + if(peekAhead() == '\n') + ++m_pos; + + m_columnOffset = m_pos; + ++m_line; + + break; + } + case '(': + { + if(peekAhead() == ':') + { + m_pos += 2; /* Consume "(:" */ + + const TokenType comment = consumeComment(); + if(comment == SUCCESS) + continue; + else + return comment; + } + } + default: + return SUCCESS; + } + ++m_pos; + } + + return END_OF_FILE; +} + +char XQueryTokenizer::peekAhead(const int length) const +{ + if(m_pos + length < m_length) + return m_data.at(m_pos + length).toAscii(); + else + return 0; +} + +Tokenizer::Token XQueryTokenizer::error() +{ + return Token(ERROR); +} + +bool XQueryTokenizer::isDigit(const char ch) +{ + return ch >= '0' && ch <= '9'; +} + +/* Replace with function in QXmlUtils. Write test cases for this. */ +bool XQueryTokenizer::isNCNameStart(const QChar ch) +{ + if(ch == QLatin1Char('_')) + return true; + + switch(ch.category()) + { + case QChar::Letter_Lowercase: + case QChar::Letter_Uppercase: + case QChar::Letter_Other: + case QChar::Letter_Titlecase: + case QChar::Number_Letter: + return true; + default: + return false; + } +} + +bool XQueryTokenizer::isNCNameBody(const QChar ch) +{ + switch(ch.unicode()) + { + case '.': + case '_': + case '-': + return true; + } + + switch(ch.category()) + { + case QChar::Letter_Lowercase: + case QChar::Letter_Uppercase: + case QChar::Letter_Other: + case QChar::Letter_Titlecase: + case QChar::Number_Letter: + case QChar::Mark_SpacingCombining: + case QChar::Mark_Enclosing: + case QChar::Mark_NonSpacing: + case QChar::Letter_Modifier: + case QChar::Number_DecimalDigit: + return true; + default: + return false; + } +} + +bool XQueryTokenizer::isPhraseKeyword(const TokenType code) +{ + switch(code) + { + /* Fallthrough all these. */ + case CASTABLE: + case CAST: + case COPY_NAMESPACES: + case DECLARE: + case EMPTY: + case MODULE: + case IMPORT: + case INSTANCE: + case ORDER: + case ORDERING: + case XQUERY: + case STABLE: + case TREAT: + return true; + default: + return false; + } +} + +bool XQueryTokenizer::isOperatorKeyword(const TokenType code) +{ + switch(code) + { + /* Fallthrough all these. */ + case AS: + case ASCENDING: + case AT: + case CASE: + case CAST: + case CASTABLE: + case EQ: + case EXTERNAL: + case GE: + case G_EQ: + case G_GT: + case G_LT: + case G_NE: + case GT: + case IN: + case INHERIT: + case INSTANCE: + case IS: + case ITEM: + case LE: + case LT: + case NE: + case NO_INHERIT: + case NO_PRESERVE: + case OF: + case PRESERVE: + case RETURN: + case STABLE: + case TO: + case TREAT: + return true; + default: + return false; + }; +} + +bool XQueryTokenizer::isTypeToken(const TokenType t) +{ + switch(t) + { + /* Fallthrough all these. */ + case ATTRIBUTE: + case COMMENT: + case DOCUMENT: + case DOCUMENT_NODE: + case ELEMENT: + case ITEM: + case NODE: + case PROCESSING_INSTRUCTION: + case SCHEMA_ATTRIBUTE: + case SCHEMA_ELEMENT: + case TEXT: + return true; + default: + return false; + } +} + +Tokenizer::Token XQueryTokenizer::tokenizeNCNameOrQName() +{ + const int start = m_pos; + + const Token t1 = tokenizeNCName(); + if(t1.hasError()) + return t1; + + if(peekCurrent() != ':' || peekAhead() == '=') + return t1; + + ++m_pos; + + const Token t2 = tokenizeNCName(); + if(t2.hasError()) + return t2; + else + return Token(QNAME, m_data.mid(start, m_pos - start)); +} + +Tokenizer::Token XQueryTokenizer::tokenizeNumberLiteral() +{ + setState(Operator); + const int startPos = m_pos; + bool hasDot = false; + bool isXPath20 = false; + + for(; m_pos < m_length; ++m_pos) + { + QChar ch(current()); + + char cell = ch.cell(); + + if(cell == 'e' || cell == 'E') + { + isXPath20 = true; + ++m_pos; + ch = current(); + + if(ch.row() != 0) + break; + + cell = ch.cell(); + + if(cell == '+' || cell == '-') + continue; + } + + if(isNCNameStart(ch)) + return error(); + + if(cell < '0' || cell > '9') + { + if(cell == '.' && !hasDot) + hasDot = true; + else + break; + } + } + + return Token(isXPath20 ? XPATH2_NUMBER : NUMBER, m_data.mid(startPos, m_pos - startPos)); +} + +QString XQueryTokenizer::tokenizeCharacterReference() +{ + Q_ASSERT(peekCurrent() == '&'); + + const int theEnd = m_data.indexOf(QLatin1Char(';'), m_pos + 1); + + if(theEnd == -1) /* No ';' found, a syntax error. i18n. */ + return QString(); + + QString content(m_data.mid(m_pos + 1, (theEnd - m_pos) - 1)); + m_pos = theEnd; + + const QChar charRef(charForReference(content)); + + if(!charRef.isNull()) + return charRef; + else if(content.startsWith(QLatin1Char('#'))) + { + int base; + + /* It is only '#' or '#x'. */ + if(content.length() < 2) + return QString(); + + /* We got a hex number if it starts with 'x', otherwise it's a decimal. */ + if(content.at(1) == QLatin1Char('x')) + { + base = 16; + content = content.mid(2); /* Remove "#x". */ + } + else + { + base = 10; + content = content.mid(1); /* Remove "#". */ + } + + bool conversionOK = false; + const int codepoint = content.toInt(&conversionOK, base); + + if(conversionOK) + { + const QChar ch(codepoint); + + if(ch.isNull()) + { + /* We likely have something which require surrogate pairs. */ + QString result; + result += QChar(QChar::highSurrogate(codepoint)); + result += QChar(QChar::lowSurrogate(codepoint)); + return result; + } + else + return ch; + } + else + return QString(); + } + else + return QString(); +} + +int XQueryTokenizer::scanUntil(const char *const content) +{ + const int end = m_data.indexOf(QString::fromLatin1(content), m_pos); + + if(end == -1) + return -1; + else + { + const int len = end - m_pos; + m_pos += len; + return len; + } +} + +QChar XQueryTokenizer::charForReference(const QString &reference) +{ + if(m_charRefs.isEmpty()) + { + /* Initialize. */ + m_charRefs.reserve(5); + m_charRefs.insert(QLatin1String("lt"), QLatin1Char('<')); + m_charRefs.insert(QLatin1String("gt"), QLatin1Char('>')); + m_charRefs.insert(QLatin1String("amp"), QLatin1Char('&')); + m_charRefs.insert(QLatin1String("quot"), QLatin1Char('"')); + m_charRefs.insert(QLatin1String("apos"), QLatin1Char('\'')); + } + + return m_charRefs.value(reference); +} + +Tokenizer::Token XQueryTokenizer::tokenizeStringLiteral() +{ + const QChar delimiter(current()); + /* We cannot unfortunately just scan and then do mid(), + * since we can encounter character references. */ + QString result; + + /* This is more likely than QString's default allocation. */ + result.reserve(8); + + CharacterSkips skipEOLNormalization; + + /* Advance over the initial quote character. */ + ++m_pos; + + for(; m_pos < m_length; ++m_pos) + { + const QChar c(current()); + + if(c == QLatin1Char('&')) + { + const QString charRef(tokenizeCharacterReference()); + + if(charRef.isNull()) + return error(); + else + { + skipEOLNormalization.insert(result.count()); + result.append(charRef); + } + + } + else if(c == delimiter) + { + /* Maybe the escaping mechanism is used. For instance, "s""s" + * has the value `s"s'. */ + ++m_pos; + + if(current() == delimiter) /* Double quote. */ + result += delimiter; + else + return Token(STRING_LITERAL, normalizeEOL(result, skipEOLNormalization)); + } + else + result += c; + } + + return error(); +} + +Tokenizer::Token XQueryTokenizer::tokenizeNCName() +{ + const int startPos = m_pos; + + if(m_pos < m_length && isNCNameStart(current())) + { + ++m_pos; + + for(; m_pos < m_length; ++m_pos) + { + if(!isNCNameBody(current())) + break; + } + + return Token(NCNAME, m_data.mid(startPos, m_pos - startPos)); + } + else + return error(); +} + +bool XQueryTokenizer::aheadEquals(const char *const chs, + const int len, + const int offset) const +{ + Q_ASSERT(len > 0); + Q_ASSERT(qstrlen(chs) == uint(len)); + + if(m_pos + len >= m_length) + return false; + + for(int i = offset; i < (len + offset); ++i) + { + if(m_data.at(m_pos + i).toAscii() != chs[i - offset]) + return false; + } + + return true; +} + +const TokenMap *XQueryTokenizer::lookupKeyword(const QString &keyword) +{ + return TokenLookup::value(keyword.toAscii().constData(), keyword.length()); +} + +XQueryTokenizer::State XQueryTokenizer::state() const +{ + return m_state; +} + +void XQueryTokenizer::setState(const State s) +{ + m_state = s; +} + +void XQueryTokenizer::pushState(const State s) +{ + m_stateStack.push(s); +} + +void XQueryTokenizer::pushState() +{ + m_stateStack.push(m_state); +} + +void XQueryTokenizer::popState() +{ + /* QStack::pop() asserts if it's empty, so we need to check + * it, since we might receive unbalanced curlies. */ + if(!m_stateStack.isEmpty()) + m_state = m_stateStack.pop(); +} + +Tokenizer::Token XQueryTokenizer::nextToken() +{ + switch(state()) + { + /* We want to skip or do special whitespace handling for these + * states. So fallthrough all of the following. */ + case AposAttributeContent: + case Axis: + case ElementContent: + case EndTag: + case Pragma: + case PragmaContent: + case ProcessingInstructionName: + case QuotAttributeContent: + case StartTag: + case XMLComment: + break; + default: + handleWhitespace(); + } + + switch(state()) + { + case XMLSpaceDecl: + /* Fallthrough. */ + case NamespaceKeyword: + { + switch(peekCurrent()) + { + case ',': + return tokenAndAdvance(COMMA); + case '"': + /* Fallthrough. */ + case '\'': + { + setState(NamespaceDecl); + return tokenizeStringLiteral(); + } + } + + const Token id(tokenizeNCName()); + + if(id.type != NCNAME) + return id; + + const TokenMap *const keyword = lookupKeyword(id.value); + if(keyword) + { + switch(keyword->token) + { + case INHERIT: + /* Fallthrough. */ + case NO_INHERIT: + { + setState(Default); + break; + } + case NAMESPACE: + { + setState(NamespaceDecl); + break; + } + case ORDERED: + /* Fallthrough. */ + case UNORDERED: + /* Fallthrough. */ + case STRIP: + { + setState(Default); + break; + } + case PRESERVE: + { + if(state() != NamespaceKeyword) + setState(Default); + } + default: + break; + } + + return Token(keyword->token); + } + else + return id; + + Q_ASSERT(false); + } + case NamespaceDecl: + { + switch(peekCurrent()) + { + case '=': + return tokenAndAdvance(G_EQ); + case ';': + return tokenAndChangeState(SEMI_COLON, Default); + case '\'': + /* Fallthrough. */ + case '\"': + return tokenizeStringLiteral(); + } + + const Token nc(tokenizeNCName()); + + handleWhitespace(); + + const char pc = peekCurrent(); + const TokenMap* const t = lookupKeyword(nc.value); + + if(pc == '\'' || (pc == '"' && t)) + return tokenAndChangeState(t->token, Default, 0); + else + return nc; + + Q_ASSERT(false); + } + case Axis: + { + if(peekCurrent() == ':') + { + Q_ASSERT(peekAhead() == ':'); + m_pos += 2; + setState(AfterAxisSeparator); + return Token(COLONCOLON); + } + /* Fallthrough. */ + } + case AfterAxisSeparator: + /* Fallthrough. */ + case Default: + /* State Operator and state Default have a lot of tokens in common except + * for minor differences. So we treat them the same way, and sprinkles logic + * here and there to handle the small differences. */ + /* Fallthrough. */ + case Operator: + { + switch(peekCurrent()) + { + case '=': + return tokenAndChangeState(G_EQ, Default); + case '-': + return tokenAndChangeState(MINUS, Default); + case '+': + return tokenAndChangeState(PLUS, Default); + case '[': + return tokenAndChangeState(LBRACKET, Default); + case ']': + return tokenAndChangeState(RBRACKET, Operator); + case ',': + return tokenAndChangeState(COMMA, Default); + case ';': + return tokenAndChangeState(SEMI_COLON, Default); + case '$': + return tokenAndChangeState(DOLLAR, VarName); + case '|': + return tokenAndChangeState(BAR, Default); + case '?': + return tokenAndChangeState(QUESTION, Operator); + case ')': + return tokenAndChangeState(RPAREN, Operator); + case '@': + return tokenAndChangeState(AT_SIGN, Default); + /* Fallthrough all these. */ + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '0': + return tokenizeNumberLiteral(); + case '.': + { + const char next = peekAhead(); + if(next == '.') + return tokenAndChangeState(DOTDOT, Operator, 2); + /* .5 is allowed, as short form for 0.5: + * <tt>[142] DecimalLiteral ::= ("." Digits) | (Digits "." [0-9]*)</tt> + */ + else if(isDigit(next)) + return tokenizeNumberLiteral(); + else + return tokenAndChangeState(DOT, Operator); + } + case '\'': + /* Fallthrough. */ + case '"': + { + setState(Operator); + return tokenizeStringLiteral(); + + } + case '(': + { + if(peekAhead() == '#') + return tokenAndChangeState(PRAGMA_START, Pragma, 2); + else + return tokenAndChangeState(LPAREN, Default); + } + case '*': + { + if(peekAhead() == ':') + { + m_pos += 2; /* Consume *:. */ + const Token nc = tokenizeNCName(); + + if(nc.hasError()) + return error(); + else + return tokenAndChangeState(ANY_PREFIX, nc.value, Operator); + } + else + return tokenAndChangeState(STAR, state() == Default ? Operator : Default); + } + case ':': + { + switch(peekAhead()) + { + case '=': + return tokenAndChangeState(ASSIGN, Default, 2); + case ':': + return tokenAndChangeState(COLONCOLON, Default, 2); + default: + return error(); + } + } + case '!': + { + if(peekAhead() == '=') + return tokenAndChangeState(G_NE, Default, 2); + else + return error(); + } + case '<': + { + switch(peekAhead()) + { + case '=': + return tokenAndChangeState(G_LE, Default, 2); + case '<': + return tokenAndChangeState(PRECEDES, Default, 2); + case '?': + { + pushState(Operator); + return tokenAndChangeState(PI_START, ProcessingInstructionName, 2); + } + case '!': + { + if(aheadEquals("!--", 3)) + { + m_pos += 3; /* Consume "!--". */ + pushState(Operator); + return tokenAndChangeState(COMMENT_START, XMLComment); + } + /* Fallthrough. It's a syntax error, and this is a good way to report it. */ + } + default: + { + if((m_pos + 1) < m_length && isNCNameStart(m_data.at(m_pos + 1))) + { + /* We assume it's an element constructor. */ + pushState(Operator); + } + + return tokenAndChangeState(G_LT, state() == Operator ? Default : StartTag); + } + } + } + case '>': + { + switch(peekAhead()) + { + case '=': + return tokenAndChangeState(G_GE, Default, 2); + case '>': + return tokenAndChangeState(FOLLOWS, Default, 2); + default: + return tokenAndChangeState(G_GT, Default); + } + } + case '/': + { + if(peekAhead() == '/') + return tokenAndChangeState(SLASHSLASH, Default, 2); + else + return tokenAndChangeState(SLASH, Default); + } + case '{': + { + pushState(Operator); + return tokenAndChangeState(CURLY_LBRACE, Default); + } + case '}': + { + popState(); + + return tokenAndAdvance(CURLY_RBRACE); + } + } + + /* Ok. We're in state Default or Operator, and it wasn't a simple + * character. */ + + const Token id(tokenizeNCName()); + + if(id.type != NCNAME) + return id; + + const TokenMap *const keyword = lookupKeyword(id.value); + + if(state() == Operator) + { + if(keyword) + { + if(keyword->token == DEFAULT || keyword->token == ASCENDING || keyword->token == DESCENDING) + setState(Operator); + else if(keyword->token == RETURN) + setState(Default); + else if(isPhraseKeyword(keyword->token)) + { + const TokenType ws = consumeWhitespace(); + if(ws == ERROR) + return error(); + + const Token id2(tokenizeNCName()); + const TokenMap *const keyword2 = lookupKeyword(id2.value); + + if(keyword2) + { + if(keyword->token == TREAT && keyword2->token == AS) + setState(ItemType); + else if (keyword->token == CAST || (keyword->token == CASTABLE && keyword2->token == AS) || keyword2->token == BY) + setState(Default); + + m_tokenStack.push(Token(keyword2->token)); + } + else + m_tokenStack.push(id2); + + return Token(keyword->token); + } + else + { + /* Such that we tokenize the second token in "empty greatest". */ + if(keyword->token != EMPTY) + setState(Default); + } + + if(keyword->token == AS || keyword->token == CASE) + setState(ItemType); + + return Token(keyword->token); + } + else + return id; + } + + Q_ASSERT(state() == Default || state() == Axis || state() == AfterAxisSeparator); + + /* + * This is hard. Consider this: + * + * Valid: child ::nameTest + * Valid: child:: nameTest + * Syntax Error: child :localName + * Syntax Error: child: localName + * + * Consider "child ::name". Right now, we're here: + * ^ + * We don't know whether "child" is a prefix and hence the whitespace is invalid, + * or whether it's an axis and hence skippable. */ + { + const int wsLength = peekForColonColon(); + /* We cannot call handleWhitespace() because it returns on + * END_OF_FILE, and we have parsed up keyword, and we need to + * deal with that. + * + * If we have a colon colon, which means the whitespace is + * allowed, we skip it. */ + if(wsLength != -1) + m_pos += wsLength; + } + + /* Handle name tests. */ + if(peekCurrent() == ':') + { + switch(peekAhead()) + { + case '=': + return id; + case '*': + { + m_pos += 2; + return tokenAndChangeState(ANY_LOCAL_NAME, id.value, Operator); + } + case ':': + { + /* We have an axis. */ + setState(Axis); + return keyword ? Token(keyword->token) : id; + } + default: + { + /* It's a QName. */ + ++m_pos; /* Consume the colon. */ + + const Token id2(tokenizeNCName()); + + if(id2.type != NCNAME) + { + --m_pos; + return id; + } + + setState(Operator); + const int qNameLen = id.value.length() + id2.value.length() + 1; + return Token(QNAME, m_data.mid(m_pos - qNameLen, qNameLen)); + } + } + } + + if(!keyword || isOperatorKeyword(keyword->token)) + { + setState(Operator); + return id; + } + + const TokenType ws = consumeWhitespace(); + if(ws == ERROR) // TODO this should test for success. Write test. + return Token(ERROR); + + if(atEnd()) + { + setState(Operator); + return id; + } + + /* Let the if-body apply for constructors, and node type tests. */ + if(isTypeToken(keyword->token) || + keyword->token == TYPESWITCH || + keyword->token == ORDERED || + keyword->token == UNORDERED || + keyword->token == IF) + { + switch(peekCurrent()) + { + case '(': + { + // TODO See if we can remove DOCUMENT from isTypeToken. + if(isTypeToken(keyword->token) && keyword->token != DOCUMENT) + { + m_tokenStack.push(Token(LPAREN)); + ++m_pos; /* Consume '('. */ + pushState(Operator); + + if(keyword->token == PROCESSING_INSTRUCTION) + setState(KindTestForPI); + else + setState(KindTest); + + return Token(keyword->token); + } + else if(keyword->token == TYPESWITCH || keyword->token == IF) + return Token(keyword->token); + else /* It's a function call. */ + return id; + } + case '{': + { + m_tokenStack.push(Token(CURLY_LBRACE)); + ++m_pos; /* Consume '{'. */ + pushState(Operator); + /* Stay in state Default. */ + return Token(keyword->token); + } + default: + { + /* We have read in a token which is for instance + * "return", and now it can be an element + * test("element") a node kind test("element()"), or a + * computed element constructor("element name {..."). + * We need to do a two-token lookahead here, because + * "element return" can be an element test followed by + * the return keyword, but it can also be an element + * constructor("element return {"). */ + if(isNCNameStart(current())) + { + const int currentPos = m_pos; + const Token token2 = tokenizeNCNameOrQName(); + + if(token2.hasError()) + return token2; + + handleWhitespace(); + + if(peekCurrent() == '{') + { + /* An element constructor. */ + m_tokenStack.push(token2); + return Token(keyword->token); + } + + /* We jump back in the stream, we need to tokenize token2 according + * to the state. */ + m_pos = currentPos; + setState(Operator); + return Token(NCNAME, QLatin1String(keyword->name)); + } + } + } + } + + if(peekCurrent() == '$') + { + setState(VarName); + return Token(keyword->token); + } + + /* It's not a node type, it's not the typeswitch expression, but it is a function callsite. */ + if(peekCurrent() == '(') + return id; + else if(peekCurrent() == '{' && keyword->token == VALIDATE) + return Token(keyword->token); + + if(!isNCNameStart(current())) + { + setState(Operator); + return id; + } + + const Token id2(tokenizeNCName()); + const TokenMap *const keyword2 = lookupKeyword(id2.value); + + if(!keyword2) + { + /* It's a syntax error. All cases of two subsequent ncnames are keywords(e.g, declarations). */ + setState(Operator); + return id; + } + + switch(keyword->token) + { + case DECLARE: + { + switch(keyword2->token) + { + case VARIABLE: + /* Fallthrough. */ + case FUNCTION: + { + m_tokenStack.push(Token(keyword2->token)); + setState(Default); + return Token(keyword->token); + } + case OPTION: + { + m_tokenStack.push(Token(keyword2->token)); + setState(Default); + return Token(keyword->token); + } + case COPY_NAMESPACES: + /* Fallthrough. */ + case ORDERING: + { + m_tokenStack.push(Token(keyword2->token)); + setState(NamespaceKeyword); + return Token(keyword->token); + } + case CONSTRUCTION: + { + // TODO identical to CONSTRUCTION? + m_tokenStack.push(Token(keyword2->token)); + setState(Operator); + return Token(keyword->token); + } + case NAMESPACE: + /* Fallthrough. */ + case BASEURI: + { + m_tokenStack.push(Token(keyword2->token)); + setState(NamespaceDecl); + return Token(keyword->token); + } + case BOUNDARY_SPACE: + { + m_tokenStack.push(Token(keyword2->token)); + setState(XMLSpaceDecl); + return Token(keyword->token); + } + case DEFAULT: + { + m_tokenStack.push(Token(keyword2->token)); + + const TokenType ws2 = consumeWhitespace(); + if(ws2 != SUCCESS) + { + m_tokenStack.prepend(Token(ws2)); + return Token(keyword->token); + } + + const Token id3(tokenizeNCName()); + + if(id3.type != NCNAME) + { + m_tokenStack.prepend(id3); + return Token(keyword->token); + } + + const TokenMap *const keyword3 = lookupKeyword(id3.value); + if(!keyword3) + { + m_tokenStack.prepend(id3); + return Token(keyword->token); + } + else + { + m_tokenStack.prepend(Token(keyword3->token)); + + if(keyword3->token == ORDER) + setState(Operator); + else + setState(NamespaceDecl); + } + + return Token(keyword->token); + } + default: + { + m_tokenStack.push(Token(keyword2->token)); + setState(Default); + return id; + } + } + } + case XQUERY: + { + m_tokenStack.push(Token(keyword2->token)); + + if(keyword2->token == VERSION) + { + setState(NamespaceDecl); + return Token(keyword->token); + } + else + { + setState(Operator); + return id; + } + } + case IMPORT: + { + m_tokenStack.push(Token(keyword2->token)); + + switch(keyword2->token) + { + case SCHEMA: + /* Fallthrough. */ + case MODULE: + { + setState(NamespaceKeyword); + return Token(keyword->token); + } + default: + { + setState(Operator); + return id; + } + } + } + case VALIDATE: + { + m_tokenStack.push(Token(keyword2->token)); + + switch(keyword2->token) + { + case LAX: + case STRICT: + { + pushState(Operator); + return Token(keyword->token); + } + default: + { + setState(Operator); + return id; + } + } + } + default: + { + m_tokenStack.push(Token(keyword2->token)); + setState(Operator); + return id; + } + } + + Q_ASSERT(false); + + } + case VarName: + { + if(peekCurrent() == '$') + return tokenAndAdvance(DOLLAR); + + setState(Operator); + return tokenizeNCNameOrQName(); + Q_ASSERT(false); + } + case ItemType: + { + switch(peekCurrent()) + { + case '(': + return tokenAndChangeState(LPAREN, KindTest); + case '$': + return tokenAndChangeState(DOLLAR, VarName); + } + + const Token name(tokenizeNCNameOrQName()); + + if(name.hasError()) + return error(); + + else if(name.type == QNAME) + { + setState(OccurrenceIndicator); + return name; + } + else + { + const TokenMap *const keyword = lookupKeyword(name.value); + + if(keyword) + { + pushState(OccurrenceIndicator); + return Token(keyword->token); + } + else + { + setState(Default); + return name; + } + } + Q_ASSERT(false); + } + case KindTest: + { + switch(peekCurrent()) + { + case ')': + { + popState(); + return tokenAndAdvance(RPAREN); + } + case '(': + return tokenAndAdvance(LPAREN); + case ',': + return tokenAndAdvance(COMMA); + case '*': + return tokenAndAdvance(STAR); + case '?': + return tokenAndAdvance(QUESTION); + case '\'': + /* Fallthrough. */ + case '"': + return tokenizeStringLiteral(); + } + + const Token nc(tokenizeNCNameOrQName()); + if(nc.hasError()) + return nc; + + const TokenType ws = consumeWhitespace(); + if(ws == ERROR) + return error(); + + if(peekCurrent() == '(') + { + const TokenMap *const keyword = lookupKeyword(nc.value); + if(keyword) + { + pushState(KindTest); + return Token(keyword->token); + } + else + return nc; + } + else + return nc; + Q_ASSERT(false); + } + case KindTestForPI: + { + switch(peekCurrent()) + { + case ')': + { + popState(); + return tokenAndAdvance(RPAREN); + } + case '\'': + /* Fallthrough. */ + case '"': + return tokenizeStringLiteral(); + default: + return tokenizeNCName(); + } + Q_ASSERT(false); + } + case OccurrenceIndicator: + { + switch(peekCurrent()) + { + case '?': + return tokenAndChangeState(QUESTION, Operator); + case '*': + return tokenAndChangeState(STAR, Operator); + case '+': + return tokenAndChangeState(PLUS, Operator); + default: + { + setState(Operator); + return nextToken(); + } + } + Q_ASSERT(false); + } + case XQueryVersion: + { + switch(peekCurrent()) + { + case '\'': + /* Fallthrough. */ + case '"': + return tokenizeStringLiteral(); + case ';': + return tokenAndChangeState(SEMI_COLON, Default); + } + + const Token id(tokenizeNCName()); + + if(id.type != NCNAME) + return id; + + const TokenMap *const keyword = lookupKeyword(id.value); + if(keyword) + return tokenAndChangeState(keyword->token, Default); + else + return id; + Q_ASSERT(false); + } + case StartTag: + { + if(peekAhead(-1) == '<') + { + if(current().isSpace()) + return Token(ERROR); + } + else + { + if(consumeRawWhitespace()) + return Token(END_OF_FILE); + } + + switch(peekCurrent()) + { + case '/': + { + if(peekAhead() == '>') + { + m_pos += 2; + + if(m_scanOnly) + return Token(POSITION_SET); + else + { + popState(); + return Token(QUICK_TAG_END); + } + } + else + return error(); + } + case '>': + { + if(m_scanOnly) + return tokenAndChangeState(POSITION_SET, StartTag); + else + return tokenAndChangeState(G_GT, ElementContent); + } + case '=': + return tokenAndAdvance(G_EQ); + case '\'': + return tokenAndChangeState(APOS, AposAttributeContent); + case '"': + return tokenAndChangeState(QUOTE, QuotAttributeContent); + default: + return tokenizeNCNameOrQName(); + } + Q_ASSERT(false); + } + case AposAttributeContent: + /* Fallthrough. */ + case QuotAttributeContent: + { + const QChar sep(state() == AposAttributeContent ? QLatin1Char('\'') : QLatin1Char('"')); + QString result; + result.reserve(20); + + if(m_scanOnly) + { + int stack = 0; + return attributeAsRaw(sep, stack, m_pos, true, result); + } + + Q_ASSERT(!m_scanOnly); + while(true) + { + if(atEnd()) + { + /* In the case that the XSL-T tokenizer invokes us with + * default state QuotAttributeContent, we need to be able + * to return a single string, in case that is all we have + * accumulated. */ + if(result.isEmpty()) + return Token(END_OF_FILE); + else + return Token(STRING_LITERAL, result); + } + + const QChar curr(current()); + + if(curr == sep) + { + if(m_pos + 1 == m_length) + return Token(END_OF_FILE); + + if(m_data.at(m_pos + 1) == sep) + { + /* The quoting mechanism was used. */ + m_pos += 2; + result.append(sep); + continue; + } + + const QChar next(m_data.at(m_pos + 1)); + if(!next.isSpace() && next != QLatin1Char('/') && next != QLatin1Char('>')) + return Token(ERROR); // i18n Space must separate attributes + else if(result.isEmpty()) + { + return tokenAndChangeState(state() == AposAttributeContent ? APOS : QUOTE, + StartTag, 1); + } + else + { + /* Don't consume the sep, but leave it so we next time return a token for it. */ + return Token(STRING_LITERAL, result); + } + + ++m_pos; + continue; + } + else if(curr == QLatin1Char('{')) + { + if(m_pos + 1 == m_length) + return Token(END_OF_FILE); + else if(peekAhead() == '{') + { + ++m_pos; + result.append(QLatin1Char('{')); + } + else + { + if(result.isEmpty()) + { + /* The Attribute Value Template appeared directly in the attribute. */ + pushState(); + return tokenAndChangeState(CURLY_LBRACE, Default); + } + else + { + /* We don't advance, keep '{' as next token. */ + return Token(STRING_LITERAL, result); + } + } + } + else if(curr == QLatin1Char('}')) + { + if(m_pos + 1 == m_length) + return Token(END_OF_FILE); + else if(peekAhead() == '}') + { + ++m_pos; + result.append(QLatin1Char('}')); + } + else + return Token(ERROR); + } + else if(curr == QLatin1Char('&')) + { + const QString ret(tokenizeCharacterReference()); + if(ret.isNull()) + return Token(ERROR); + else + result.append(ret); + } + else if(curr == QLatin1Char('<')) + return Token(STRING_LITERAL, result); + else + { + /* See Extensible Markup Language (XML) 1.0 (Fourth Edition), + * 3.3.3 Attribute-Value Normalization. + * + * However, it is complicated a bit by that AVN is defined on top of + * EOL normalization and we do those two in one go here. */ + switch(curr.unicode()) + { + case 0xD: + { + if(peekAhead() == '\n') + { + result.append(QLatin1Char(' ')); + ++m_pos; + break; + } + } + case 0xA: + /* Fallthrough. */ + case 0x9: + { + result.append(QLatin1Char(' ')); + break; + } + default: + result.append(curr); + } + } + + ++m_pos; + } + Q_ASSERT(false); + } + case ElementContent: + { + QString result; + result.reserve(20); + + /* Whether the text node, result, may be whitespace only. Character references + * and CDATA sections disables that. */ + bool mayBeWS = true; + + CharacterSkips skipEOLNormalization; + + while(true) + { + if(atEnd()) + return Token(END_OF_FILE); + + switch(peekCurrent()) + { + case '<': + { + if(!result.isEmpty() && peekAhead(2) != '[') + { + /* We encountered the end, and it was not a CDATA section. */ + /* We don't advance. Next time we'll handle the <... stuff. */ + return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization)); + } + + ++m_pos; + if(atEnd()) + return Token(END_OF_FILE); + + const QChar ahead(current()); + if(ahead.isSpace()) + return error(); + else if(ahead == QLatin1Char('/')) + { + if(m_pos + 1 == m_length) + return Token(END_OF_FILE); + else if(m_data.at(m_pos + 1).isSpace()) + return error(); + else + return tokenAndChangeState(BEGIN_END_TAG, EndTag); + } + else if(isNCNameStart(ahead)) + { + pushState(); + return tokenAndChangeState(G_LT, StartTag, 0); + } + else if(aheadEquals("!--", 3, 0)) + { + pushState(); + m_pos += 3; + return tokenAndChangeState(COMMENT_START, XMLComment, 0); + } + else if(aheadEquals("![CDATA[", 8, 0)) + { + mayBeWS = false; + m_pos += 8; + const int start = m_pos; + const int len = scanUntil("]]>"); + + if(len == -1) + return Token(END_OF_FILE); + + m_pos += 2; /* Consume "]]>". Note that m_pos is on '!'. */ + result.append(m_data.mid(start, len)); + break; + } + else if(ahead == QLatin1Char('?')) + { + pushState(); + return tokenAndChangeState(PI_START, ProcessingInstructionName); + } + else + return Token(G_LT); + } + case '&': + { + const QString ret(tokenizeCharacterReference()); + if(ret.isNull()) + return Token(ERROR); + else + { + skipEOLNormalization.insert(result.count()); + result.append(ret); + mayBeWS = false; + break; + } + } + case '{': + { + // TODO remove this check, also below. + if(m_pos + 1 == m_length) + return Token(END_OF_FILE); + else if(peekAhead() == '{') + { + ++m_pos; + result.append(QLatin1Char('{')); + } + else + { + if(result.isEmpty()) + { + pushState(); + return tokenAndChangeState(CURLY_LBRACE, Default); + } + else + { + /* We don't advance here. */ + return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization)); + } + } + break; + } + case '}': + { + if(m_pos + 1 == m_length) + return Token(END_OF_FILE); + else if(peekAhead() == '}') + { + ++m_pos; + result.append(QLatin1Char('}')); + } + else + { + /* This is a parse error, and the grammar won't be able + * to reduce this CURLY_RBRACE. */ + return tokenAndChangeState(CURLY_RBRACE, Default); + } + break; + } + case '\n': + { + /* We want to translate \r\n into \n. */ + if(peekAhead(-1) == '\r') + break; + /* else, fallthrough. */ + } + case '\r': + { + result.append(QLatin1Char('\n')); + break; + } + default: + { + result.append(current()); + break; + } + } + ++m_pos; + } + Q_ASSERT(false); + } + case ProcessingInstructionName: + { + const int start = m_pos; + + while(true) + { + ++m_pos; + if(m_pos >= m_length) + return Token(END_OF_FILE); + + const QChar next(current()); + if(next.isSpace() || next == QLatin1Char('?')) + { + return tokenAndChangeState(PI_TARGET, m_data.mid(start, m_pos - start), + ProcessingInstructionContent); + } + } + Q_ASSERT(false); + } + case ProcessingInstructionContent: + { + /* Consume whitespace between the name and the content. */ + if(consumeRawWhitespace()) + return Token(END_OF_FILE); + + const int start = m_pos; + const int len = scanUntil("?>"); + + if(len == -1) + return Token(END_OF_FILE); + else + { + m_pos += 2; /* Consume "?>" */ + popState(); + return Token(PI_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips())); + } + Q_ASSERT(false); + } + case EndTag: + { + if(consumeRawWhitespace()) + return END_OF_FILE; + + if(peekCurrent() == '>') + { + popState(); + return tokenAndAdvance(G_GT); + } + else + return tokenizeNCNameOrQName(); + Q_ASSERT(false); + } + case XMLComment: + { + const int start = m_pos; + const int len = scanUntil("--"); + + if(len == -1) + return END_OF_FILE; + else + { + m_pos += 2; /* Consume "--". */ + popState(); + + if(peekCurrent() == '>') + { + ++m_pos; + return Token(COMMENT_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips())); + } + else + return error(); + } + Q_ASSERT(false); + } + case Pragma: + { + /* Consume whitespace. */ + if(consumeRawWhitespace()) + return Token(END_OF_FILE); + + setState(PragmaContent); + return tokenizeNCNameOrQName(); + } + case PragmaContent: + { + QString result; + result.reserve(20); + + const bool hasWS = m_pos < m_length && current().isSpace(); + + /* Consume all whitespace up to the pragma content(if any). */ + if(consumeRawWhitespace()) + return Token(END_OF_FILE); + + if(peekCurrent() == '#' && peekAhead() == ')') + { + /* We reached the end, and there's no pragma content. */ + return tokenAndChangeState(PRAGMA_END, Default, 2); + } + else if(!hasWS) + { + /* A separating space is required if there's pragma content. */ + return error(); /* i18n */ + } + + const int start = m_pos; + const int len = scanUntil("#)"); + if(len == -1) + return Token(END_OF_FILE); + + return Token(STRING_LITERAL, m_data.mid(start, len)); + Q_ASSERT(false); + } + } + + Q_ASSERT(false); + return error(); +} + +Tokenizer::Token XQueryTokenizer::attributeAsRaw(const QChar sep, + int &sepStack, + const int startPos, + const bool aInLiteral, + QString &result) +{ + bool inLiteral = aInLiteral; + const char otherSep = (sep == QLatin1Char('"') ? '\'' : '"'); + + while(true) + { + if(atEnd()) + return END_OF_FILE; + + if(peekCurrent() == sep.unicode()) + { + if(inLiteral) + inLiteral = false; + else + inLiteral = true; + + if(peekAhead() == sep.unicode()) + { + /* The quoting mechanism was used. */ + result.append(current()); + m_pos += 2; + continue; + } + else + { + /* Don't consume the separator, such that we + * return a token for it next time. */ + if(m_pos == startPos) + { + ++m_pos; + setState(StartTag); + return Token(sep == QLatin1Char('"') ? QUOTE : APOS); + } + + + if(sepStack == 0) + { + return Token(STRING_LITERAL, result); + } + else + { + result.append(current()); + ++m_pos; + continue; + } + } + } + else if(peekCurrent() == '&') + { + const QString ret(tokenizeCharacterReference()); + if(ret.isNull()) + return Token(ERROR); + else + { + result.append(ret); + ++m_pos; + continue; + } + } + else if(peekCurrent() == otherSep) + { + result.append(current()); + ++m_pos; + + if(peekCurrent() == otherSep) + ++m_pos; + + if(inLiteral) + inLiteral = false; + else + inLiteral = true; + + continue; + } + else if(peekCurrent() == '{') + { + result.append(current()); + + if(peekAhead() == '{') + { + m_pos += 2; + continue; + } + else + { + ++m_pos; + ++sepStack; + const Token t(attributeAsRaw(sep, sepStack, startPos, false, result)); + if(t.type != SUCCESS) + return t; + } + + } + else if(peekCurrent() == '}') + { + if(inLiteral && peekAhead() == '}') + { + result.append(current()); + m_pos += 2; + continue; + } + else + { + ++m_pos; + --sepStack; + return Token(SUCCESS); /* The return value is arbitrary. */ + } + } + else + { + result.append(current()); + ++m_pos; + } + } +} + +Tokenizer::Token XQueryTokenizer::nextToken(YYLTYPE *const sourceLocator) +{ + sourceLocator->first_line = m_line; + sourceLocator->first_column = m_pos - m_columnOffset + 1; /* Plus 1, since m_pos is 0-based. */ + + if(m_tokenStack.isEmpty()) + return nextToken(); + else + { + const Token retval(m_tokenStack.pop()); + + switch(retval.type) + { + case MODULE: + /* Fallthrough.*/ + case SCHEMA: + /* Fallthrough.*/ + case COPY_NAMESPACES: + { + setState(NamespaceKeyword); + break; + } + case VERSION: + { + setState(XQueryVersion); + break; + } + case AS: + /* Fallthrough. */ + case OF: + { + setState(ItemType); + break; + } + default: + { + if(isOperatorKeyword(retval.type)) + setState(Default); + + break; + } + }; + + return retval; + } +} + +int XQueryTokenizer::commenceScanOnly() +{ + m_scanOnly = true; + return m_pos; +} + +void XQueryTokenizer::resumeTokenizationFrom(const int pos) +{ + m_scanOnly = false; + m_pos = pos; +} + +void XQueryTokenizer::setParserContext(const ParserContext::Ptr &) +{ +} + +#undef handleWhitespace + +} // namespace QPatternist + +QT_END_NAMESPACE |