/****************************************************************************
**
** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
** All rights reserved.
** Contact: Nokia Corporation (qt-info@nokia.com)
**
** This file is part of the tools applications of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** No Commercial Usage
** This file contains pre-release code and may not be distributed.
** You may use this file in accordance with the terms and conditions
** contained in the Technology Preview License Agreement accompanying
** this package.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 2.1 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 2.1 requirements
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Nokia gives you certain additional
** rights. These rights are described in the Nokia Qt LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
** If you have questions regarding the use of this file, please contact
** Nokia at qt-info@nokia.com.
**
**
**
**
**
**
**
** $QT_END_LICENSE$
**
****************************************************************************/

#include "config.h"
#include "tokenizer.h"

#include <qfile.h>
#include <qhash.h>
#include <qregexp.h>
#include <qstring.h>
#include <qtextcodec.h>

#include <ctype.h>
#include <stdio.h>
#include <string.h>

QT_BEGIN_NAMESPACE

#define LANGUAGE_CPP "Cpp"

/* qmake ignore Q_OBJECT */

/* Keep in sync with tokenizer.h. */
static const char *kwords[] = {
    "char", "class", "const", "double", "enum", "explicit",
    "friend", "inline", "int", "long", "namespace", "operator",
    "private", "protected", "public", "short", "signals", "signed",
    "slots", "static", "struct", "template", "typedef", "typename",
    "union", "unsigned", "using", "virtual", "void", "volatile",
    "__int64",
    "Q_OBJECT", "Q_OVERRIDE", "Q_PROPERTY", "Q_PRIVATE_PROPERTY",
    "Q_DECLARE_SEQUENTIAL_ITERATOR", "Q_DECLARE_MUTABLE_SEQUENTIAL_ITERATOR",
    "Q_DECLARE_ASSOCIATIVE_ITERATOR", "Q_DECLARE_MUTABLE_ASSOCIATIVE_ITERATOR",
    "Q_DECLARE_FLAGS", "Q_SIGNALS", "Q_SLOTS",
    "QT_COMPAT", "QT_COMPAT_CONSTRUCTOR", "QT_DEPRECATED", "QT_MOC_COMPAT",
    "QT_MODULE", "QT3_SUPPORT", "QT3_SUPPORT_CONSTRUCTOR", "QT3_MOC_SUPPORT",
    "QDOC_PROPERTY"
};

static const int KwordHashTableSize = 4096;
static int kwordHashTable[KwordHashTableSize];

static QHash<QByteArray, bool> *ignoredTokensAndDirectives = 0;

static QRegExp *comment = 0;
static QRegExp *versionX = 0;
static QRegExp *definedX = 0;

static QRegExp *defines = 0;
static QRegExp *falsehoods = 0;

static QTextCodec *sourceCodec = 0;

/*
  This function is a perfect hash function for the 37 keywords of C99
  (with a hash table size of 512). It should perform well on our
  Qt-enhanced C++ subset.
*/
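/*
  The table itself uses a simple encoding: kwordHashTable[k] is 0 for an
  empty slot, i > 0 for the keyword kwords[i - 1], and -1 for a token or
  directive that qdoc has been told to ignore. Collisions are resolved by
  linear probing, both in insertKwordIntoHash() below and in the
  identifier branch of getToken().
*/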
static int hashKword(const char *s, int len)
{
    return (((uchar) s[0]) + (((uchar) s[2]) << 5) +
            (((uchar) s[len - 1]) << 3)) % KwordHashTableSize;
}

static void insertKwordIntoHash(const char *s, int number)
{
    int k = hashKword(s, strlen(s));
    while (kwordHashTable[k]) {
        if (++k == KwordHashTableSize)
            k = 0;
    }
    kwordHashTable[k] = number;
}

Tokenizer::Tokenizer(const Location& loc, QFile &in)
{
    init();
    yyIn = in.readAll();
    yyPos = 0;
    start(loc);
}

Tokenizer::Tokenizer(const Location& loc, const QByteArray &in)
    : yyIn(in)
{
    init();
    yyPos = 0;
    start(loc);
}

Tokenizer::~Tokenizer()
{
    delete[] yyLexBuf1;
    delete[] yyLexBuf2;
}
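/*
  A minimal usage sketch (illustrative only; it assumes Location's
  single-QString constructor declared in location.h): construct a
  Tokenizer on an open source file and pull tokens until Tok_Eoi.

      QFile file("example.cpp");
      if (file.open(QIODevice::ReadOnly)) {
          Tokenizer tokenizer(Location(file.fileName()), file);
          int tok = tokenizer.getToken();
          while (tok != Tok_Eoi) {
              // tokenizer.lexeme() is the text of the token just read
              tok = tokenizer.getToken();
          }
      }
*/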
int Tokenizer::getToken()
{
    char *t = yyPrevLex;
    yyPrevLex = yyLex;
    yyLex = t;

    while (yyCh != EOF) {
        yyTokLoc = yyCurLoc;
        yyLexLen = 0;

        if (isspace(yyCh)) {
            do {
                yyCh = getChar();
            } while (isspace(yyCh));
        } else if (isalpha(yyCh) || yyCh == '_') {
            do {
                yyCh = getChar();
            } while (isalnum(yyCh) || yyCh == '_');

            int k = hashKword(yyLex, yyLexLen);
            for (;;) {
                int i = kwordHashTable[k];
                if (i == 0) {
                    return Tok_Ident;
                } else if (i == -1) {
                    if (!parsingMacro && ignoredTokensAndDirectives->contains(yyLex)) {
                        if (ignoredTokensAndDirectives->value(yyLex)) { // it's a directive
                            int parenDepth = 0;
                            while (yyCh != EOF && (yyCh != ')' || parenDepth > 1)) {
                                if (yyCh == '(')
                                    ++parenDepth;
                                else if (yyCh == ')')
                                    --parenDepth;
                                yyCh = getChar();
                            }
                            if (yyCh == ')')
                                yyCh = getChar();
                        }
                        break;
                    }
                } else if (strcmp(yyLex, kwords[i - 1]) == 0) {
                    int ret = (int) Tok_FirstKeyword + i - 1;
                    if (ret != Tok_explicit && ret != Tok_inline && ret != Tok_typename)
                        return ret;
                    break;
                }

                if (++k == KwordHashTableSize)
                    k = 0;
            }
        } else if (isdigit(yyCh)) {
            do {
                yyCh = getChar();
            } while (isalnum(yyCh) || yyCh == '.' || yyCh == '+' || yyCh == '-');
            return Tok_Number;
        } else {
            switch (yyCh) {
            case '!':
            case '%':
                yyCh = getChar();
                if (yyCh == '=')
                    yyCh = getChar();
                return Tok_SomeOperator;
            case '"':
                yyCh = getChar();

                while (yyCh != EOF && yyCh != '"') {
                    if (yyCh == '\\')
                        yyCh = getChar();
                    yyCh = getChar();
                }
                yyCh = getChar();

                if (yyCh == EOF)
                    yyTokLoc.warning(tr("Unterminated C++ string literal"),
                                     tr("Maybe you forgot '/*!' at the beginning of the file?"));
                else
                    return Tok_String;
                break;
            case '#':
                return getTokenAfterPreprocessor();
            case '&':
                yyCh = getChar();
                if (yyCh == '&' || yyCh == '=') {
                    yyCh = getChar();
                    return Tok_SomeOperator;
                } else {
                    return Tok_Ampersand;
                }
            case '\'':
                yyCh = getChar();
                if (yyCh == '\\')
                    yyCh = getChar();
                do {
                    yyCh = getChar();
                } while (yyCh != EOF && yyCh != '\'');

                if (yyCh == EOF) {
                    yyTokLoc.warning(tr("Unterminated C++ character"
                                        " literal"));
                } else {
                    yyCh = getChar();
                    return Tok_Number;
                }
                break;
            case '(':
                yyCh = getChar();
                if (yyNumPreprocessorSkipping == 0)
                    yyParenDepth++;
                if (isspace(yyCh)) {
                    do {
                        yyCh = getChar();
                    } while (isspace(yyCh));
                    yyLexLen = 1;
                    yyLex[1] = '\0';
                }
                if (yyCh == '*') {
                    yyCh = getChar();
                    return Tok_LeftParenAster;
                }
                return Tok_LeftParen;
            case ')':
                yyCh = getChar();
                if (yyNumPreprocessorSkipping == 0)
                    yyParenDepth--;
                return Tok_RightParen;
            case '*':
                yyCh = getChar();
                if (yyCh == '=') {
                    yyCh = getChar();
                    return Tok_SomeOperator;
                } else {
                    return Tok_Aster;
                }
            case '^':
                yyCh = getChar();
                if (yyCh == '=') {
                    yyCh = getChar();
                    return Tok_SomeOperator;
                } else {
                    return Tok_Caret;
                }
            case '+':
                yyCh = getChar();
                if (yyCh == '+' || yyCh == '=')
                    yyCh = getChar();
                return Tok_SomeOperator;
            case ',':
                yyCh = getChar();
                return Tok_Comma;
            case '-':
                yyCh = getChar();
                if (yyCh == '-' || yyCh == '=') {
                    yyCh = getChar();
                } else if (yyCh == '>') {
                    yyCh = getChar();
                    if (yyCh == '*')
                        yyCh = getChar();
                }
                return Tok_SomeOperator;
            case '.':
                yyCh = getChar();
                if (yyCh == '*') {
                    yyCh = getChar();
                } else if (yyCh == '.') {
                    do {
                        yyCh = getChar();
                    } while (yyCh == '.');
                    return Tok_Ellipsis;
                } else if (isdigit(yyCh)) {
                    do {
                        yyCh = getChar();
                    } while (isalnum(yyCh) || yyCh == '.' || yyCh == '+' ||
                             yyCh == '-');
                    return Tok_Number;
                }
                return Tok_SomeOperator;
            case '/':
                yyCh = getChar();
                if (yyCh == '/') {
                    do {
                        yyCh = getChar();
                    } while (yyCh != EOF && yyCh != '\n');
                } else if (yyCh == '*') {
                    bool metDoc = false; // empty doc is no doc
                    bool metSlashAsterBang = false;
                    bool metAster = false;
                    bool metAsterSlash = false;

                    yyCh = getChar();
                    if (yyCh == '!')
                        metSlashAsterBang = true;

                    while (!metAsterSlash) {
                        if (yyCh == EOF) {
                            yyTokLoc.warning(tr("Unterminated C++ comment"));
                            break;
                        } else {
                            if (yyCh == '*') {
                                metAster = true;
                            } else if (metAster && yyCh == '/') {
                                metAsterSlash = true;
                            } else {
                                metAster = false;
                                if (isgraph(yyCh))
                                    metDoc = true;
                            }
                        }
                        yyCh = getChar();
                    }
                    if (metSlashAsterBang && metDoc)
                        return Tok_Doc;
                    else if (yyParenDepth > 0)
                        return Tok_Comment;
                } else {
                    if (yyCh == '=')
                        yyCh = getChar();
                    return Tok_SomeOperator;
                }
                break;
            case ':':
                yyCh = getChar();
                if (yyCh == ':') {
                    yyCh = getChar();
                    return Tok_Gulbrandsen;
                } else {
                    return Tok_Colon;
                }
            case ';':
                yyCh = getChar();
                return Tok_Semicolon;
            case '<':
                yyCh = getChar();
                if (yyCh == '<') {
                    yyCh = getChar();
                    if (yyCh == '=')
                        yyCh = getChar();
                    return Tok_SomeOperator;
                } else if (yyCh == '=') {
                    yyCh = getChar();
                    return Tok_SomeOperator;
                } else {
                    return Tok_LeftAngle;
                }
            case '=':
                yyCh = getChar();
                if (yyCh == '=') {
                    yyCh = getChar();
                    return Tok_SomeOperator;
                } else {
                    return Tok_Equal;
                }
            case '>':
                yyCh = getChar();
                if (yyCh == '>') {
                    yyCh = getChar();
                    if (yyCh == '=')
                        yyCh = getChar();
                    return Tok_SomeOperator;
                } else if (yyCh == '=') {
                    yyCh = getChar();
                    return Tok_SomeOperator;
                } else {
                    return Tok_RightAngle;
                }
            case '?':
                yyCh = getChar();
                return Tok_SomeOperator;
            case '[':
                yyCh = getChar();
                if (yyNumPreprocessorSkipping == 0)
                    yyBracketDepth++;
                return Tok_LeftBracket;
            case '\\':
                yyCh = getChar();
                yyCh = getChar(); // skip one character
                break;
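            /*
              Note that yyParenDepth, yyBracketDepth, and yyBraceDepth are
              only updated while no preprocessor skipping is in effect
              (yyNumPreprocessorSkipping == 0). getToken() keeps consuming
              tokens inside skipped regions, so without this guard a lone
              '{' under "#if 0" would leave yyBraceDepth unbalanced.
            */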
            case ']':
                yyCh = getChar();
                if (yyNumPreprocessorSkipping == 0)
                    yyBracketDepth--;
                return Tok_RightBracket;
            case '{':
                yyCh = getChar();
                if (yyNumPreprocessorSkipping == 0)
                    yyBraceDepth++;
                return Tok_LeftBrace;
            case '}':
                yyCh = getChar();
                if (yyNumPreprocessorSkipping == 0)
                    yyBraceDepth--;
                return Tok_RightBrace;
            case '|':
                yyCh = getChar();
                if (yyCh == '|' || yyCh == '=')
                    yyCh = getChar();
                return Tok_SomeOperator;
            case '~':
                yyCh = getChar();
                return Tok_Tilde;
            case '@':
                yyCh = getChar();
                return Tok_At;
            default:
                // ### We should really prevent qdoc from looking at snippet files rather than
                // ### suppress warnings when reading them.
                if (yyNumPreprocessorSkipping == 0 && !yyTokLoc.fileName().endsWith(".qdoc")) {
                    yyTokLoc.warning(tr("Hostile character 0x%1 in C++ source")
                                     .arg((uchar)yyCh, 1, 16));
                }
                yyCh = getChar();
            }
        }
    }

    if (yyPreprocessorSkipping.count() > 1) {
        yyTokLoc.warning(tr("Expected #endif before end of file"));
        // clear it out or we get an infinite loop!
        while (!yyPreprocessorSkipping.isEmpty()) {
            popSkipping();
        }
    }

    strcpy(yyLex, "end-of-input");
    yyLexLen = strlen(yyLex);
    return Tok_Eoi;
}

void Tokenizer::initialize(const Config &config)
{
    QString versionSym = config.getString(CONFIG_VERSIONSYM);

    QString sourceEncoding = config.getString(CONFIG_SOURCEENCODING);
    if (sourceEncoding.isEmpty())
        sourceEncoding = QLatin1String("ISO-8859-1");
    sourceCodec = QTextCodec::codecForName(sourceEncoding.toLocal8Bit());

    comment = new QRegExp("/(?:\\*.*\\*/|/.*\n|/[^\n]*$)");
    comment->setMinimal(true);

    versionX = new QRegExp("$cannot possibly match^");
    if (!versionSym.isEmpty())
        versionX->setPattern("[ \t]*(?:" + QRegExp::escape(versionSym)
                             + ")[ \t]+\"([^\"]*)\"[ \t]*");

    definedX = new QRegExp("defined ?\\(?([A-Z_0-9a-z]+) ?\\)");

    QStringList d = config.getStringList(CONFIG_DEFINES);
    d += "qdoc";
    defines = new QRegExp(d.join("|"));
    falsehoods = new QRegExp(config.getStringList(CONFIG_FALSEHOODS).join("|"));

    memset(kwordHashTable, 0, sizeof(kwordHashTable));
    for (int i = 0; i < Tok_LastKeyword - Tok_FirstKeyword + 1; i++)
        insertKwordIntoHash(kwords[i], i + 1);

    ignoredTokensAndDirectives = new QHash<QByteArray, bool>;

    QStringList tokens = config.getStringList(LANGUAGE_CPP + Config::dot + CONFIG_IGNORETOKENS);
    foreach (const QString &t, tokens) {
        const QByteArray tb = t.toAscii();
        ignoredTokensAndDirectives->insert(tb, false);
        insertKwordIntoHash(tb.data(), -1);
    }

    QStringList directives = config.getStringList(LANGUAGE_CPP + Config::dot + CONFIG_IGNOREDIRECTIVES);
    foreach (const QString &d, directives) {
        const QByteArray db = d.toAscii();
        ignoredTokensAndDirectives->insert(db, true);
        insertKwordIntoHash(db.data(), -1);
    }
}

void Tokenizer::terminate()
{
    delete comment;
    comment = 0;
    delete versionX;
    versionX = 0;
    delete definedX;
    definedX = 0;
    delete defines;
    defines = 0;
    delete falsehoods;
    falsehoods = 0;
    delete ignoredTokensAndDirectives;
    ignoredTokensAndDirectives = 0;
}

void Tokenizer::init()
{
    yyLexBuf1 = new char[(int) yyLexBufSize];
    yyLexBuf2 = new char[(int) yyLexBufSize];
    yyPrevLex = yyLexBuf1;
    yyPrevLex[0] = '\0';
    yyLex = yyLexBuf2;
    yyLex[0] = '\0';
    yyLexLen = 0;
    yyPreprocessorSkipping.push(false);
    yyNumPreprocessorSkipping = 0;
    yyBraceDepth = 0;
    yyParenDepth = 0;
    yyBracketDepth = 0;
    yyCh = '\0';
    parsingMacro = false;
}

void Tokenizer::start(const Location& loc)
{
    yyTokLoc = loc;
    yyCurLoc = loc;
    yyCurLoc.start();
    strcpy(yyPrevLex, "beginning-of-input");
    strcpy(yyLex, "beginning-of-input");
    yyLexLen = strlen(yyLex);
    yyBraceDepth = 0;
    yyParenDepth = 0;
    yyBracketDepth = 0;
    yyCh = '\0';
    yyCh = getChar();
}
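/*
  The configuration variables read by initialize() above come from the
  project's .qdocconf file. A minimal sketch, assuming the usual string
  values of the CONFIG_* keys used there ("versionsym", "sourceencoding",
  "defines", "falsehoods", "ignoretokens", "ignoredirectives"); the
  right-hand sides are illustrative only:

      versionsym           = QT_VERSION_STR
      sourceencoding       = ISO-8859-1
      defines              = Q_QDOC QT_.*_SUPPORT
      falsehoods           = 0
      Cpp.ignoretokens     = QT_BEGIN_NAMESPACE QT_END_NAMESPACE
      Cpp.ignoredirectives = Q_DECLARE_INTERFACE Q_DECLARE_METATYPE
*/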
/*
  Returns the next token, if # was met. This function interprets the
  preprocessor directive, skips over any #ifdef'd out tokens, and returns
  the token after all of that.
*/
int Tokenizer::getTokenAfterPreprocessor()
{
    yyCh = getChar();
    while (isspace(yyCh) && yyCh != '\n')
        yyCh = getChar();

    /*
      #directive condition
    */
    QString directive;
    QString condition;

    while (isalpha(yyCh)) {
        directive += QChar(yyCh);
        yyCh = getChar();
    }
    if (!directive.isEmpty()) {
        while (yyCh != EOF && yyCh != '\n') {
            if (yyCh == '\\')
                yyCh = getChar();
            condition += yyCh;
            yyCh = getChar();
        }
        condition.replace(*comment, "");
        condition = condition.simplified();

        /*
          The #if, #ifdef, #ifndef, #elif, #else, and #endif directives have
          an effect on the skipping stack. For instance, if the code processed
          so far is

              #if 1
              #if 0
              #if 1
              // ...
              #else

          the skipping stack contains, from bottom to top, false true true
          (assuming 0 is false and 1 is true). If at least one entry of the
          stack is true, the tokens are skipped.

          This mechanism is simple yet hard to understand.
        */
        if (directive[0] == QChar('i')) {
            if (directive == QString("if"))
                pushSkipping(!isTrue(condition));
            else if (directive == QString("ifdef"))
                pushSkipping(!defines->exactMatch(condition));
            else if (directive == QString("ifndef"))
                pushSkipping(defines->exactMatch(condition));
        } else if (directive[0] == QChar('e')) {
            if (directive == QString("elif")) {
                bool old = popSkipping();
                if (old)
                    pushSkipping(!isTrue(condition));
                else
                    pushSkipping(true);
            } else if (directive == QString("else")) {
                pushSkipping(!popSkipping());
            } else if (directive == QString("endif")) {
                popSkipping();
            }
        } else if (directive == QString("define")) {
            if (versionX->exactMatch(condition))
                yyVersion = versionX->cap(1);
        }
    }

    int tok;
    do {
        /*
          We set yyLex now, and after getToken() this will be yyPrevLex. This
          way, we skip over the preprocessor directive.
        */
        qstrcpy(yyLex, yyPrevLex);

        /*
          If getToken() meets another #, it will call
          getTokenAfterPreprocessor() once again, which could in turn call
          getToken() again, etc. Unless there are 10,000 or so preprocessor
          directives in a row, this shouldn't overflow the stack.
        */
        tok = getToken();
    } while (yyNumPreprocessorSkipping > 0);
    return tok;
}

/*
  Pushes a new skipping value onto the stack. This corresponds to entering a
  new #if block.
*/
void Tokenizer::pushSkipping(bool skip)
{
    yyPreprocessorSkipping.push(skip);
    if (skip)
        yyNumPreprocessorSkipping++;
}

/*
  Pops a skipping value from the stack. This corresponds to reaching a #endif.
*/
bool Tokenizer::popSkipping()
{
    if (yyPreprocessorSkipping.isEmpty()) {
        yyTokLoc.warning(tr("Unexpected #elif, #else or #endif"));
        return true;
    }

    bool skip = yyPreprocessorSkipping.pop();
    if (skip)
        yyNumPreprocessorSkipping--;
    return skip;
}
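/*
  A few illustrative evaluations of isTrue() (below), assuming that the
  "defines" pattern matches the hypothetical symbol FOO and the
  "falsehoods" pattern matches the hypothetical symbol BAR:

      isTrue("defined(FOO)")          ->  true
      isTrue("!defined(FOO)")         ->  false
      isTrue("BAR")                   ->  false
      isTrue("BAZ")                   ->  true   (unknown symbols count as true)
      isTrue("defined(FOO) && !BAR")  ->  true
*/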
/*
  Returns true if the condition evaluates as true, otherwise false. The
  condition is represented by a string. Unsophisticated parsing techniques are
  used. The preprocessing method could be named StriNg-Oriented PreProcessing,
  as SNOBOL stands for StriNg-Oriented symBOlic Language.
*/
bool Tokenizer::isTrue(const QString &condition)
{
    int firstOr = -1;
    int firstAnd = -1;
    int parenDepth = 0;

    /*
      Find the first logical operator at top level, but be careful
      about precedence. Examples:

          X || Y          // the or
          X || Y || Z     // the leftmost or
          X || Y && Z     // the or
          X && Y || Z     // the or
          (X || Y) && Z   // the and
    */
    for (int i = 0; i < (int) condition.length() - 1; i++) {
        QChar ch = condition[i];
        if (ch == QChar('(')) {
            parenDepth++;
        } else if (ch == QChar(')')) {
            parenDepth--;
        } else if (parenDepth == 0) {
            if (condition[i + 1] == ch) {
                if (ch == QChar('|')) {
                    firstOr = i;
                    break;
                } else if (ch == QChar('&')) {
                    if (firstAnd == -1)
                        firstAnd = i;
                }
            }
        }
    }
    if (firstOr != -1)
        return isTrue(condition.left(firstOr)) || isTrue(condition.mid(firstOr + 2));
    if (firstAnd != -1)
        return isTrue(condition.left(firstAnd)) && isTrue(condition.mid(firstAnd + 2));

    QString t = condition.simplified();
    if (t.isEmpty())
        return true;

    if (t[0] == QChar('!'))
        return !isTrue(t.mid(1));
    if (t[0] == QChar('(') && t.right(1)[0] == QChar(')'))
        return isTrue(t.mid(1, t.length() - 2));

    if (definedX->exactMatch(t))
        return defines->exactMatch(definedX->cap(1));
    else
        return !falsehoods->exactMatch(t);
}

QString Tokenizer::lexeme() const
{
    return sourceCodec->toUnicode(yyLex);
}

QString Tokenizer::previousLexeme() const
{
    return sourceCodec->toUnicode(yyPrevLex);
}

QT_END_NAMESPACE