diff options
Diffstat (limited to 'util/lexgen')
52 files changed, 3425 insertions, 0 deletions
diff --git a/util/lexgen/README b/util/lexgen/README new file mode 100644 index 0000000..b8e9277 --- /dev/null +++ b/util/lexgen/README @@ -0,0 +1,16 @@ +Lexgen +------ + +This is a little tool to generate lexical scanners from a rather simplistic +configuration file. We use it internally in Qt to generate the scanner for the +CSS parser that is built into the toolkit (used for the widget styling and the +HTML import into QTextDocument). + +Beware, it's very slow (in generating the code) and it may not generate what +you want. But I like that it generates code that operates on QChar and friends. + +Use at your own risk ;-) + + +-- +Simon Hausmann <simon@trolltech.com> diff --git a/util/lexgen/configfile.cpp b/util/lexgen/configfile.cpp new file mode 100644 index 0000000..8d77689 --- /dev/null +++ b/util/lexgen/configfile.cpp @@ -0,0 +1,99 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** Contact: Qt Software Information (qt-info@nokia.com) +** +** This file is part of the utils of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the either Technology Preview License Agreement or the +** Beta Release License Agreement. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain +** additional rights. These rights are described in the Nokia Qt LGPL +** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this +** package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. +** +** If you are unsure which license is appropriate for your use, please +** contact the sales department at qt-sales@nokia.com. +** $QT_END_LICENSE$ +** +****************************************************************************/ +#include "configfile.h" + +#include <QFile> + +ConfigFile::SectionMap ConfigFile::parse(const QString &fileName) +{ + QFile f(fileName); + if (!f.open(QIODevice::ReadOnly)) + return ConfigFile::SectionMap(); + return parse(&f); +} + +ConfigFile::SectionMap ConfigFile::parse(QIODevice *dev) +{ + SectionMap sections; + SectionMap::Iterator currentSection = sections.end(); + + ConfigFile::SectionMap result; + int currentLineNumber = 0; + while (!dev->atEnd()) { + QString line = QString::fromUtf8(dev->readLine()).trimmed(); + ++currentLineNumber; + + if (line.isEmpty() || line.startsWith(QLatin1Char('#'))) + continue; + + if (line.startsWith(QLatin1Char('['))) { + if (!line.endsWith(']')) { + qWarning("Syntax error at line %d: Missing ']' at start of new section.", currentLineNumber); + return SectionMap(); + } + line.remove(0, 1); + line.chop(1); + const QString sectionName = line; + currentSection = sections.insert(sectionName, Section()); + continue; + } + + if (currentSection == sections.end()) { + qWarning("Syntax error at line %d: Entry found outside of any section.", currentLineNumber); + return SectionMap(); + } + + Entry e; + e.lineNumber = currentLineNumber; + + int equalPos = line.indexOf(QLatin1Char('=')); + if (equalPos == -1) { + e.key = line; + } else { + e.key = line; + e.key.truncate(equalPos); + e.key = e.key.trimmed(); + e.value = line.mid(equalPos + 1).trimmed(); + } + currentSection->append(e); + } + return sections; +} diff --git a/util/lexgen/configfile.h b/util/lexgen/configfile.h new file mode 100644 index 0000000..7bf3389 --- /dev/null +++ b/util/lexgen/configfile.h @@ -0,0 +1,81 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** Contact: Qt Software Information (qt-info@nokia.com) +** +** This file is part of the utils of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the either Technology Preview License Agreement or the +** Beta Release License Agreement. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain +** additional rights. These rights are described in the Nokia Qt LGPL +** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this +** package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. +** +** If you are unsure which license is appropriate for your use, please +** contact the sales department at qt-sales@nokia.com. +** $QT_END_LICENSE$ +** +****************************************************************************/ +#ifndef CONFIGFILE_H +#define CONFIGFILE_H + +#include <QStringList> +#include <QMap> +#include <QVector> + +struct ConfigFile +{ + struct Entry + { + inline Entry() : lineNumber(-1) {} + int lineNumber; + QString key; + QString value; + }; + struct Section : public QVector<Entry> + { + inline bool contains(const QString &key) const + { + for (int i = 0; i < count(); ++i) + if (at(i).key == key) + return true; + return false; + } + inline QString value(const QString &key, const QString &defaultValue = QString()) const + { + for (int i = 0; i < count(); ++i) + if (at(i).key == key) + return at(i).value; + return defaultValue; + } + }; + typedef QMap<QString, Section> SectionMap; + + static SectionMap parse(const QString &fileName); + static SectionMap parse(QIODevice *dev); +}; + +#endif // CONFIGFILE_H + diff --git a/util/lexgen/css2-simplified.lexgen b/util/lexgen/css2-simplified.lexgen new file mode 100644 index 0000000..3976632 --- /dev/null +++ b/util/lexgen/css2-simplified.lexgen @@ -0,0 +1,93 @@ +[Options] +case-insensitive +classname = QCssScanner_Generated + +[Code Generator Options] +MapToCode[a-z] = (ch.unicode() >= 'a' && ch.unicode() <= 'z') || ch.unicode() >= 256 +TokenPrefix = QCss:: +FileHeader = ../moc/licenseheader.txt + +[Macros] +escape = \\[^\r\n\f0-9a-f] +nmstart = [_a-z]|{escape} +nmchar = [_a-z0-9-]|{escape} +nl = \n|\r\n|\r|\f +string1 = \"([^\n\r\f\\"]|\\{nl}|{escape})*\" +string2 = \'([^\n\r\f\\']|\\{nl}|{escape})*\' +invalid1 = \"([^\n\r\f\\"]|\\{nl}|{escape})* +invalid2 = \'([^\n\r\f\\']|\\{nl}|{escape})* + +ident = -?{nmstart}{nmchar}* +name = {nmchar}+ +num = [0-9]+|[0-9]*"."[0-9]+ +string = {string1}|{string2} +invalid = {invalid1}|{invalid2} +url = ([!#$%&*-~]|{escape})* +s = [ \t\r\n\f] +w = {s}* + +[Tokens] + +S = {s}+ + +handleCommentStart() = \/\* + +CDO = "<!--" +CDC = "-->" +INCLUDES = "~=" +DASHMATCH = "|=" + +LBRACE = {w}"{" +PLUS = {w}"+" +GREATER = {w}">" +COMMA = {w}"," + +STRING = {string} +INVALID = {invalid} + +IDENT = {ident} + +HASH = "#"{name} + +ATKEYWORD_SYM = "@"{ident} + +EXCLAMATION_SYM = "!" + +#EMS = {num}em +#EXS = {num}ex +#LENGTH = {num}px +#LENGTH = {num}cm +#LENGTH = {num}mm +#LENGTH = {num}in +#LENGTH = {num}pt +#LENGTH = {num}pc +#ANGLE = {num}deg +#ANGLE = {num}rad +#ANGLE = {num}grad +#TIME = {num}ms +#TIME = {num}s +#FREQ = {num}hz +#FREQ = {num}khz +#DIMENSION = {num}{ident} +LENGTH = {num}{ident} + +PERCENTAGE = {num}% +NUMBER = {num} + +#URI = "url("{w}{string}{w}")" +#URI = "url("{w}{url}{w}")" +FUNCTION = {ident}"(" + +COLON = : +SEMICOLON = ; +RBRACE = \} +SLASH = / +MINUS = - +DOT = \. +STAR = \* +LBRACKET = \[ +RBRACKET = \] +EQUAL = \= +LPAREN = \( +RPAREN = \) +OR = \| diff --git a/util/lexgen/generator.cpp b/util/lexgen/generator.cpp new file mode 100644 index 0000000..a3c63f2 --- /dev/null +++ b/util/lexgen/generator.cpp @@ -0,0 +1,532 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** Contact: Qt Software Information (qt-info@nokia.com) +** +** This file is part of the utils of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the either Technology Preview License Agreement or the +** Beta Release License Agreement. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain +** additional rights. These rights are described in the Nokia Qt LGPL +** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this +** package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. +** +** If you are unsure which license is appropriate for your use, please +** contact the sales department at qt-sales@nokia.com. +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include "generator.h" + +#include <QFile> + +void Function::printDeclaration(CodeBlock &block, const QString &funcNamePrefix) const +{ + block << (iline ? "inline " : "") << signature(funcNamePrefix) << (iline ? QLatin1String(" {") : QLatin1String(";")); + if (!iline) + return; + + block.indent(); + QString tmp = body; + if (tmp.endsWith(QLatin1Char('\n'))) + tmp.chop(1); + foreach (QString line, tmp.split(QLatin1Char('\n'))) + block << line; + block.outdent(); + block << "}"; +} + +QString Function::signature(const QString &funcNamePrefix) const +{ + QString sig; + if (!rtype.isEmpty()) { + sig += rtype; + sig += QLatin1Char(' '); + } + sig += funcNamePrefix; + sig += fname; + if (cnst) + sig += " const"; + return sig; +} + +QString Function::definition() const +{ + if (iline) + return QString(); + + QString result; + result += signature(); + result += QLatin1String("\n{\n"); + + QString tmp = body; + + if (tmp.endsWith(QLatin1Char('\n'))) + tmp.chop(1); + if (!tmp.startsWith(QLatin1Char('\n'))) + tmp.prepend(" "); + + tmp.replace(QLatin1Char('\n'), QLatin1String("\n ")); + + result += tmp; + + result += QLatin1String("\n}\n"); + + return result; +} + +void Class::Section::printDeclaration(const Class *klass, CodeBlock &block) const +{ + foreach (Function ctor, constructors) + ctor.printDeclaration(block, klass->name()); + + if (!constructors.isEmpty()) + block.addNewLine(); + + foreach (Function func, functions) + func.printDeclaration(block); + + if (!functions.isEmpty()) + block.addNewLine(); + + foreach (QString var, variables) + block << var << ';'; +} + +void Class::addConstructor(Access access, const QString &body, const QString &_args) +{ + Function ctor; + QString args = _args; + if (!args.startsWith(QLatin1Char('(')) + && !args.endsWith(QLatin1Char(')'))) { + args.prepend('('); + args.append(')'); + } + ctor.setName(args); + ctor.addBody(body); + sections[access].constructors.append(ctor); +} + +QString Class::Section::definition(const Class *klass) const +{ + QString result; + + foreach (Function ctor, constructors) { + ctor.setName(klass->name() + "::" + klass->name() + ctor.name()); + result += ctor.definition(); + result += QLatin1Char('\n'); + } + + foreach (Function func, functions) { + if (!func.hasBody()) continue; + func.setName(klass->name() + "::" + func.name()); + result += func.definition(); + result += QLatin1Char('\n'); + } + + return result; +} + +QString Class::declaration() const +{ + CodeBlock block; + + block << QLatin1String("class ") << cname; + block << "{"; + + if (!sections[PublicMember].isEmpty()) { + block << "public:"; + block.indent(); + sections[PublicMember].printDeclaration(this, block); + block.outdent(); + } + + if (!sections[ProtectedMember].isEmpty()) { + block << "protected:"; + block.indent(); + sections[ProtectedMember].printDeclaration(this, block); + block.outdent(); + } + + if (!sections[PrivateMember].isEmpty()) { + block << "private:"; + block.indent(); + sections[PrivateMember].printDeclaration(this, block); + block.outdent(); + } + + block << "};"; + block.addNewLine(); + + return block.toString(); +} + +QString Class::definition() const +{ + return sections[PrivateMember].definition(this) + + sections[ProtectedMember].definition(this) + + sections[PublicMember].definition(this); +} + +Generator::Generator(const DFA &_dfa, const Config &config) + : dfa(_dfa), cfg(config) +{ + QList<InputType> lst = cfg.maxInputSet.toList(); + qSort(lst); + minInput = lst.first(); + maxInput = lst.last(); + + ConfigFile::Section section = config.configSections.value("Code Generator Options"); + + foreach (ConfigFile::Entry entry, section) { + if (!entry.key.startsWith(QLatin1String("MapToCode[")) + || !entry.key.endsWith(QLatin1Char(']'))) + continue; + QString range = entry.key; + range.remove(0, qstrlen("MapToCode[")); + range.chop(1); + if (range.length() != 3 + || range.at(1) != QLatin1Char('-')) { + qWarning("Invalid range for char mapping function: %s", qPrintable(range)); + continue; + } + TransitionSequence seq; + seq.first = range.at(0).unicode(); + seq.last = range.at(2).unicode(); + seq.testFunction = entry.value; + charFunctionRanges.append(seq); + } + + QString tokenPrefix = section.value("TokenPrefix"); + if (!tokenPrefix.isEmpty()) { + for (int i = 0; i < dfa.count(); ++i) + if (!dfa.at(i).symbol.isEmpty() + && !dfa.at(i).symbol.endsWith(QLatin1String("()"))) + dfa[i].symbol.prepend(tokenPrefix); + } + + headerFileName = section.value("FileHeader"); +} + +static inline bool adjacentKeys(int left, int right) { return left + 1 == right; } +//static inline bool adjacentKeys(const InputType &left, const InputType &right) +//{ return left.val + 1 == right.val; } + +static QVector<Generator::TransitionSequence> convertToSequences(const TransitionMap &transitions) +{ + QVector<Generator::TransitionSequence> sequences; + if (transitions.isEmpty()) + return sequences; + + QList<InputType> keys = transitions.keys(); + qSort(keys); + int i = 0; + Generator::TransitionSequence sequence; + sequence.first = keys.at(0); + ++i; + for (; i < keys.count(); ++i) { + if (adjacentKeys(keys.at(i - 1), keys.at(i)) + && transitions.value(keys.at(i)) == transitions.value(keys.at(i - 1))) { + continue; + } + sequence.last = keys.at(i - 1); + sequence.transition = transitions.value(sequence.last); + sequences.append(sequence); + + sequence.first = keys.at(i); + } + sequence.last = keys.at(i - 1); + sequence.transition = transitions.value(sequence.last); + sequences.append(sequence); + + return sequences; +} + +QDebug &operator<<(QDebug &debug, const Generator::TransitionSequence &seq) +{ + return debug << "[first:" << seq.first << "; last:" << seq.last << "; transition:" << seq.transition + << (seq.testFunction.isEmpty() ? QString() : QString(QString("; testfunction:" + seq.testFunction))) + << "]"; +} + +bool Generator::isSingleReferencedFinalState(int i) const +{ + return backReferenceMap.value(i) == 1 + && dfa.at(i).transitions.isEmpty() + && !dfa.at(i).symbol.isEmpty(); +} + +void Generator::generateTransitions(CodeBlock &body, const TransitionMap &transitions) +{ + if (transitions.isEmpty()) + return; + + QVector<TransitionSequence> sequences = convertToSequences(transitions); + + bool needsCharFunction = false; + if (!charFunctionRanges.isEmpty()) { + int i = 0; + while (i < sequences.count()) { + const TransitionSequence &seq = sequences.at(i); + if (!seq.testFunction.isEmpty()) { + ++i; + continue; + } + + foreach (TransitionSequence range, charFunctionRanges) + if (range.first >= seq.first && range.last <= seq.last) { + needsCharFunction = true; + + TransitionSequence left, middle, right; + + left.first = seq.first; + left.last = range.first - 1; + left.transition = seq.transition; + + middle = range; + middle.transition = seq.transition; + + right.first = range.last + 1; + right.last = seq.last; + right.transition = seq.transition; + + sequences.remove(i); + if (left.last >= left.first) { + sequences.insert(i, left); + ++i; + } + sequences.insert(i, middle); + ++i; + if (right.last >= right.first) { + sequences.insert(i, right); + ++i; + } + + i = -1; + break; + } + + ++i; + } + } + + //qDebug() << "sequence count" << sequences.count(); + //qDebug() << sequences; + + if (sequences.count() < 10 + || sequences.last().last == maxInput + || needsCharFunction) { + foreach (TransitionSequence seq, sequences) { + const bool embedFinalState = isSingleReferencedFinalState(seq.transition); + + QString brace; + if (embedFinalState) + brace = " {"; + + if (!seq.testFunction.isEmpty()) { + body << "if (" << seq.testFunction << ")" << brace; + } else if (seq.first == seq.last) { + body << "if (ch.unicode() == " << seq.first << ")" << brace; + } else { + if (seq.last < maxInput) + body << "if (ch.unicode() >= " << seq.first + << " && ch.unicode() <= " << seq.last << ")" << brace; + else + body << "if (ch.unicode() >= " << seq.first << ")" << brace; + } + body.indent(); + if (embedFinalState) { + body << "token = " << dfa.at(seq.transition).symbol << ";"; + body << "goto found;"; + + body.outdent(); + body << "}"; + } else { + body << "goto state_" << seq.transition << ";"; + body.outdent(); + } + } + } else { + QList<InputType> keys = transitions.keys(); + qSort(keys); + + body << "switch (ch.unicode()) {"; + body.indent(); + for (int k = 0; k < keys.count(); ++k) { + const InputType key = keys.at(k); + const int trans = transitions.value(key); + + QString keyStr; + if (key == '\\') + keyStr = QString("\'\\\\\'"); + else if (key >= 48 && key < 127) + keyStr = QString('\'') + QChar::fromLatin1(char(key)) + QChar('\''); + else + keyStr = QString::number(key); + + if (k < keys.count() - 1 + && transitions.value(keys.at(k + 1)) == trans) { + body << "case " << keyStr << ":"; + } else { + if (isSingleReferencedFinalState(trans)) { + body << "case " << keyStr << ": token = " << dfa.at(trans).symbol << "; goto found;"; + } else { + body << "case " << keyStr << ": goto state_" << trans << ";"; + } + } + } + body.outdent(); + body << "}"; + } +} + +QString Generator::generate() +{ + Class klass(cfg.className); + + klass.addMember(Class::PublicMember, "QString input"); + klass.addMember(Class::PublicMember, "int pos"); + klass.addMember(Class::PublicMember, "int lexemStart"); + klass.addMember(Class::PublicMember, "int lexemLength"); + + { + CodeBlock body; + body << "input = inp;"; + body << "pos = 0;"; + body << "lexemStart = 0;"; + body << "lexemLength = 0;"; + klass.addConstructor(Class::PublicMember, body, "const QString &inp"); + } + + { + Function next("QChar", "next()"); + next.setInline(true); + if (cfg.caseSensitivity == Qt::CaseSensitive) + next.addBody("return (pos < input.length()) ? input.at(pos++) : QChar();"); + else + next.addBody("return (pos < input.length()) ? input.at(pos++).toLower() : QChar();"); + klass.addMember(Class::PublicMember, next); + } + + /* + { + Function lexem("QString", "lexem()"); + lexem.setConst(true); + lexem.setInline(true); + lexem.addBody("return input.mid(lexemStart, lexemLength);"); + klass.addMember(Class::PublicMember, lexem); + } + */ + + for (int i = 0; i < dfa.count(); ++i) + if (dfa.at(i).symbol.endsWith(QLatin1String("()"))) { + Function handlerFunc("int", dfa.at(i).symbol); + klass.addMember(Class::PublicMember, handlerFunc); + } + + Function lexFunc; + lexFunc.setReturnType("int"); + lexFunc.setName("lex()"); + + CodeBlock body; + body << "lexemStart = pos;"; + body << "lexemLength = 0;"; + body << "int lastAcceptingPos = -1;"; + body << "int token = -1;"; + body << "QChar ch;"; + body.addNewLine(); + + backReferenceMap.clear(); + foreach (State s, dfa) + foreach (int state, s.transitions) + backReferenceMap[state]++; + + bool haveSingleReferencedFinalState = false; + + for (int i = 0; i < dfa.count(); ++i) { + if (isSingleReferencedFinalState(i)) { + haveSingleReferencedFinalState = true; + continue; + } + + if (i > 0) + body << "state_" << i << ":"; + else + body << "// initial state"; + + body.indent(); + + if (!dfa.at(i).symbol.isEmpty()) { + body << "lastAcceptingPos = pos;"; + body << "token = " << dfa.at(i).symbol << ";"; + } + + body.outdent(); + + body.indent(); + + if (!dfa.at(i).transitions.isEmpty()) { + body << "ch = next();"; + generateTransitions(body, dfa.at(i).transitions); + } + + body << "goto out;"; + + body.outdent(); + } + + if (haveSingleReferencedFinalState) { + body << "found:"; + body << "lastAcceptingPos = pos;"; + body.addNewLine(); + } + + body << "out:"; + body << "if (lastAcceptingPos != -1) {"; + body.indent(); + body << "lexemLength = lastAcceptingPos - lexemStart;"; + body << "pos = lastAcceptingPos;"; + body.outdent(); + body << "}"; + body << "return token;"; + + lexFunc.addBody(body); + + klass.addMember(Class::PublicMember, lexFunc); + + QString header; + QFile headerFile(headerFileName); + if (!headerFileName.isEmpty() + && headerFile.exists() + && headerFile.open(QIODevice::ReadOnly)) { + header = QString::fromUtf8(headerFile.readAll()); + } + + header += QLatin1String("// auto generated. DO NOT EDIT.\n"); + + return header + klass.declaration() + klass.definition(); +} + diff --git a/util/lexgen/generator.h b/util/lexgen/generator.h new file mode 100644 index 0000000..047378b --- /dev/null +++ b/util/lexgen/generator.h @@ -0,0 +1,221 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** Contact: Qt Software Information (qt-info@nokia.com) +** +** This file is part of the utils of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the either Technology Preview License Agreement or the +** Beta Release License Agreement. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain +** additional rights. These rights are described in the Nokia Qt LGPL +** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this +** package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. +** +** If you are unsure which license is appropriate for your use, please +** contact the sales department at qt-sales@nokia.com. +** $QT_END_LICENSE$ +** +****************************************************************************/ +#ifndef GENERATOR_H +#define GENERATOR_H + +#include <QTextStream> +#include <QStringList> + +#include "nfa.h" + +class LineStream +{ +private: + struct SharedStream + { + int ref; + QTextStream *stream; + }; + +public: + LineStream(QTextStream *textStream) + { + shared = new SharedStream; + shared->ref = 1; + shared->stream = textStream; + } + LineStream(const LineStream &other) + { + shared = other.shared; + shared->ref++; + } + LineStream &operator=(const LineStream &other) + { + if (this == &other) + return *this; + LineStream copy(other); // keep refcount up + qSwap(*shared, *other.shared); + return *this; + } + ~LineStream() + { + if (!--shared->ref) { + (*shared->stream) << endl; + delete shared; + } + } + + template <typename T> + LineStream &operator<<(const T &value) + { (*shared->stream) << value; return *this; } + + SharedStream *shared; +}; + +class CodeBlock +{ +public: + inline CodeBlock() { stream.setString(&output, QIODevice::WriteOnly); } + + inline void indent() { indentStr += QLatin1String(" "); } + inline void outdent() { indentStr.remove(0, 4); } + + template <typename T> + LineStream operator<<(const T &value) + { stream << indentStr; stream << value; return LineStream(&stream); } + + inline void addNewLine() { stream << endl; } + + inline QString toString() const { stream.flush(); return output; } + +private: + QString output; + mutable QTextStream stream; + QString indentStr; +}; + +class Function +{ +public: + inline Function(const QString &returnType, const QString &name) + : rtype(returnType), fname(name), iline(false), cnst(false) {} + inline Function() : iline(false), cnst(false) {} + + inline void setName(const QString &name) { fname = name; } + inline QString name() const { return fname; } + + inline void setInline(bool i) { iline = i; } + inline bool isInline() const { return iline; } + + inline void setReturnType(const QString &type) { rtype = type; } + inline QString returnType() const { return rtype; } + + inline void addBody(const QString &_body) { body += _body; } + inline void addBody(const CodeBlock &block) { body += block.toString(); } + inline bool hasBody() const { return !body.isEmpty(); } + + inline void setConst(bool konst) { cnst = konst; } + inline bool isConst() const { return cnst; } + + void printDeclaration(CodeBlock &block, const QString &funcNamePrefix = QString()) const; + QString definition() const; + +private: + QString signature(const QString &funcNamePrefix = QString()) const; + + QString rtype; + QString fname; + QString body; + bool iline; + bool cnst; +}; + +class Class +{ +public: + enum Access { PublicMember, ProtectedMember, PrivateMember }; + + inline Class(const QString &name) : cname(name) {} + + inline void setName(const QString &name) { cname = name; } + inline QString name() const { return cname; } + + inline void addMember(Access access, const QString &name) + { sections[access].variables.append(name); } + inline void addMember(Access access, const Function &func) + { sections[access].functions.append(func); } + + void addConstructor(Access access, const QString &body, const QString &args = QString()); + inline void addConstructor(Access access, const CodeBlock &body, const QString &args = QString()) + { addConstructor(access, body.toString(), args); } + + QString declaration() const; + QString definition() const; + +private: + QString cname; + struct Section + { + QVector<Function> functions; + QStringList variables; + QVector<Function> constructors; + + inline bool isEmpty() const + { return functions.isEmpty() && variables.isEmpty() && constructors.isEmpty(); } + + void printDeclaration(const Class *klass, CodeBlock &block) const; + QString definition(const Class *klass) const; + }; + + Section sections[3]; +}; + +class Generator +{ +public: + Generator(const DFA &dfa, const Config &config); + + QString generate(); + +private: + void generateTransitions(CodeBlock &body, const TransitionMap &transitions); + bool isSingleReferencedFinalState(int i) const; + + DFA dfa; + Config cfg; + InputType minInput; + InputType maxInput; + QHash<int, int> backReferenceMap; + QString headerFileName; +public: + struct TransitionSequence + { + inline TransitionSequence() : first(-1), last(-1), transition(-1) {} + InputType first; + InputType last; + int transition; + QString testFunction; + }; +private: + QVector<TransitionSequence> charFunctionRanges; +}; + +#endif diff --git a/util/lexgen/global.h b/util/lexgen/global.h new file mode 100644 index 0000000..01f915d --- /dev/null +++ b/util/lexgen/global.h @@ -0,0 +1,113 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** Contact: Qt Software Information (qt-info@nokia.com) +** +** This file is part of the utils of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the either Technology Preview License Agreement or the +** Beta Release License Agreement. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain +** additional rights. These rights are described in the Nokia Qt LGPL +** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this +** package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. +** +** If you are unsure which license is appropriate for your use, please +** contact the sales department at qt-sales@nokia.com. +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#ifndef GLOBAL_H +#define GLOBAL_H + +#include <QHash> +#include <QDataStream> +#include <QSet> + +#include "configfile.h" + +#if 1 +typedef int InputType; + +enum SpecialInputType { + DigitInput, + SpaceInput, + Letter +}; + +#else + +enum SpecialInputType { + NoSpecialInput = 0, + DigitInput, + SpaceInput, + LetterOrNumberInput +}; + +struct InputType +{ + inline InputType() : val(0), specialInput(NoSpecialInput) {} + inline InputType(const int &val) : val(val), specialInput(NoSpecialInput) {} + + inline operator int() const { return val; } + + inline bool operator==(const InputType &other) const + { return val == other.val; } + inline bool operator!=(const InputType &other) const + { return val != other.val; } + + int val; + SpecialInputType specialInput; +}; + +inline int qHash(const InputType &t) { return qHash(t.val); } + +inline QDataStream &operator<<(QDataStream &stream, const InputType &i) +{ + return stream << i; +} + +inline QDataStream &operator>>(QDataStream &stream, InputType &i) +{ + return stream >> i; +} + +#endif + +const InputType Epsilon = -1; + +struct Config +{ + inline Config() : caseSensitivity(Qt::CaseSensitive), debug(false), cache(false) {} + QSet<InputType> maxInputSet; + Qt::CaseSensitivity caseSensitivity; + QString className; + bool debug; + bool cache; + QString ruleFile; + ConfigFile::SectionMap configSections; +}; + +#endif // GLOBAL_H diff --git a/util/lexgen/lexgen.lexgen b/util/lexgen/lexgen.lexgen new file mode 100644 index 0000000..33efb8b --- /dev/null +++ b/util/lexgen/lexgen.lexgen @@ -0,0 +1,24 @@ +[Options] +case-sensitive +classname = RegExpTokenizer + +[Code Generator Options] +TokenPrefix = RE2NFA:: + +[Macros] +Escape = \\.{1} + +[Tokens] +TOK_QUOTED_STRING = \"[^\"]*\" +TOK_STRING = [^\{\}\(\)\,\*\|\?\.\+\[]|{Escape} +TOK_SEQUENCE = \[([^\]]|(\\\]))*\] +TOK_LBRACE = \{ +TOK_RBRACE = \} +TOK_LPAREN = \( +TOK_RPAREN = \) +TOK_COMMA = \, +TOK_STAR = \* +TOK_OR = \| +TOK_QUESTION = \? +TOK_DOT = \. +TOK_PLUS = \+ diff --git a/util/lexgen/lexgen.pri b/util/lexgen/lexgen.pri new file mode 100644 index 0000000..b36e00e --- /dev/null +++ b/util/lexgen/lexgen.pri @@ -0,0 +1,3 @@ +VPATH += $$PWD +SOURCES += nfa.cpp configfile.cpp re2nfa.cpp +INCLUDEPATH += $$PWD diff --git a/util/lexgen/lexgen.pro b/util/lexgen/lexgen.pro new file mode 100644 index 0000000..89ac84d --- /dev/null +++ b/util/lexgen/lexgen.pro @@ -0,0 +1,6 @@ +TEMPLATE = app +TARGET = lexgen +include(lexgen.pri) +SOURCES += main.cpp \ + generator.cpp +QT = core diff --git a/util/lexgen/main.cpp b/util/lexgen/main.cpp new file mode 100644 index 0000000..b7b529a --- /dev/null +++ b/util/lexgen/main.cpp @@ -0,0 +1,323 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** Contact: Qt Software Information (qt-info@nokia.com) +** +** This file is part of the utils of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the either Technology Preview License Agreement or the +** Beta Release License Agreement. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain +** additional rights. These rights are described in the Nokia Qt LGPL +** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this +** package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. +** +** If you are unsure which license is appropriate for your use, please +** contact the sales department at qt-sales@nokia.com. +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include "nfa.h" +#include "re2nfa.h" +#include "configfile.h" +#include "generator.h" + +#include <QFile> +#include <QCoreApplication> +#include <QFileInfo> +#include <QDateTime> + +struct Symbol +{ + QString token; + QString lexem; +}; + +static QList<Symbol> tokenize(const DFA &dfa, const QString &input, Config *cfg, bool *ok = 0) +{ + QList<Symbol> symbols; + Symbol lastSymbol; + int state = 0; + int lastAcceptingState = -1; + QString lastAcceptingLexem; + int lastAcceptingPos = -1; + for (int i = 0; i < input.length(); ++i) { + QChar ch = input.at(i); + QChar chForInput = ch; + if (cfg->caseSensitivity == Qt::CaseInsensitive) + chForInput = chForInput.toLower(); + int next = dfa.at(state).transitions.value(chForInput.unicode()); + if (cfg->debug) + qDebug() << "input" << input.at(i) << "leads to state" << next; + if (next) { + lastSymbol.lexem.append(input.at(i)); + lastSymbol.token = dfa.at(next).symbol; + if (!lastSymbol.token.isEmpty()) { + lastAcceptingState = next; + lastAcceptingLexem = lastSymbol.lexem; + lastAcceptingPos = i; + } + state = next; + } else { + if (lastAcceptingState != -1) { + if (cfg->debug) + qDebug() << "adding" << dfa.at(lastAcceptingState).symbol << "and backtracking to" << lastAcceptingPos; + Symbol s; + s.token = dfa.at(lastAcceptingState).symbol; + s.lexem = lastAcceptingLexem; + symbols << s; + lastSymbol = Symbol(); + state = 0; + i = lastAcceptingPos; + lastAcceptingPos = -1; + lastAcceptingState = -1; + continue; + } + if (state == 0 || lastSymbol.token.isEmpty()) { + if (cfg->debug) + qDebug() << "invalid input"; + if (ok) + *ok = false; + return symbols; + } + if (cfg->debug) + qDebug() << "appending symbol with token" << lastSymbol.token; + symbols << lastSymbol; + lastSymbol = Symbol(); + state = 0; + lastAcceptingState = -1; + --i; + } + } + if (!lastSymbol.token.isEmpty()) { + if (cfg->debug) + qDebug() << "appending (last) symbol with token" << lastSymbol.token; + symbols << lastSymbol; + } else if (lastAcceptingState != -1) { + if (cfg->debug) + qDebug() << "appending last accepting state with token" << dfa.at(lastAcceptingState).symbol; + Symbol s; + s.lexem = lastAcceptingLexem; + s.token = dfa.at(lastAcceptingState).symbol; + symbols << s; + } + if (ok) + *ok = true; + return symbols; +} + +static QSet<InputType> determineMaxInputSet(const ConfigFile::Section §ion) +{ + QSet<InputType> set; + + QString inputTypeName; + + foreach (const ConfigFile::Entry &entry, section) + if (entry.key == QLatin1String("InputType")) { + if (!inputTypeName.isEmpty()) { + qWarning("Error: InputType field specified multiple times in config file"); + return QSet<InputType>(); + } + inputTypeName = entry.value; + } + + if (inputTypeName.isEmpty()) + inputTypeName = "quint8"; + + if (inputTypeName == "quint8") { + for (int i = 1; i < 256; ++i) + set.insert(i); + } /* else if ### */ + else { + qWarning("Error: Unknown input type '%s'", qPrintable(inputTypeName)); + return QSet<InputType>(); + } + + return set; +} + +static bool loadConfig(const QString &ruleFile, Config *cfg) +{ + ConfigFile::SectionMap sections = ConfigFile::parse(ruleFile); + if (sections.isEmpty()) { + qWarning("Error parsing %s", qPrintable(ruleFile)); + return false; + } + + QSet<InputType> maxInputSet = determineMaxInputSet(sections.value("Options")); + if (maxInputSet.isEmpty()) + return false; + + Qt::CaseSensitivity cs = Qt::CaseInsensitive; + if (sections.value("Options").contains("case-sensitive")) + cs = Qt::CaseSensitive; + + cfg->configSections = sections; + cfg->caseSensitivity = cs; + cfg->className = sections.value("Options").value("classname", "Scanner"); + cfg->maxInputSet = maxInputSet; + cfg->ruleFile = ruleFile; + return true; +} + +static DFA generateMachine(const Config &cfg) +{ + if (cfg.cache) { + QFileInfo ruleInfo(cfg.ruleFile); + QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa"); + if (cacheInfo.exists() + && cacheInfo.lastModified() > ruleInfo.lastModified()) { + QFile f(cacheInfo.absoluteFilePath()); + f.open(QIODevice::ReadOnly); + QDataStream stream(&f); + DFA machine; + stream >> machine; + return machine; + } + } + + QMap<QString, NFA> macros; + foreach (ConfigFile::Entry e, cfg.configSections.value("Macros")) { + int errCol = 0; + if (cfg.debug) + qDebug() << "parsing" << e.value; + NFA nfa = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol); + if (nfa.isEmpty()) { + qWarning("Parse error in line %d column %d", e.lineNumber, errCol); + return DFA(); + } + macros.insert(e.key, nfa); + } + + if (!cfg.configSections.contains("Tokens")) { + qWarning("Rule file does not contain a [Tokens] section!"); + return DFA(); + } + + QVector<NFA> tokens; + + foreach (ConfigFile::Entry e, cfg.configSections.value("Tokens")) { + int errCol = 0; + if (cfg.debug) + qDebug() << "parsing" << e.value; + NFA tok = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol); + if (tok.isEmpty()) { + qWarning("Parse error in line %d column %d while parsing token %s", e.lineNumber, errCol, e.key.toLocal8Bit().constData()); + return DFA(); + } + tok.setTerminationSymbol(e.key); + tokens.append(tok); + } + + NFA giganticStateMachine; + foreach (NFA nfa, tokens) + if (giganticStateMachine.isEmpty()) + giganticStateMachine = nfa; + else + giganticStateMachine = NFA::createAlternatingNFA(giganticStateMachine, nfa); + + DFA result = giganticStateMachine.toDFA().minimize(); + if (cfg.cache) { + QFileInfo ruleInfo(cfg.ruleFile); + QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa"); + QFile f(cacheInfo.absoluteFilePath()); + f.open(QIODevice::WriteOnly | QIODevice::Truncate); + QDataStream stream(&f); + stream << result; + } + return result; +} + +#if !defined(AUTOTEST) +int main(int argc, char **argv) +{ + QCoreApplication app(argc, argv); + QString ruleFile; + Config cfg; + + const QStringList arguments = app.arguments().mid(1); + cfg.debug = arguments.contains("-debug"); + const bool testRules = arguments.contains("-test"); + cfg.cache = arguments.contains("-cache"); + + foreach (const QString &arg, arguments) + if (!arg.startsWith(QLatin1Char('-'))) { + ruleFile = arg; + break; + } + + if (ruleFile.isEmpty()) { + qWarning("usage: lexgen [-test rulefile"); + qWarning(" "); + qWarning(" the -test option will cause lexgen to interpret standard input"); + qWarning(" according to the specified rules and print out pairs of token and"); + qWarning(" lexical element"); + return 1; + } + + if (!loadConfig(ruleFile, &cfg)) + return 1; + + DFA machine = generateMachine(cfg); + if (machine.isEmpty()) + return 1; + + if (testRules) { + qWarning("Testing:"); + QString input = QTextStream(stdin).readAll(); + /* + qDebug() << "NFA has" << machine.stateCount() << "states"; + qDebug() << "Converting to DFA... (this may take a while)"; + DFA dfa = machine.toDFA(); + qDebug() << "DFA has" << dfa.count() << "states"; + qDebug() << "Minimizing..."; + dfa = dfa.minimize(); + qDebug() << "Minimized DFA has" << dfa.count() << "states"; + */ + DFA dfa = machine; + if (cfg.debug) + qDebug() << "tokenizing" << input; + bool ok = false; + QList<Symbol> symbols = tokenize(dfa, input, &cfg, &ok); + if (symbols.isEmpty()) { + qWarning("No tokens produced!"); + } else { + foreach (Symbol s, symbols) + qDebug() << s.token << ":" << s.lexem; + } + if (ok) + qDebug() << symbols.count() << "tokens produced."; + else + qDebug() << "Error while tokenizing!"; + } else { + Generator gen(machine, cfg); + QTextStream(stdout) + << gen.generate(); + } + + return 0; +} +#endif + diff --git a/util/lexgen/nfa.cpp b/util/lexgen/nfa.cpp new file mode 100644 index 0000000..701a5e0 --- /dev/null +++ b/util/lexgen/nfa.cpp @@ -0,0 +1,508 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** Contact: Qt Software Information (qt-info@nokia.com) +** +** This file is part of the utils of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the either Technology Preview License Agreement or the +** Beta Release License Agreement. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain +** additional rights. These rights are described in the Nokia Qt LGPL +** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this +** package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. +** +** If you are unsure which license is appropriate for your use, please +** contact the sales department at qt-sales@nokia.com. +** $QT_END_LICENSE$ +** +****************************************************************************/ +#include "nfa.h" +#include <QSet> +#include <limits.h> + +NFA NFA::createSingleInputNFA(InputType input) +{ + NFA result; + result.initialize(2); + result.addTransition(result.initialState, input, result.finalState); + return result; +} + +NFA NFA::createSymbolNFA(const QString &symbol) +{ + NFA result = NFA::createSingleInputNFA(Epsilon); + result.states[result.finalState].symbol = symbol; + return result; +} + +void NFA::initialize(int size) +{ + states.resize(size); + states.fill(State()); + initialState = 0; + finalState = size - 1; +} + +void NFA::addTransition(int from, InputType input, int to) +{ + assertValidState(from); + assertValidState(to); + + states[from].transitions.insertMulti(input, to); +} + +void NFA::copyFrom(const NFA &other, int baseState) +{ + assertValidState(baseState); + assertValidState(baseState + other.states.count() - 1); + + for (int i = 0; i < other.states.count(); ++i) { + State s = other.states.at(i); + + for (TransitionMap::Iterator it = s.transitions.begin(), + end = s.transitions.end(); it != end; ++it) + *it += baseState; + + states[baseState + i] = s; + } +} + +void NFA::initializeFromPair(const NFA &a, const NFA &b, + int *initialA, int *finalA, + int *initialB, int *finalB) +{ + initialize(a.states.count() + b.states.count() + 2); + + int baseIdxA = 1; + int baseIdxB = 1 + a.states.count(); + + *initialA = a.initialState + baseIdxA; + *finalA = a.finalState + baseIdxA; + + *initialB = b.initialState + baseIdxB; + *finalB = b.finalState + baseIdxB; + + copyFrom(a, baseIdxA); + copyFrom(b, baseIdxB); +} + +NFA NFA::createAlternatingNFA(const NFA &a, const NFA &b) +{ + NFA result; + + int newInitialA, newFinalA, + newInitialB, newFinalB; + + result.initializeFromPair(a, b, &newInitialA, &newFinalA, + &newInitialB, &newFinalB); + + result.addTransition(result.initialState, Epsilon, newInitialA); + result.addTransition(result.initialState, Epsilon, newInitialB); + + result.addTransition(newFinalA, Epsilon, result.finalState); + result.addTransition(newFinalB, Epsilon, result.finalState); + + return result; +} + +NFA NFA::createConcatenatingNFA(const NFA &a, const NFA &b) +{ + NFA result; + + int initialA, finalA, + initialB, finalB; + + result.initializeFromPair(a, b, &initialA, &finalA, &initialB, &finalB); + + result.addTransition(result.initialState, Epsilon, initialA); + result.addTransition(finalA, Epsilon, initialB); + result.addTransition(finalB, Epsilon, result.finalState); + return result; +} + +NFA NFA::createOptionalNFA(const NFA &a) +{ + NFA result; + + result.initialize(a.states.count() + 2); + + int baseIdxA = 1; + int initialA = a.initialState + baseIdxA; + int finalA = a.finalState + baseIdxA; + + result.copyFrom(a, baseIdxA); + + result.addTransition(result.initialState, Epsilon, initialA); + result.addTransition(result.initialState, Epsilon, result.finalState); + + result.addTransition(finalA, Epsilon, initialA); + result.addTransition(finalA, Epsilon, result.finalState); + + return result; +} + +NFA NFA::createStringNFA(const QByteArray &str) +{ + NFA result; + foreach (char c, str) { + NFA ch = NFA::createSingleInputNFA(c); + if (result.isEmpty()) + result = ch; + else + result = NFA::createConcatenatingNFA(result, ch); + } + return result; +} + +NFA NFA::createSetNFA(const QSet<InputType> &set) +{ + NFA result; + result.initialize(set.count() + 2); + + int state = 1; + for (QSet<InputType>::ConstIterator it = set.constBegin(), end = set.constEnd(); + it != end; ++it, ++state) { + result.addTransition(result.initialState, Epsilon, state); + result.addTransition(state, *it, result.finalState); + } + + /* + foreach (InputType input, set) { + NFA ch = NFA::createSingleInputNFA(input); + if (result.isEmpty()) + result = ch; + else + result = NFA::createAlternatingNFA(result, ch); + } + */ + return result; +} + +NFA NFA::createZeroOrOneNFA(const NFA &a) +{ + NFA epsilonNFA = createSingleInputNFA(Epsilon); + return NFA::createAlternatingNFA(a, epsilonNFA); +} + +NFA NFA::applyQuantity(const NFA &a, int minOccurrences, int maxOccurrences) +{ + NFA result = a; + NFA epsilonNFA = createSingleInputNFA(Epsilon); + + if (minOccurrences == 0) { + result = NFA::createAlternatingNFA(result, epsilonNFA); + } else { + minOccurrences--; + } + maxOccurrences--; + + for (int i = 0; i < minOccurrences; ++i) + result = NFA::createConcatenatingNFA(result, a); + + for (int i = minOccurrences; i < maxOccurrences; ++i) + result = NFA::createConcatenatingNFA(result, NFA::createAlternatingNFA(a, epsilonNFA)); + + return result; +} + +void NFA::debug() +{ + qDebug() << "NFA has" << states.count() << "states"; + qDebug() << "initial state is" << initialState; + qDebug() << "final state is" << finalState; + + for (int i = 0; i < states.count(); ++i) { + const State &s = states.at(i); + for (TransitionMap::ConstIterator it = s.transitions.constBegin(), + end = s.transitions.constEnd(); it != end; ++it) + qDebug() << "transition from state" << i << "to" << it.value() << "through" + << (it.key() == Epsilon ? QString("Epsilon") : QString(char(it.key()))); + if (!s.symbol.isEmpty()) + qDebug() << "State" << i << "leads to symbol" << s.symbol; + } +} + +// helper +typedef QSet<int> DFAState; + +// that's a bad hash, but it's good enough for us +// and it allows us to use the nice QHash API :) +inline uint qHash(const DFAState &state) +{ + uint val = 0; + foreach (int s, state) + val |= qHash(s); + return val; +} + +DFA NFA::toDFA() const +{ + DFA result; + result.reserve(states.count()); + + QHash<QString, int> symbolReferenceCounts; + { + QSet<int> symbolStates; + for (int i = 0; i < states.count(); ++i) + if (!states.at(i).symbol.isEmpty()) + symbolStates.insert(i); + + QHash<int, QString> epsilonStates; + for (int i = 0; i < states.count(); ++i) { + const State &s = states.at(i); + for (TransitionMap::ConstIterator transition = s.transitions.constBegin(), end = s.transitions.constEnd(); + transition != end; ++transition) + if (transition.key() == Epsilon && symbolStates.contains(transition.value())) + epsilonStates.insert(i, states.at(transition.value()).symbol); + } + + int lastCount; + do { + lastCount = epsilonStates.count(); + for (int i = 0; i < states.count(); ++i) { + const State &s = states.at(i); + for (TransitionMap::ConstIterator transition = s.transitions.constBegin(), end = s.transitions.constEnd(); + transition != end; ++transition) + if (transition.key() == Epsilon && epsilonStates.contains(transition.value())) + epsilonStates.insert(i, epsilonStates.value(transition.value())); + } + + } while (lastCount != epsilonStates.count()); + + for (int i = 0; i < states.count(); ++i) { + const State &s = states.at(i); + for (TransitionMap::ConstIterator transition = s.transitions.constBegin(), end = s.transitions.constEnd(); + transition != end; ++transition) { + if (transition.key() == Epsilon) + continue; + if (symbolStates.contains(transition.value())) { + const QString symbol = states.at(transition.value()).symbol; + symbolReferenceCounts[symbol]++; + } else if (epsilonStates.contains(transition.value())) { + const QString symbol = epsilonStates.value(transition.value()); + symbolReferenceCounts[symbol]++; + } + } + } + /* + for (QHash<QString, int>::ConstIterator symIt = symbolReferenceCounts.constBegin(), symEnd = symbolReferenceCounts.constEnd(); + symIt != symEnd; ++symIt) + qDebug() << "symbol" << symIt.key() << "is reached" << symIt.value() << "times"; + */ + } + + + QSet<InputType> validInput; + foreach (const State &s, states) + for (TransitionMap::ConstIterator it = s.transitions.constBegin(), + end = s.transitions.constEnd(); it != end; ++it) + if (it.key() != Epsilon) + validInput.insert(it.key()); + + // A DFA state can consist of multiple NFA states. + // the dfaStateMap maps from these to the actual + // state index within the resulting DFA vector + QHash<DFAState, int> dfaStateMap; + QStack<DFAState> pendingDFAStates; + + DFAState startState = epsilonClosure(QSet<int>() << initialState); + + result.resize(1); + dfaStateMap.insert(startState, 0); + + pendingDFAStates.push(startState); + + while (!pendingDFAStates.isEmpty()) { + DFAState state = pendingDFAStates.pop(); +// qDebug() << "processing" << state << "from the stack of pending states"; + + foreach (InputType input, validInput) { + + QSet<int> reachableStates; + + foreach (int nfaState, state) { + const TransitionMap &transitions = states.at(nfaState).transitions; + TransitionMap::ConstIterator it = transitions.find(input); + while (it != transitions.constEnd() && it.key() == input) { + reachableStates.insert(it.value()); + ++it; + } + } + + if (reachableStates.isEmpty()) + continue; + +// qDebug() << "can reach" << reachableStates << "from input" << char(input); + + QSet<int> closure = epsilonClosure(reachableStates); + +// qDebug() << "closure is" << closure; + + if (!dfaStateMap.contains(closure)) { + int dfaState = result.count(); + result.append(State()); + + QString symbol; + int refCount = INT_MAX; + foreach (int nfaState, closure) + if (!states.at(nfaState).symbol.isEmpty()) { +// qDebug() << "closure also contains symbol" << states.at(nfaState).symbol; + QString candidate = states.at(nfaState).symbol; + int candidateRefCount =symbolReferenceCounts.value(candidate, INT_MAX); + if (candidateRefCount < refCount) { + refCount = candidateRefCount; + symbol = candidate; + } + } + if (!symbol.isEmpty()) + result.last().symbol = symbol; + + dfaStateMap.insert(closure, dfaState); + + Q_ASSERT(!pendingDFAStates.contains(closure)); + pendingDFAStates.prepend(closure); + } + + result[dfaStateMap.value(state)].transitions.insert(input, dfaStateMap.value(closure)); + } + } + + return result; +} + +QSet<int> NFA::epsilonClosure(const QSet<int> &initialClosure) const +{ + QSet<int> closure = initialClosure; + closure.reserve(closure.count() * 4); + + QStack<int> stateStack; + stateStack.resize(closure.count()); + qCopy(closure.constBegin(), closure.constEnd(), stateStack.begin()); + + while (!stateStack.isEmpty()) { + int t = stateStack.pop(); + const TransitionMap &transitions = states.at(t).transitions; + TransitionMap::ConstIterator it = transitions.find(Epsilon); + while (it != transitions.constEnd() && it.key() == Epsilon) { + const int u = it.value(); + if (!closure.contains(u)) { + closure.insert(u); + stateStack.push(u); + } + ++it; + } + } + + return closure; +} + +void NFA::setTerminationSymbol(const QString &symbol) +{ + states[finalState].symbol = symbol; +} + +void DFA::debug() const +{ + qDebug() << "DFA has" << count() << "states"; + + for (int i = 0; i < count(); ++i) { + const State &s = at(i); + if (s.transitions.isEmpty()) { + qDebug() << "State" << i << "has no transitions"; + } else { + for (TransitionMap::ConstIterator it = s.transitions.constBegin(), + end = s.transitions.constEnd(); it != end; ++it) + qDebug() << "transition from state" << i << "to" << it.value() << "through" + << (it.key() == Epsilon ? QString("Epsilon") : QString(char(it.key()))); + } + if (!s.symbol.isEmpty()) + qDebug() << "State" << i << "leads to symbol" << s.symbol; + } + +} + +DFA DFA::minimize() const +{ + QVector<bool> inequivalentStates(count() * count()); + inequivalentStates.fill(false); + + for (int i = 0; i < count(); ++i) + for (int j = 0; j < i; ++j) { + if (i != j && at(i).symbol != at(j).symbol) + inequivalentStates[i * count() + j] = true; + } + + bool done; + do { + done = true; + for (int i = 0; i < count(); ++i) + for (int j = 0; j < count(); ++j) { + if (i == j) + continue; + + if (inequivalentStates[i * count() + j]) + continue; + + if (at(i).transitions.keys() != at(j).transitions.keys()) { + inequivalentStates[i * count() + j] = true; + done = false; + continue; + } + + foreach (InputType a, at(i).transitions.keys()) { + int r = at(i).transitions.value(a, -1); + if (r == -1) + continue; + int s = at(j).transitions.value(a, -1); + if (s == -1) + continue; + + if (inequivalentStates[r * count() + s] + || r == s) { + inequivalentStates[i * count() + j] = true; + done = false; + break; + } + } + } + } while (!done); + + QHash<int, int> statesToEliminate; + for (int i = 0; i < count(); ++i) + for (int j = 0; j < i; ++j) + if (!inequivalentStates[i * count() + j]) { + statesToEliminate.insertMulti(i, j); + } + + /* + qDebug() << "states to eliminiate:" << statesToEliminate.count();; + qDebug() << "merging" << statesToEliminate; + debug(); + */ + + return *this; +} + + diff --git a/util/lexgen/nfa.h b/util/lexgen/nfa.h new file mode 100644 index 0000000..1f25071 --- /dev/null +++ b/util/lexgen/nfa.h @@ -0,0 +1,127 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** Contact: Qt Software Information (qt-info@nokia.com) +** +** This file is part of the utils of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the either Technology Preview License Agreement or the +** Beta Release License Agreement. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain +** additional rights. These rights are described in the Nokia Qt LGPL +** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this +** package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. +** +** If you are unsure which license is appropriate for your use, please +** contact the sales department at qt-sales@nokia.com. +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#ifndef NFA_H +#define NFA_H + +#include <QMap> +#include <QHash> +#include <QString> +#include <QVector> +#include <QDebug> +#include <QStack> +#include <QByteArray> + +#include "global.h" + +typedef QHash<InputType, int> TransitionMap; + +struct State +{ + QString symbol; + TransitionMap transitions; +}; + +inline QDataStream &operator<<(QDataStream &stream, const State &state) +{ + return stream << state.symbol << state.transitions; +} + +inline QDataStream &operator>>(QDataStream &stream, State &state) +{ + return stream >> state.symbol >> state.transitions; +} + +struct DFA : public QVector<State> +{ + void debug() const; + DFA minimize() const; +}; + +class NFA +{ +public: + static NFA createSingleInputNFA(InputType input); + static NFA createSymbolNFA(const QString &symbol); + static NFA createAlternatingNFA(const NFA &a, const NFA &b); + static NFA createConcatenatingNFA(const NFA &a, const NFA &b); + static NFA createOptionalNFA(const NFA &a); + + // convenience + static NFA createStringNFA(const QByteArray &str); + static NFA createSetNFA(const QSet<InputType> &set); + static NFA createZeroOrOneNFA(const NFA &a); + static NFA applyQuantity(const NFA &a, int minOccurrences, int maxOccurrences); + + void setTerminationSymbol(const QString &symbol); + + DFA toDFA() const; + + inline bool isEmpty() const { return states.isEmpty(); } + inline int stateCount() const { return states.count(); } + + void debug(); + +private: + void initialize(int size); + void addTransition(int from, InputType input, int to); + void copyFrom(const NFA &other, int baseState); + + void initializeFromPair(const NFA &a, const NFA &b, + int *initialA, int *finalA, + int *initialB, int *finalB); + + QSet<int> epsilonClosure(const QSet<int> &initialClosure) const; + + inline void assertValidState(int state) + { Q_UNUSED(state); Q_ASSERT(state >= 0); Q_ASSERT(state < states.count()); } + +#if defined(AUTOTEST) +public: +#endif + int initialState; + int finalState; + + QVector<State> states; +}; + +#endif // NFA_H + diff --git a/util/lexgen/re2nfa.cpp b/util/lexgen/re2nfa.cpp new file mode 100644 index 0000000..ed58d50 --- /dev/null +++ b/util/lexgen/re2nfa.cpp @@ -0,0 +1,547 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** Contact: Qt Software Information (qt-info@nokia.com) +** +** This file is part of the utils of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the either Technology Preview License Agreement or the +** Beta Release License Agreement. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain +** additional rights. These rights are described in the Nokia Qt LGPL +** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this +** package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. +** +** If you are unsure which license is appropriate for your use, please +** contact the sales department at qt-sales@nokia.com. +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include "re2nfa.h" +#include "tokenizer.cpp" + +RE2NFA::RE2NFA(const QMap<QString, NFA> ¯os, const QSet<InputType> &maxInputSet, Qt::CaseSensitivity cs) + : macros(macros), index(0), errorColumn(-1), maxInputSet(maxInputSet), caseSensitivity(cs) +{ +} + +NFA RE2NFA::parse(const QString &expression, int *errCol) +{ + tokenize(expression); + + if (symbols.isEmpty()) + return NFA(); + + index = 0; + + NFA result = parseExpr(); + if (result.isEmpty()) { + if (errCol) + *errCol = errorColumn; + } + return result; +} + +NFA RE2NFA::parseExpr() +{ + NFA value = parseBranch(); + while (test(TOK_OR)) { + NFA rhs = parseBranch(); + value = NFA::createAlternatingNFA(value, rhs); + } + return value; +} + +NFA RE2NFA::parseBranch() +{ + NFA value = parsePiece(); + if (!hasNext()) + return value; + NFA next; + do { + next = parsePiece(); + if (!next.isEmpty()) + value = NFA::createConcatenatingNFA(value, next); + } while (!next.isEmpty() && hasNext()); + return value; +} + +NFA RE2NFA::parsePiece() +{ + NFA atom = parseAtom(); + if (atom.isEmpty() || !hasNext()) + return atom; + return parseMaybeQuantifier(atom); +} + +NFA RE2NFA::parseAtom() +{ + // #### + switch (next()) { + case TOK_STRING: + return createCharNFA(); + case TOK_LPAREN: { + NFA subExpr = parseExpr(); + next(TOK_RPAREN); + return subExpr; + } + case TOK_LBRACE: { + QString macroName = lexemUntil(TOK_RBRACE); + QMap<QString, NFA>::ConstIterator macro = macros.find(macroName); + if (macro == macros.end()) { + qWarning("Unknown macro '%s' - probably used before defined", qPrintable(macroName)); + return NFA(); + } + return *macro; + } + case TOK_LBRACKET: { + NFA set = parseSet(); + next(TOK_RBRACKET); + return set; + } + case TOK_SEQUENCE: + return parseSet2(); + case TOK_DOT: + return NFA::createSetNFA(maxInputSet); + default: + prev(); + return NFA(); + } +} + +NFA RE2NFA::parseMaybeQuantifier(const NFA &nfa) +{ + // #### + switch (next()) { + case TOK_STAR: + return NFA::createOptionalNFA(nfa); + case TOK_QUESTION: + return NFA::createZeroOrOneNFA(nfa); + case TOK_PLUS: + return NFA::createConcatenatingNFA(nfa, NFA::createOptionalNFA(nfa)); + case TOK_LBRACE: { + const int rewind = index - 1; + + QString lexemBeforeComma; + QString lexemAfterComma; + bool seenComma = false; + forever { + if (test(TOK_COMMA)) { + if (seenComma) { + errorColumn = symbol().column; + return NFA(); + } + seenComma = true; + } else if (test(TOK_RBRACE)) { + break; + } else { + next(TOK_STRING); + if (seenComma) + lexemAfterComma += symbol().lexem; + else + lexemBeforeComma += symbol().lexem; + } + } + bool isNumber = false; + int min = lexemBeforeComma.toInt(&isNumber); + if (!isNumber) { + index = rewind; + return nfa; + } + int max = min; + if (seenComma) { + max = lexemAfterComma.toInt(&isNumber); + if (!isNumber) { + errorColumn = symbol().column; + return NFA(); + } + } + return NFA::applyQuantity(nfa, min, max); + } + default: + prev(); + return nfa; + } +} + +NFA RE2NFA::parseSet() +{ + QSet<InputType> set; + bool negate = false; + + next(TOK_STRING); + + do { + Q_ASSERT(symbol().lexem.length() == 1); + // ### + QChar ch = symbol().lexem.at(0); + if (set.isEmpty() && ch == QLatin1Char('^')) { + negate = true; + continue; + } + + // look ahead for ranges like a-z + bool rangeFound = false; + if (test(TOK_STRING)) { + if (symbol().lexem.length() == 1 + && symbol().lexem.at(0) == QLatin1Char('-')) { + next(TOK_STRING); + Q_ASSERT(symbol().lexem.length() == 1); + QChar last = symbol().lexem.at(0); + + if (ch.unicode() > last.unicode()) + qSwap(ch, last); + + for (ushort i = ch.unicode(); i <= last.unicode(); ++i) { + if (caseSensitivity == Qt::CaseInsensitive) { + set.insert(QChar(i).toLower().unicode()); + } else { + set.insert(i); + } + } + + rangeFound = true; + } else { + prev(); + } + } + + if (!rangeFound) { + if (caseSensitivity == Qt::CaseInsensitive) { + set.insert(ch.toLower().unicode()); + } else { + set.insert(ch.unicode()); + } + } + } while (test(TOK_STRING)); + + if (negate) { + QSet<InputType> negatedSet = maxInputSet; + negatedSet.subtract(set); + set = negatedSet; + } + + return NFA::createSetNFA(set); +} + +NFA RE2NFA::parseSet2() +{ + QSet<InputType> set; + bool negate = false; + + QString str = symbol().lexem; + // strip off brackets + str.chop(1); + str.remove(0, 1); + + int i = 0; + while (i < str.length()) { + // ### + QChar ch = str.at(i++); + if (set.isEmpty() && ch == QLatin1Char('^')) { + negate = true; + continue; + } + + // look ahead for ranges like a-z + bool rangeFound = false; + if (i < str.length() - 1 && str.at(i) == QLatin1Char('-')) { + ++i; + QChar last = str.at(i++); + + if (ch.unicode() > last.unicode()) + qSwap(ch, last); + + for (ushort i = ch.unicode(); i <= last.unicode(); ++i) { + if (caseSensitivity == Qt::CaseInsensitive) { + set.insert(QChar(i).toLower().unicode()); + } else { + set.insert(i); + } + } + + rangeFound = true; + } + + if (!rangeFound) { + if (caseSensitivity == Qt::CaseInsensitive) { + set.insert(ch.toLower().unicode()); + } else { + set.insert(ch.unicode()); + } + } + } + + if (negate) { + QSet<InputType> negatedSet = maxInputSet; + negatedSet.subtract(set); + set = negatedSet; + } + + return NFA::createSetNFA(set); +} +NFA RE2NFA::createCharNFA() +{ + NFA nfa; + // #### + if (caseSensitivity == Qt::CaseInsensitive) { + nfa = NFA::createStringNFA(symbol().lexem.toLower().toLatin1()); + } else { + nfa = NFA::createStringNFA(symbol().lexem.toLatin1()); + } + return nfa; +} + +static inline int skipQuote(const QString &str, int pos) +{ + while (pos < str.length() + && str.at(pos) != QLatin1Char('"')) { + if (str.at(pos) == QLatin1Char('\\')) { + ++pos; + if (pos >= str.length()) + break; + } + ++pos; + } + if (pos < str.length()) + ++pos; + return pos; +} + +#if 0 +static const char*tokStr(Token t) +{ + switch (t) { + case TOK_INVALID: return "TOK_INVALID"; + case TOK_STRING: return "TOK_STRING"; + case TOK_LBRACE: return "TOK_LBRACE"; + case TOK_RBRACE: return "TOK_RBRACE"; + case TOK_LBRACKET: return "TOK_LBRACKET"; + case TOK_RBRACKET: return "TOK_RBRACKET"; + case TOK_LPAREN: return "TOK_LPAREN"; + case TOK_RPAREN: return "TOK_RPAREN"; + case TOK_COMMA: return "TOK_COMMA"; + case TOK_STAR: return "TOK_STAR"; + case TOK_OR: return "TOK_OR"; + case TOK_QUESTION: return "TOK_QUESTION"; + case TOK_DOT: return "TOK_DOT"; + case TOK_PLUS: return "TOK_PLUS"; + case TOK_SEQUENCE: return "TOK_SEQUENCE"; + case TOK_QUOTED_STRING: return "TOK_QUOTED_STRING"; + } + return ""; +} +#endif + +void RE2NFA::tokenize(const QString &input) +{ + symbols.clear(); +#if 1 + RegExpTokenizer tokenizer(input); + Symbol sym; + int tok = tokenizer.lex(); + while (tok != -1) { + Symbol sym; + sym.token = static_cast<Token>(tok); + sym.lexem = input.mid(tokenizer.lexemStart, tokenizer.lexemLength); + + if (sym.token == TOK_QUOTED_STRING) { + sym.lexem.chop(1); + sym.lexem.remove(0, 1); + sym.token = TOK_STRING; + } + + if (sym.token == TOK_STRING || sym.token == TOK_SEQUENCE) { + for (int i = 0; i < sym.lexem.length(); ++i) { + if (sym.lexem.at(i) == '\\') { + if (i >= sym.lexem.length() - 1) + break; + QChar ch = sym.lexem.at(i + 1); + if (ch == QLatin1Char('n')) { + ch = '\n'; + } else if (ch == QLatin1Char('r')) { + ch = '\r'; + } else if (ch == QLatin1Char('t')) { + ch = '\t'; + } else if (ch == QLatin1Char('f')) { + ch = '\f'; + } + sym.lexem.replace(i, 2, ch); + } + } + } + + /* + if (sym.token == TOK_SEQUENCE) { + Symbol s; + s.token = TOK_LBRACKET; + s.lexem = "["; + symbols.append(s); + + for (int i = 1; i < sym.lexem.length() - 1; ++i) { + s.token = TOK_STRING; + s.lexem = sym.lexem.at(i); + symbols.append(s); + } + + s.token = TOK_RBRACKET; + s.lexem = "]"; + symbols.append(s); + + tok = tokenizer.lex(); + continue; + } + */ + + symbols.append(sym); + tok = tokenizer.lex(); + } +#else + int pos = 0; + bool insideSet = false; + while (pos < input.length()) { + QChar ch = input.at(pos); + + Symbol sym; + sym.column = pos; + sym.token = TOK_INVALID; + sym.lexem = QString(ch); + switch (ch.toLatin1()) { + case '"': { + if (insideSet) { + sym.token = TOK_STRING; + sym.lexem = QString(ch); + symbols += sym; + ++pos; + continue; + } + if (pos + 1 >= input.length()) + return; + int quoteEnd = skipQuote(input, pos + 1); + sym.token = TOK_STRING; + sym.lexem = input.mid(pos + 1, quoteEnd - pos - 2); + symbols += sym; + pos = quoteEnd; + continue; + } + case '{': + sym.token = (insideSet ? TOK_STRING : TOK_LBRACE); + break; + case '}': + sym.token = (insideSet ? TOK_STRING : TOK_RBRACE); + break; + case '[': + insideSet = true; + sym.token = TOK_LBRACKET; + break; + case ']': + insideSet = false; + sym.token = TOK_RBRACKET; + break; + case '(': + sym.token = (insideSet ? TOK_STRING : TOK_LPAREN); + break; + case ')': + sym.token = (insideSet ? TOK_STRING : TOK_RPAREN); + break; + case ',': + sym.token = (insideSet ? TOK_STRING : TOK_COMMA); + break; + case '*': + sym.token = (insideSet ? TOK_STRING : TOK_STAR); + break; + case '|': + sym.token = (insideSet ? TOK_STRING : TOK_OR); + break; + case '?': + sym.token = (insideSet ? TOK_STRING : TOK_QUESTION); + break; + case '.': + sym.token = (insideSet ? TOK_STRING : TOK_DOT); + break; + case '+': + sym.token = (insideSet ? TOK_STRING : TOK_PLUS); + break; + case '\\': + ++pos; + if (pos >= input.length()) + return; + ch = input.at(pos); + if (ch == QLatin1Char('n')) { + ch = '\n'; + } else if (ch == QLatin1Char('r')) { + ch = '\r'; + } else if (ch == QLatin1Char('t')) { + ch = '\t'; + } else if (ch == QLatin1Char('f')) { + ch = '\f'; + } + // fall through + default: + sym.token = TOK_STRING; + sym.lexem = QString(ch); + symbols += sym; + ++pos; + continue; + } + symbols += sym; + ++pos; + } +#endif +#if 0 + foreach (Symbol s, symbols) { + qDebug() << "Tok" << tokStr(s.token) << "lexem" << s.lexem; + } +#endif +} + +bool RE2NFA::next(Token t) +{ + if (hasNext() && next() == t) + return true; + errorColumn = symbol().column; + Q_ASSERT(false); + return false; +} + +bool RE2NFA::test(Token t) +{ + if (index >= symbols.count()) + return false; + if (symbols.at(index).token == t) { + ++index; + return true; + } + return false; +} + +QString RE2NFA::lexemUntil(Token t) +{ + QString lexem; + while (hasNext() && next() != t) + lexem += symbol().lexem; + return lexem; +} + diff --git a/util/lexgen/re2nfa.h b/util/lexgen/re2nfa.h new file mode 100644 index 0000000..fc10bea --- /dev/null +++ b/util/lexgen/re2nfa.h @@ -0,0 +1,116 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** Contact: Qt Software Information (qt-info@nokia.com) +** +** This file is part of the utils of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the either Technology Preview License Agreement or the +** Beta Release License Agreement. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain +** additional rights. These rights are described in the Nokia Qt LGPL +** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this +** package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. +** +** If you are unsure which license is appropriate for your use, please +** contact the sales department at qt-sales@nokia.com. +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#ifndef RE2NFA_H +#define RE2NFA_H + +#include "nfa.h" +#include <QSet> + +class RE2NFA +{ +public: + RE2NFA(const QMap<QString, NFA> ¯os, const QSet<InputType> &maxInputSet, Qt::CaseSensitivity cs); + + NFA parse(const QString &expression, int *errorColumn = 0); + +private: + NFA parseExpr(); + NFA parseBranch(); + NFA parsePiece(); + NFA parseAtom(); + NFA parseMaybeQuantifier(const NFA &nfa); + NFA parseSet(); + NFA parseSet2(); + + NFA createCharNFA(); + +private: + friend class RegExpTokenizer; + + enum Token { + TOK_INVALID, + TOK_STRING, + TOK_LBRACE, // { + TOK_RBRACE, // } + TOK_LBRACKET, // [ + TOK_RBRACKET, // ] + TOK_LPAREN, // ( + TOK_RPAREN, // ) + TOK_COMMA, + TOK_STAR, + TOK_OR, + TOK_QUESTION, + TOK_DOT, + TOK_PLUS, + TOK_SEQUENCE, + TOK_QUOTED_STRING + }; + + struct Symbol + { + inline Symbol() : token(TOK_INVALID), column(-1) {} + inline Symbol(Token t, const QString &l = QString()) : token(t), lexem(l), column(-1) {} + Token token; + QString lexem; + int column; + }; + + inline bool hasNext() const { return index < symbols.count(); } + inline Token next() { return symbols.at(index++).token; } + bool next(Token t); + bool test(Token t); + inline void prev() { index--; } + inline const Symbol &symbol() const { return symbols.at(index - 1); } + QString lexemUntil(Token t); + + void tokenize(const QString &input); + + QMap<QString, NFA> macros; + QVector<Symbol> symbols; + int index; + int errorColumn; + const QSet<InputType> maxInputSet; + Qt::CaseSensitivity caseSensitivity; +}; + +#endif // RE2NFA_H + diff --git a/util/lexgen/test.lexgen b/util/lexgen/test.lexgen new file mode 100644 index 0000000..fd532fd --- /dev/null +++ b/util/lexgen/test.lexgen @@ -0,0 +1,9 @@ +[Options] +case-insensitive +classname = TestScanner + +[Tokens] +TOK_C = [abcd] +TOK_B = [bc] +TOK_A = a + diff --git a/util/lexgen/tests/testdata/backtrack1/input b/util/lexgen/tests/testdata/backtrack1/input new file mode 100644 index 0000000..f5099b5 --- /dev/null +++ b/util/lexgen/tests/testdata/backtrack1/input @@ -0,0 +1 @@ +LETX diff --git a/util/lexgen/tests/testdata/backtrack1/output b/util/lexgen/tests/testdata/backtrack1/output new file mode 100644 index 0000000..6893deb --- /dev/null +++ b/util/lexgen/tests/testdata/backtrack1/output @@ -0,0 +1 @@ +TOK_LET|LET diff --git a/util/lexgen/tests/testdata/backtrack1/rules.lexgen b/util/lexgen/tests/testdata/backtrack1/rules.lexgen new file mode 100644 index 0000000..ade8a15 --- /dev/null +++ b/util/lexgen/tests/testdata/backtrack1/rules.lexgen @@ -0,0 +1,3 @@ +[Tokens] +TOK_LET = LET +TOK_LETXX = LETXX diff --git a/util/lexgen/tests/testdata/backtrack2/input b/util/lexgen/tests/testdata/backtrack2/input new file mode 100644 index 0000000..59ff5b7 --- /dev/null +++ b/util/lexgen/tests/testdata/backtrack2/input @@ -0,0 +1 @@ +LETXTRA diff --git a/util/lexgen/tests/testdata/backtrack2/output b/util/lexgen/tests/testdata/backtrack2/output new file mode 100644 index 0000000..348b382 --- /dev/null +++ b/util/lexgen/tests/testdata/backtrack2/output @@ -0,0 +1,2 @@ +TOK_LET|LET +TOK_XTRA|XTRA diff --git a/util/lexgen/tests/testdata/backtrack2/rules.lexgen b/util/lexgen/tests/testdata/backtrack2/rules.lexgen new file mode 100644 index 0000000..6f16986 --- /dev/null +++ b/util/lexgen/tests/testdata/backtrack2/rules.lexgen @@ -0,0 +1,4 @@ +[Tokens] +TOK_LET = LET +TOK_LETXX = LETXX +TOK_XTRA = XTRA diff --git a/util/lexgen/tests/testdata/casesensitivity/input b/util/lexgen/tests/testdata/casesensitivity/input new file mode 100644 index 0000000..72b7f48 --- /dev/null +++ b/util/lexgen/tests/testdata/casesensitivity/input @@ -0,0 +1 @@ +abcdAbcDABCDeFgEFGefgEfghiHIHihI diff --git a/util/lexgen/tests/testdata/casesensitivity/output b/util/lexgen/tests/testdata/casesensitivity/output new file mode 100644 index 0000000..3a4e819 --- /dev/null +++ b/util/lexgen/tests/testdata/casesensitivity/output @@ -0,0 +1,14 @@ +TOK_AB|ab +TOK_CD|cd +TOK_AB|Ab +TOK_CD|cD +TOK_AB|AB +TOK_CD|CD +TOK_EFG|eFg +TOK_EFG|EFG +TOK_EFG|efg +TOK_EFG|Efg +TOK_HI|hi +TOK_HI|HI +TOK_HI|Hi +TOK_HI|hI diff --git a/util/lexgen/tests/testdata/casesensitivity/rules.lexgen b/util/lexgen/tests/testdata/casesensitivity/rules.lexgen new file mode 100644 index 0000000..3347587 --- /dev/null +++ b/util/lexgen/tests/testdata/casesensitivity/rules.lexgen @@ -0,0 +1,7 @@ +[Options] +case-insensitive +[Tokens] +TOK_AB = ab +TOK_CD = cd +TOK_EFG = [e-g]{3} +TOK_HI = [hi]{2} diff --git a/util/lexgen/tests/testdata/comments/input b/util/lexgen/tests/testdata/comments/input new file mode 100644 index 0000000..03873e0 --- /dev/null +++ b/util/lexgen/tests/testdata/comments/input @@ -0,0 +1 @@ +/* comment with stuff *//*another comment with * stars * inside*/ diff --git a/util/lexgen/tests/testdata/comments/output b/util/lexgen/tests/testdata/comments/output new file mode 100644 index 0000000..2395ad1 --- /dev/null +++ b/util/lexgen/tests/testdata/comments/output @@ -0,0 +1,2 @@ +TOK_COMMENT|/* comment with stuff */ +TOK_COMMENT|/*another comment with * stars * inside*/ diff --git a/util/lexgen/tests/testdata/comments/rules.lexgen b/util/lexgen/tests/testdata/comments/rules.lexgen new file mode 100644 index 0000000..490c759 --- /dev/null +++ b/util/lexgen/tests/testdata/comments/rules.lexgen @@ -0,0 +1,2 @@ +[Tokens] +TOK_COMMENT = \/\*[^*]*\*+([^/*][^*]*\*+)*\/ diff --git a/util/lexgen/tests/testdata/dot/input b/util/lexgen/tests/testdata/dot/input new file mode 100644 index 0000000..e5b0ad6 --- /dev/null +++ b/util/lexgen/tests/testdata/dot/input @@ -0,0 +1 @@ +afbcxd diff --git a/util/lexgen/tests/testdata/dot/output b/util/lexgen/tests/testdata/dot/output new file mode 100644 index 0000000..6a9afd4 --- /dev/null +++ b/util/lexgen/tests/testdata/dot/output @@ -0,0 +1,2 @@ +TOK_AB|afb +TOK_CD|cxd diff --git a/util/lexgen/tests/testdata/dot/rules.lexgen b/util/lexgen/tests/testdata/dot/rules.lexgen new file mode 100644 index 0000000..03873a7 --- /dev/null +++ b/util/lexgen/tests/testdata/dot/rules.lexgen @@ -0,0 +1,3 @@ +[Tokens] +TOK_AB = a.b +TOK_CD = c.d diff --git a/util/lexgen/tests/testdata/negation/input b/util/lexgen/tests/testdata/negation/input new file mode 100644 index 0000000..9447b80 --- /dev/null +++ b/util/lexgen/tests/testdata/negation/input @@ -0,0 +1 @@ +aycabd diff --git a/util/lexgen/tests/testdata/negation/output b/util/lexgen/tests/testdata/negation/output new file mode 100644 index 0000000..0b73263 --- /dev/null +++ b/util/lexgen/tests/testdata/negation/output @@ -0,0 +1,2 @@ +TOK_A|ayc +TOK_B|abd diff --git a/util/lexgen/tests/testdata/negation/rules.lexgen b/util/lexgen/tests/testdata/negation/rules.lexgen new file mode 100644 index 0000000..179810b --- /dev/null +++ b/util/lexgen/tests/testdata/negation/rules.lexgen @@ -0,0 +1,3 @@ +[Tokens] +TOK_A = a[^b]c +TOK_B = abd diff --git a/util/lexgen/tests/testdata/quoteinset/input b/util/lexgen/tests/testdata/quoteinset/input new file mode 100644 index 0000000..5a9b680 --- /dev/null +++ b/util/lexgen/tests/testdata/quoteinset/input @@ -0,0 +1 @@ +"a diff --git a/util/lexgen/tests/testdata/quoteinset/output b/util/lexgen/tests/testdata/quoteinset/output new file mode 100644 index 0000000..7ba8890 --- /dev/null +++ b/util/lexgen/tests/testdata/quoteinset/output @@ -0,0 +1 @@ +TOK_QUOTEA|"a diff --git a/util/lexgen/tests/testdata/quoteinset/rules.lexgen b/util/lexgen/tests/testdata/quoteinset/rules.lexgen new file mode 100644 index 0000000..9838276 --- /dev/null +++ b/util/lexgen/tests/testdata/quoteinset/rules.lexgen @@ -0,0 +1,2 @@ +[Tokens] +TOK_QUOTEA = ["]a diff --git a/util/lexgen/tests/testdata/quotes/input b/util/lexgen/tests/testdata/quotes/input new file mode 100644 index 0000000..ac54450 --- /dev/null +++ b/util/lexgen/tests/testdata/quotes/input @@ -0,0 +1 @@ +quotedstring diff --git a/util/lexgen/tests/testdata/quotes/output b/util/lexgen/tests/testdata/quotes/output new file mode 100644 index 0000000..c538e32 --- /dev/null +++ b/util/lexgen/tests/testdata/quotes/output @@ -0,0 +1 @@ +TOK_STR|quotedstring diff --git a/util/lexgen/tests/testdata/quotes/rules.lexgen b/util/lexgen/tests/testdata/quotes/rules.lexgen new file mode 100644 index 0000000..d528cdd --- /dev/null +++ b/util/lexgen/tests/testdata/quotes/rules.lexgen @@ -0,0 +1,2 @@ +[Tokens] +TOK_STR = "quotedstring" diff --git a/util/lexgen/tests/testdata/simple/input b/util/lexgen/tests/testdata/simple/input new file mode 100644 index 0000000..acbe86c --- /dev/null +++ b/util/lexgen/tests/testdata/simple/input @@ -0,0 +1 @@ +abcd diff --git a/util/lexgen/tests/testdata/simple/output b/util/lexgen/tests/testdata/simple/output new file mode 100644 index 0000000..a37a58e --- /dev/null +++ b/util/lexgen/tests/testdata/simple/output @@ -0,0 +1,2 @@ +TOK_AB|ab +TOK_CD|cd diff --git a/util/lexgen/tests/testdata/simple/rules.lexgen b/util/lexgen/tests/testdata/simple/rules.lexgen new file mode 100644 index 0000000..5d958c4 --- /dev/null +++ b/util/lexgen/tests/testdata/simple/rules.lexgen @@ -0,0 +1,3 @@ +[Tokens] +TOK_AB = ab +TOK_CD = cd diff --git a/util/lexgen/tests/testdata/subsets1/input b/util/lexgen/tests/testdata/subsets1/input new file mode 100644 index 0000000..47f66d1 --- /dev/null +++ b/util/lexgen/tests/testdata/subsets1/input @@ -0,0 +1 @@ +abaf diff --git a/util/lexgen/tests/testdata/subsets1/output b/util/lexgen/tests/testdata/subsets1/output new file mode 100644 index 0000000..75dd936 --- /dev/null +++ b/util/lexgen/tests/testdata/subsets1/output @@ -0,0 +1,2 @@ +TOK_AB|ab +TOK_AZ|af diff --git a/util/lexgen/tests/testdata/subsets1/rules.lexgen b/util/lexgen/tests/testdata/subsets1/rules.lexgen new file mode 100644 index 0000000..94f51a9 --- /dev/null +++ b/util/lexgen/tests/testdata/subsets1/rules.lexgen @@ -0,0 +1,3 @@ +[Tokens] +TOK_AB = ab +TOK_AZ = a[a-z] diff --git a/util/lexgen/tests/testdata/subsets2/input b/util/lexgen/tests/testdata/subsets2/input new file mode 100644 index 0000000..4d0dc3a --- /dev/null +++ b/util/lexgen/tests/testdata/subsets2/input @@ -0,0 +1 @@ +abd diff --git a/util/lexgen/tests/testdata/subsets2/output b/util/lexgen/tests/testdata/subsets2/output new file mode 100644 index 0000000..d5a7bc5 --- /dev/null +++ b/util/lexgen/tests/testdata/subsets2/output @@ -0,0 +1,3 @@ +TOK_A|a +TOK_B|b +TOK_D|d diff --git a/util/lexgen/tests/testdata/subsets2/rules.lexgen b/util/lexgen/tests/testdata/subsets2/rules.lexgen new file mode 100644 index 0000000..e0a7629 --- /dev/null +++ b/util/lexgen/tests/testdata/subsets2/rules.lexgen @@ -0,0 +1,4 @@ +[Tokens] +TOK_D = [abcd] +TOK_B = [bc] +TOK_A = a diff --git a/util/lexgen/tests/tests.pro b/util/lexgen/tests/tests.pro new file mode 100644 index 0000000..eb04439 --- /dev/null +++ b/util/lexgen/tests/tests.pro @@ -0,0 +1,6 @@ +CONFIG += qtestlib +SOURCES += tst_lexgen.cpp +TARGET = tst_lexgen +include(../lexgen.pri) +QT = core +DEFINES += SRCDIR=\\\"$$PWD\\\" diff --git a/util/lexgen/tests/tst_lexgen.cpp b/util/lexgen/tests/tst_lexgen.cpp new file mode 100644 index 0000000..6e50b15 --- /dev/null +++ b/util/lexgen/tests/tst_lexgen.cpp @@ -0,0 +1,285 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** Contact: Qt Software Information (qt-info@nokia.com) +** +** This file is part of the utils of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the either Technology Preview License Agreement or the +** Beta Release License Agreement. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain +** additional rights. These rights are described in the Nokia Qt LGPL +** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this +** package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. +** +** If you are unsure which license is appropriate for your use, please +** contact the sales department at qt-sales@nokia.com. +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include <QtTest/QtTest> +#define AUTOTEST +#include "../main.cpp" + +class tst_LexGen : public QObject +{ + Q_OBJECT +private slots: + void nfa_singleInput(); + void nfa_alternating(); + void nfa_concatenating(); + void nfa_optional(); + void nfa_toDFA_data(); + void nfa_toDFA(); + void lexgen_data(); + void lexgen(); +}; + +void tst_LexGen::nfa_singleInput() +{ + NFA nfa = NFA::createSingleInputNFA('a'); + + QCOMPARE(nfa.initialState, 0); + QCOMPARE(nfa.finalState, 1); + + QCOMPARE(nfa.states.count(), 2); + + QCOMPARE(nfa.states.at(0).transitions.count(), 1); + QVERIFY(nfa.states.at(0).transitions.contains('a')); + QCOMPARE(nfa.states.at(0).transitions.values('a').count(), 1); + QCOMPARE(nfa.states.at(0).transitions.value('a'), nfa.finalState); + + QVERIFY(nfa.states.at(1).transitions.isEmpty()); +} + +void tst_LexGen::nfa_alternating() +{ + NFA a = NFA::createSingleInputNFA('a'); + NFA b = NFA::createSingleInputNFA('b'); + NFA nfa = NFA::createAlternatingNFA(a, b); + + const int initialA = 1; + const int finalA = 2; + + const int initialB = 3; + const int finalB = 4; + + QCOMPARE(nfa.states.count(), 6); + + QCOMPARE(nfa.initialState, 0); + QCOMPARE(nfa.finalState, 5); + + QList<int> initialTransitions = nfa.states.at(0).transitions.values(Epsilon); + QCOMPARE(initialTransitions.count(), 2); + QVERIFY(initialTransitions.contains(initialA)); + QVERIFY(initialTransitions.contains(initialB)); + + // no need to test the individual a and b NFAs, the other + // autotest already takes care of that. Just check whether + // the epsilon transitions to the final state exist. + + QCOMPARE(nfa.states.at(finalA).transitions.count(), 1); + QCOMPARE(nfa.states.at(finalA).transitions.values(Epsilon).count(), 1); + QCOMPARE(nfa.states.at(finalA).transitions.value(Epsilon), nfa.finalState); + + QCOMPARE(nfa.states.at(finalB).transitions.count(), 1); + QCOMPARE(nfa.states.at(finalB).transitions.values(Epsilon).count(), 1); + QCOMPARE(nfa.states.at(finalB).transitions.value(Epsilon), nfa.finalState); +} + +void tst_LexGen::nfa_concatenating() +{ + NFA a = NFA::createSingleInputNFA('a'); + NFA b = NFA::createSingleInputNFA('b'); + NFA nfa = NFA::createConcatenatingNFA(a, b); + + const int initialA = 1; + const int finalA = 2; + + const int initialB = 3; + const int finalB = 4; + + QCOMPARE(nfa.states.count(), 6); + + QCOMPARE(nfa.initialState, 0); + QCOMPARE(nfa.finalState, 5); + + QCOMPARE(nfa.states.at(0).transitions.count(), 1); + QCOMPARE(nfa.states.at(0).transitions.values(Epsilon).count(), 1); + QCOMPARE(nfa.states.at(0).transitions.value(Epsilon), initialA); + + QCOMPARE(nfa.states.at(finalA).transitions.values(Epsilon).count(), 1); + QCOMPARE(nfa.states.at(finalA).transitions.value(Epsilon), initialB); + + QCOMPARE(nfa.states.at(finalB).transitions.values(Epsilon).count(), 1); + QCOMPARE(nfa.states.at(finalB).transitions.value(Epsilon), nfa.finalState); +} + +void tst_LexGen::nfa_optional() +{ + NFA a = NFA::createSingleInputNFA('a'); + NFA nfa = NFA::createOptionalNFA(a); + + const int initialA = 1; + const int finalA = 2; + + QCOMPARE(nfa.states.count(), 4); + + QCOMPARE(nfa.initialState, 0); + QCOMPARE(nfa.finalState, 3); + + QCOMPARE(nfa.states.at(0).transitions.count(), 2); + QList<int> initialTransitions = nfa.states.at(0).transitions.values(Epsilon); + QVERIFY(initialTransitions.contains(nfa.finalState)); + QVERIFY(initialTransitions.contains(initialA)); + + QList<int> finalEpsilonATransitions = nfa.states.at(finalA).transitions.values(Epsilon); + QVERIFY(finalEpsilonATransitions.contains(initialA)); + QVERIFY(finalEpsilonATransitions.contains(nfa.finalState)); +} + +Q_DECLARE_METATYPE(NFA); +Q_DECLARE_METATYPE(DFA); + +void tst_LexGen::nfa_toDFA_data() +{ + QTest::addColumn<NFA>("nfa"); + QTest::addColumn<DFA>("expectedDFA"); + + NFA a = NFA::createSingleInputNFA('a'); + NFA b = NFA::createSingleInputNFA('b'); + NFA c = NFA::createSingleInputNFA('c'); + + NFA nfa; + DFA dfa; + + dfa.clear(); + dfa.resize(3); + dfa[0].transitions.insert('a', 1); + dfa[1].transitions.insert('b', 2); + + nfa = NFA::createConcatenatingNFA(a, b); + + QTest::newRow("simple concat") << nfa << dfa; + + dfa.clear(); + dfa.resize(3); + dfa[0].transitions.insert('a', 1); + dfa[0].transitions.insert('b', 2); + + nfa = NFA::createAlternatingNFA(a, b); + + QTest::newRow("simple alternate") << nfa << dfa; + +} + +void tst_LexGen::nfa_toDFA() +{ + QFETCH(NFA, nfa); + QFETCH(DFA, expectedDFA); + + DFA dfa = nfa.toDFA(); + + QCOMPARE(dfa.count(), expectedDFA.count()); + for (int i = 0; i < dfa.count(); ++i) { + if (dfa.at(i).transitions != expectedDFA.at(i).transitions) { + qDebug() << "DFAs differ in state" << i; + qDebug() << "NFA:"; + nfa.debug(); + qDebug() << "Actual DFA:"; + dfa.debug(); + qDebug() << "Expected DFA:"; + expectedDFA.debug(); + QVERIFY(false); + } + } +} + +void tst_LexGen::lexgen_data() +{ + QTest::addColumn<QString>("ruleFile"); + QTest::addColumn<QString>("input"); + QTest::addColumn<QString>("expectedOutput"); + + QDir d(QString(SRCDIR)); + d.cd("testdata"); + foreach (QString test, d.entryList(QDir::Dirs | QDir::NoDotAndDotDot)) { + QString dir = d.absoluteFilePath(test) + '/'; + QTest::newRow(qPrintable(test)) + << dir + "rules.lexgen" + << dir + "input" + << dir + "output" + ; + } +} + +void tst_LexGen::lexgen() +{ + QFETCH(QString, ruleFile); + QFETCH(QString, input); + QFETCH(QString, expectedOutput); + + Config conf; + QVERIFY(loadConfig(ruleFile, &conf)); + DFA dfa = generateMachine(conf); + QVERIFY(!dfa.isEmpty()); + conf.debug = true; + + QFile f(input); + QVERIFY(f.open(QIODevice::ReadOnly)); + input = QString::fromUtf8(f.readAll()); + f.close(); + if (input.endsWith(QLatin1Char('\n'))) + input.chop(1); +// machine.debug(); + bool ok = false; + QList<Symbol> symbols = tokenize(dfa, input, &conf, &ok); + QVERIFY(ok); + f.setFileName(expectedOutput); + QVERIFY(f.open(QIODevice::ReadOnly)); + QStringList lines; + while (!f.atEnd()) { + QString line = QString::fromUtf8(f.readLine()); + if (line.endsWith(QLatin1Char('\n'))) + line.chop(1); + lines << line; + } + f.close(); + +// dfa.debug(); + QCOMPARE(lines.count(), symbols.count()); + + for (int i = 0; i < lines.count(); ++i) { + QStringList l = lines.at(i).split(QChar::fromLatin1('|')); + QCOMPARE(l.count(), 2); + QString expectedToken = l.at(0); + QString expectedLexem = l.at(1); + QCOMPARE(symbols.at(i).token, expectedToken); + QCOMPARE(symbols.at(i).lexem, expectedLexem); + } +} + +QTEST_MAIN(tst_LexGen) +#include "tst_lexgen.moc" diff --git a/util/lexgen/tokenizer.cpp b/util/lexgen/tokenizer.cpp new file mode 100644 index 0000000..401fa92 --- /dev/null +++ b/util/lexgen/tokenizer.cpp @@ -0,0 +1,237 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** Contact: Qt Software Information (qt-info@nokia.com) +** +** This file is part of the utils of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the either Technology Preview License Agreement or the +** Beta Release License Agreement. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain +** additional rights. These rights are described in the Nokia Qt LGPL +** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this +** package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. +** +** If you are unsure which license is appropriate for your use, please +** contact the sales department at qt-sales@nokia.com. +** $QT_END_LICENSE$ +** +****************************************************************************/ +// auto generated. DO NOT EDIT. +class RegExpTokenizer +{ +public: + RegExpTokenizer(const QString &inp); + + inline QChar next() { + return (pos < input.length()) ? input.at(pos++) : QChar(); + } + int lex(); + + QString input; + int pos; + int lexemStart; + int lexemLength; +}; + +RegExpTokenizer::RegExpTokenizer(const QString &inp) +{ + input = inp; + pos = 0; + lexemStart = 0; + lexemLength = 0; +} + + +int RegExpTokenizer::lex() +{ + lexemStart = pos; + lexemLength = 0; + int lastAcceptingPos = -1; + int token = -1; + QChar ch; + + // initial state + ch = next(); + if (ch.unicode() >= 1 && ch.unicode() <= 33) + goto state_1; + if (ch.unicode() == 34) + goto state_2; + if (ch.unicode() >= 35 && ch.unicode() <= 39) + goto state_1; + if (ch.unicode() == 40) { + token = RE2NFA::TOK_LPAREN; + goto found; + } + if (ch.unicode() == 41) { + token = RE2NFA::TOK_RPAREN; + goto found; + } + if (ch.unicode() == 42) { + token = RE2NFA::TOK_STAR; + goto found; + } + if (ch.unicode() == 43) { + token = RE2NFA::TOK_PLUS; + goto found; + } + if (ch.unicode() == 44) { + token = RE2NFA::TOK_COMMA; + goto found; + } + if (ch.unicode() == 45) + goto state_1; + if (ch.unicode() == 46) { + token = RE2NFA::TOK_DOT; + goto found; + } + if (ch.unicode() >= 47 && ch.unicode() <= 62) + goto state_1; + if (ch.unicode() == 63) { + token = RE2NFA::TOK_QUESTION; + goto found; + } + if (ch.unicode() >= 64 && ch.unicode() <= 90) + goto state_1; + if (ch.unicode() == 91) + goto state_10; + if (ch.unicode() == 92) + goto state_11; + if (ch.unicode() >= 93 && ch.unicode() <= 122) + goto state_1; + if (ch.unicode() == 123) { + token = RE2NFA::TOK_LBRACE; + goto found; + } + if (ch.unicode() == 124) { + token = RE2NFA::TOK_OR; + goto found; + } + if (ch.unicode() == 125) { + token = RE2NFA::TOK_RBRACE; + goto found; + } + if (ch.unicode() >= 126) + goto state_1; + goto out; + state_1: + lastAcceptingPos = pos; + token = RE2NFA::TOK_STRING; + goto out; + state_2: + lastAcceptingPos = pos; + token = RE2NFA::TOK_STRING; + ch = next(); + if (ch.unicode() >= 1 && ch.unicode() <= 33) + goto state_15; + if (ch.unicode() == 34) + goto state_16; + if (ch.unicode() >= 35) + goto state_15; + goto out; + state_10: + ch = next(); + if (ch.unicode() >= 1 && ch.unicode() <= 91) + goto state_17; + if (ch.unicode() == 92) + goto state_18; + if (ch.unicode() == 93) + goto state_19; + if (ch.unicode() >= 94) + goto state_17; + goto out; + state_11: + lastAcceptingPos = pos; + token = RE2NFA::TOK_STRING; + ch = next(); + if (ch.unicode() >= 1) + goto state_20; + goto out; + state_15: + ch = next(); + if (ch.unicode() >= 1 && ch.unicode() <= 33) + goto state_15; + if (ch.unicode() == 34) + goto state_16; + if (ch.unicode() >= 35) + goto state_15; + goto out; + state_16: + lastAcceptingPos = pos; + token = RE2NFA::TOK_QUOTED_STRING; + goto out; + state_17: + ch = next(); + if (ch.unicode() >= 1 && ch.unicode() <= 91) + goto state_17; + if (ch.unicode() == 92) + goto state_18; + if (ch.unicode() == 93) + goto state_19; + if (ch.unicode() >= 94) + goto state_17; + goto out; + state_18: + ch = next(); + if (ch.unicode() >= 1 && ch.unicode() <= 91) + goto state_17; + if (ch.unicode() == 92) + goto state_18; + if (ch.unicode() == 93) + goto state_21; + if (ch.unicode() >= 94) + goto state_17; + goto out; + state_19: + lastAcceptingPos = pos; + token = RE2NFA::TOK_SEQUENCE; + goto out; + state_20: + lastAcceptingPos = pos; + token = RE2NFA::TOK_STRING; + goto out; + state_21: + lastAcceptingPos = pos; + token = RE2NFA::TOK_SEQUENCE; + ch = next(); + if (ch.unicode() >= 1 && ch.unicode() <= 91) + goto state_17; + if (ch.unicode() == 92) + goto state_18; + if (ch.unicode() == 93) + goto state_19; + if (ch.unicode() >= 94) + goto state_17; + goto out; + found: + lastAcceptingPos = pos; + + out: + if (lastAcceptingPos != -1) { + lexemLength = lastAcceptingPos - lexemStart; + pos = lastAcceptingPos; + } + return token; +} + |