From 644997f85f327d3f8456f1c5cf0cde78a3bd1d28 Mon Sep 17 00:00:00 2001 From: Dimitri van Heesch Date: Sat, 5 Dec 2020 11:46:11 +0100 Subject: Refactoring: replace QXml by own XML processor --- src/CMakeLists.txt | 3 + src/xml.h | 64 +++++++ src/xml.l | 481 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 548 insertions(+) create mode 100644 src/xml.h create mode 100644 src/xml.l diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4f4d132..912c67e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -122,6 +122,7 @@ set(LEX_FILES scanner commentcnv commentscan constexp + xml xmlcode sqlcode configimpl) @@ -197,6 +198,7 @@ add_library(doxymain STATIC ${GENERATED_SRC}/scanner.l.h ${GENERATED_SRC}/sqlcode.l.h ${GENERATED_SRC}/vhdlcode.l.h + ${GENERATED_SRC}/xml.l.h ${GENERATED_SRC}/xmlcode.l.h ${GENERATED_SRC}/code.cpp ${GENERATED_SRC}/commentcnv.cpp @@ -212,6 +214,7 @@ add_library(doxymain STATIC ${GENERATED_SRC}/scanner.cpp ${GENERATED_SRC}/sqlcode.cpp ${GENERATED_SRC}/vhdlcode.cpp + ${GENERATED_SRC}/xml.cpp ${GENERATED_SRC}/xmlcode.cpp # ${GENERATED_SRC}/ce_parse.cpp diff --git a/src/xml.h b/src/xml.h new file mode 100644 index 0000000..6ba36b2 --- /dev/null +++ b/src/xml.h @@ -0,0 +1,64 @@ +#ifndef XML_H +#define XML_H + +#include +#include +#include +#include + +/*! @brief Event handlers that can be installed by the client and are called while parsing an XML document. 
+ */ +class XMLHandlers +{ + public: + using Attributes = std::unordered_map; + using StartDocType = void(); + using EndDocType = void(); + using StartElementType = void(const std::string &,const Attributes &); + using EndElementType = void(const std::string &); + using ErrorType = void(const std::string,int,const std::string &); + using CharsType = void(const std::string &); + + std::function startDocument; /**< handler invoked at the start of the document */ + std::function endDocument; /**< handler invoked at the end of the document */ + std::function startElement; /**< handler invoked when an opening tag has been found */ + std::function endElement; /**< handler invoked when a closing tag has been found */ + std::function characters; /**< handler invoked when content between tags has been found */ + std::function error; /**< handler invoked when the parser encounters an error */ +}; + +/*! Abstract interface for querying a line number and a file name. */ +class XMLLocator +{ + public: + virtual ~XMLLocator() {} + virtual int lineNr() const = 0; + virtual std::string fileName() const = 0; +}; + +/*! Very basic SAX style parser to parse XML documents. */ +class XMLParser : public XMLLocator +{ + public: + /*! Creates an instance of the parser object. Different instances can run on different + * threads without interference. + * + * @param handlers The event handlers passed by the client. + */ + XMLParser(const XMLHandlers &handlers); + /*! Destructor */ + ~XMLParser(); + + /*! Parses a file given the contents of the file as a string. + * @param fileName the name of the file, used for error reporting. + * @param inputString the contents of the file as a zero terminated UTF-8 string. 
+ */ + void parse(const char *fileName,const char *inputString); + + private: + virtual int lineNr() const override; + virtual std::string fileName() const override; + struct Private; + std::unique_ptr p; +}; + +#endif diff --git a/src/xml.l b/src/xml.l new file mode 100644 index 0000000..64fc6c3 --- /dev/null +++ b/src/xml.l @@ -0,0 +1,481 @@ +/****************************************************************************** + * + * Copyright (C) 1997-2020 by Dimitri van Heesch. + * + * Permission to use, copy, modify, and distribute this software and its + * documentation under the terms of the GNU General Public License is hereby + * granted. No representations are made about the suitability of this software + * for any purpose. It is provided "as is" without express or implied warranty. + * See the GNU General Public License for more details. + * + * Documents produced by Doxygen are derivative works derived from the + * input used in their production; they are not affected by this license. 
+ * + */ +/****************************************************************************** + * Minimal flex based parser for XML + ******************************************************************************/ + +%option never-interactive +%option prefix="xmlYY" +%option reentrant +%option extra-type="struct xmlYY_state *" +%option 8bit noyywrap +%top{ +#include +} + +%{ + +#include +#include +#include +#include "xml.h" +#include "message.h" + +#define YY_NEVER_INTERACTIVE 1 +#define YY_NO_INPUT 1 +#define YY_NO_UNISTD_H 1 + +struct xmlYY_state +{ + std::string fileName; + int lineNr = 1; + const char * inputString = 0; //!< the contents of the input file as text + yy_size_t inputPosition = 0; //!< read offset during parsing + std::string name; + bool isEnd = false; + bool selfClose = false; + std::string data; + std::string attrValue; + std::string attrName; + XMLHandlers::Attributes attrs; + XMLHandlers handlers; + int cdataContext; + int commentContext; + char stringChar; + std::vector xpath; +}; + +#if USE_STATE2STRING +static const char *stateToString(int state); +#endif + +static yy_size_t yyread(yyscan_t yyscanner,char *buf,yy_size_t max_size); +static void initElement(yyscan_t yyscanner); +static void addCharacters(yyscan_t yyscanner); +static void addElement(yyscan_t yyscanner); +static void addAttribute(yyscan_t yyscanner); +static void countLines(yyscan_t yyscanner, const char *txt,yy_size_t len); +static void reportError(yyscan_t yyscanner, const std::string &msg); +static std::string processData(yyscan_t yyscanner,const char *txt,yy_size_t len); + +#undef YY_INPUT +#define YY_INPUT(buf,result,max_size) result=yyread(yyscanner,buf,max_size); + +%} + +NL (\r\n|\r|\n) +SP [ \t\r\n]+ +OPEN {SP}?"<" +OPENSPECIAL {SP}?""{NL}? +CLOSESPECIAL "?>"{NL}? 
+NAMESTART [:A-Za-z\200-\377_] +NAMECHAR [:A-Za-z\200-\377_0-9.-] +NAME {NAMESTART}{NAMECHAR}* +ESC "&#"[0-9]+";"|"&#x"[0-9a-fA-F]+";" +COLON ":" +PCDATA [^<]+ +COMMENT {OPEN}"!--" +COMMENTEND "--"{CLOSE} +STRING \"([^"&]|{ESC})*\"|\'([^'&]|{ESC})*\' +DOCTYPE {SP}?"" + +%option noyywrap + +%s Initial +%s Content +%s CDataSection +%s Element +%s Attributes +%s AttributeValue +%s AttrValueStr +%s Prolog +%s Comment + +%% + +{ + {SP} { countLines(yyscanner,yytext,yyleng); } + {DOCTYPE} { countLines(yyscanner,yytext,yyleng); } + {OPENSPECIAL} { countLines(yyscanner,yytext,yyleng); BEGIN(Prolog); } + {OPEN} { countLines(yyscanner,yytext,yyleng); + initElement(yyscanner); + BEGIN(Element); } + {COMMENT} { yyextra->commentContext = YY_START; + BEGIN(Comment); + } +} +{ + {CDATA} { countLines(yyscanner,yytext,yyleng); + yyextra->cdataContext = YY_START; + BEGIN(CDataSection); + } + {PCDATA} { yyextra->data += processData(yyscanner,yytext,yyleng); } + {OPEN} { countLines(yyscanner,yytext,yyleng); + addCharacters(yyscanner); + initElement(yyscanner); + BEGIN(Element); + } + {COMMENT} { yyextra->commentContext = YY_START; + BEGIN(Comment); + } +} +{ + "/" { yyextra->isEnd = true; } + {NAME} { yyextra->name = yytext; + BEGIN(Attributes); } + {CLOSE} { addElement(yyscanner); + countLines(yyscanner,yytext,yyleng); + yyextra->data = ""; + BEGIN(Content); + } + {SP} { countLines(yyscanner,yytext,yyleng); } +} +{ + "/" { yyextra->selfClose = true; } + {NAME} { yyextra->attrName = yytext; } + "=" { BEGIN(AttributeValue); } + {CLOSE} { addElement(yyscanner); + countLines(yyscanner,yytext,yyleng); + yyextra->data = ""; + BEGIN(Content); + } + {SP} { countLines(yyscanner,yytext,yyleng); } +} +{ + {SP} { countLines(yyscanner,yytext,yyleng); } + ['"] { yyextra->stringChar = *yytext; + yyextra->attrValue = ""; + BEGIN(AttrValueStr); + } + . { std::string msg = std::string("Missing attribute value. 
Unexpected character `")+yytext+"` found"; + reportError(yyscanner,msg); + unput(*yytext); + BEGIN(Attributes); + } +} +{ + [^'"\n]+ { yyextra->attrValue += processData(yyscanner,yytext,yyleng); } + ['"] { if (*yytext==yyextra->stringChar) + { + addAttribute(yyscanner); + BEGIN(Attributes); + } + else + { + yyextra->attrValue += processData(yyscanner,yytext,yyleng); + } + } + \n { yyextra->lineNr++; yyextra->attrValue+=' '; } +} +{ + {ENDCDATA} { BEGIN(yyextra->cdataContext); } + [^]\n]+ { yyextra->data += yytext; } + \n { yyextra->data += yytext; + yyextra->lineNr++; + } + . { yyextra->data += yytext; } +} +{ + {CLOSESPECIAL} { countLines(yyscanner,yytext,yyleng); + BEGIN(Initial); + } + [^?\n]+ { } + \n { yyextra->lineNr++; } + . { } +} +{ + {COMMENTEND} { BEGIN(yyextra->commentContext); } + [^\n-]+ { } + \n { yyextra->lineNr++; } + . { } +} +\n { yyextra->lineNr++; } +. { std::string msg = "Unexpected character `"; + msg+=yytext; + msg+="` found"; + reportError(yyscanner,msg); + } + +%% + +//---------------------------------------------------------------------------------------- + +/*! Copies at most max_size bytes from the unread part of the input string into buf, stopping at the terminating zero byte; advances the read offset and returns the number of bytes copied. */ +static yy_size_t yyread(yyscan_t yyscanner,char *buf,size_t max_size) +{ + struct yyguts_t *yyg = (struct yyguts_t*)yyscanner; + yy_size_t inputPosition = yyextra->inputPosition; + const char *s = yyextra->inputString + inputPosition; + yy_size_t c=0; + while( c < max_size && *s) + { + *buf++ = *s++; + c++; + } + yyextra->inputPosition += c; + return c; +} + +static void countLines(yyscan_t yyscanner, const char *txt,yy_size_t len) +{ + struct yyguts_t *yyg = (struct yyguts_t*)yyscanner; + for (yy_size_t i=0;ilineNr++; + } +} + +/*! Resets the end-tag and self-close flags, the tag name and the attribute map; called from the scanner rules when the opening angle bracket of a tag is matched. 
*/ +static void initElement(yyscan_t yyscanner) +{ + struct yyguts_t *yyg = (struct yyguts_t*)yyscanner; + yyextra->isEnd = false; // true => closing tag </tag> + yyextra->selfClose = false; // true => self-closing tag <tag/> + yyextra->name = ""; + yyextra->attrs.clear(); +} + +/*! Checks the current tag name against the innermost open element: reports an error when no element is open or when the names differ, otherwise pops the element from the path. */ +static void checkAndUpdatePath(yyscan_t yyscanner) +{ + struct yyguts_t *yyg = (struct yyguts_t*)yyscanner; + if 
(yyextra->xpath.empty()) + { + std::string msg = "found closing tag '"+yyextra->name+"' without matching opening tag"; + reportError(yyscanner,msg); + } + else + { + std::string expectedTagName = yyextra->xpath.back(); + if (expectedTagName!=yyextra->name) + { + std::string msg = "Found closing tag '"+yyextra->name+"' that does not match the opening tag '"+expectedTagName+"' at the same level"; + reportError(yyscanner,msg); + } + else // matching end tag + { + yyextra->xpath.pop_back(); + } + } +} + +/*! Emits the events for the tag that was just scanned: for a tag that is not an end tag the name is pushed onto the path and the startElement handler is invoked; for an end tag or a self-closing tag the path is validated and the endElement handler is invoked. Handlers are only called when installed. */ +static void addElement(yyscan_t yyscanner) +{ + struct yyguts_t *yyg = (struct yyguts_t*)yyscanner; + if (!yyextra->isEnd) + { + yyextra->xpath.push_back(yyextra->name); + if (yyextra->handlers.startElement) + { + yyextra->handlers.startElement(yyextra->name,yyextra->attrs); + } + if (yy_flex_debug) + { + fprintf(stderr,"startElement(%s,attr=[",yyextra->name.data()); + for (auto attr : yyextra->attrs) + { + fprintf(stderr,"%s='%s' ",attr.first.c_str(),attr.second.c_str()); + } + fprintf(stderr,"])\n"); + } + } + if (yyextra->isEnd || yyextra->selfClose) + { + if (yy_flex_debug) + { + fprintf(stderr,"endElement(%s)\n",yyextra->name.data()); + } + checkAndUpdatePath(yyscanner); + if (yyextra->handlers.endElement) + { + yyextra->handlers.endElement(yyextra->name); + } + } +} + +static std::string trimSpaces(const std::string &str) +{ + const int l = static_cast(str.length()); + int s=0, e=l-1; + while (ss && isspace(str.at(e))) e--; + return str.substr(s,1+e-s); +} + +/*! Passes the collected character data, after trimSpaces has been applied, to the characters handler if one is installed; the handler is invoked even when the trimmed data is empty. 
*/ +static void addCharacters(yyscan_t yyscanner) +{ + struct yyguts_t *yyg = (struct yyguts_t*)yyscanner; + std::string data = trimSpaces(yyextra->data); + if (yyextra->handlers.characters) + { + yyextra->handlers.characters(data); + } + if (!data.empty()) + { + if (yy_flex_debug) + { + fprintf(stderr,"characters(%s)\n",data.c_str()); + } + } +} + +/*! Inserts the current attribute name and value as a pair into the attribute map of the element being scanned. */ +static void addAttribute(yyscan_t yyscanner) +{ + struct yyguts_t *yyg = (struct yyguts_t*)yyscanner; + 
yyextra->attrs.insert(std::make_pair(yyextra->attrName,yyextra->attrValue)); +} + +/*! Forwards msg together with the current file name and line number to the error handler if one is installed; the message is also printed to stderr when flex debugging is enabled. */ +static void reportError(yyscan_t yyscanner,const std::string &msg) +{ + struct yyguts_t *yyg = (struct yyguts_t*)yyscanner; + if (yy_flex_debug) + { + fprintf(stderr,"%s:%d: Error '%s'\n",yyextra->fileName.c_str(),yyextra->lineNr,msg.c_str()); + } + if (yyextra->handlers.error) + { + yyextra->handlers.error(yyextra->fileName,yyextra->lineNr,msg); + } +} + +static const char *entities_enc[] = { "amp", "quot", "gt", "lt", "apos" }; +static const char entities_dec[] = { '&', '"', '>', '<', '\'' }; +static const int num_entities = 5; + +// replace character entities such as &amp; in txt and return the string where entities +// are replaced +static std::string processData(yyscan_t yyscanner,const char *txt,yy_size_t len) +{ + std::string result; + result.reserve(len); + for (yy_size_t i=0; ixmlYY_extra,&p->yyscanner); + p->xmlYY_extra.handlers = handlers; +} + +XMLParser::~XMLParser() +{ + xmlYYlex_destroy(p->yyscanner); +} + +void XMLParser::parse(const char *fileName,const char *inputStr) +{ + yyscan_t yyscanner = p->yyscanner; + struct yyguts_t *yyg = (struct yyguts_t*)yyscanner; + +#ifdef FLEX_DEBUG + xmlYYset_debug(1,p->yyscanner); +#endif + + if (inputStr==nullptr || inputStr[0]=='\0') return; // empty input + + printlex(yy_flex_debug, true, __FILE__, fileName); + + BEGIN(Initial); + yyextra->fileName = fileName; + yyextra->lineNr = 1; + yyextra->inputString = inputStr; + yyextra->inputPosition = 0; + + xmlYYrestart( 0, yyscanner ); + + if (yyextra->handlers.startDocument) + { + yyextra->handlers.startDocument(); + } + xmlYYlex(yyscanner); + if (yyextra->handlers.endDocument) + { + yyextra->handlers.endDocument(); + } + + if (!yyextra->xpath.empty()) + { + std::string tagName = yyextra->xpath.back(); + std::string msg = "End of file reached while expecting closing tag '"+tagName+"'"; + reportError(yyscanner,msg); + } + + printlex(yy_flex_debug, false, __FILE__, fileName); +} + +int 
XMLParser::lineNr() const +{ + struct yyguts_t *yyg = (struct yyguts_t*)p->yyscanner; + return yyextra->lineNr; +} + +std::string XMLParser::fileName() const +{ + struct yyguts_t *yyg = (struct yyguts_t*)p->yyscanner; + return yyextra->fileName; +} + +#if USE_STATE2STRING +#include "xml.l.h" +#endif -- cgit v0.12