summaryrefslogtreecommitdiffstats
path: root/src/xml.l
diff options
context:
space:
mode:
Diffstat (limited to 'src/xml.l')
-rw-r--r--src/xml.l484
1 files changed, 484 insertions, 0 deletions
diff --git a/src/xml.l b/src/xml.l
new file mode 100644
index 0000000..ace35d5
--- /dev/null
+++ b/src/xml.l
@@ -0,0 +1,484 @@
+/******************************************************************************
+ *
+ * Copyright (C) 1997-2020 by Dimitri van Heesch.
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation under the terms of the GNU General Public License is hereby
+ * granted. No representations are made about the suitability of this software
+ * for any purpose. It is provided "as is" without express or implied warranty.
+ * See the GNU General Public License for more details.
+ *
+ * Documents produced by Doxygen are derivative works derived from the
+ * input used in their production; they are not affected by this license.
+ *
+ */
+/******************************************************************************
+ * Minimal flex based parser for XML
+ ******************************************************************************/
+
+%option never-interactive
+%option prefix="xmlYY"
+%option reentrant
+%option extra-type="struct xmlYY_state *"
+%option 8bit noyywrap
+%top{
+#include <stdint.h>
+}
+
+%{
+
+#include <ctype.h>
+#include <vector>
+#include <stdio.h>
+#include "xml.h"
+#include "message.h"
+
+#define YY_NEVER_INTERACTIVE 1
+#define YY_NO_INPUT 1
+#define YY_NO_UNISTD_H 1
+
+struct xmlYY_state
+{
+ std::string fileName;
+ int lineNr = 1;
+ const char * inputString = 0; //!< the code fragment as text
+ yy_size_t inputPosition = 0; //!< read offset during parsing
+ std::string name;
+ bool isEnd = false;
+ bool selfClose = false;
+ std::string data;
+ std::string attrValue;
+ std::string attrName;
+ XMLHandlers::Attributes attrs;
+ XMLHandlers handlers;
+ int cdataContext;
+ int commentContext;
+ char stringChar;
+ std::vector<std::string> xpath;
+};
+
+#if USE_STATE2STRING
+static const char *stateToString(int state);
+#endif
+
+static yy_size_t yyread(yyscan_t yyscanner,char *buf,yy_size_t max_size);
+static void initElement(yyscan_t yyscanner);
+static void addCharacters(yyscan_t yyscanner);
+static void addElement(yyscan_t yyscanner);
+static void addAttribute(yyscan_t yyscanner);
+static void countLines(yyscan_t yyscanner, const char *txt,yy_size_t len);
+static void reportError(yyscan_t yyscanner, const std::string &msg);
+static std::string processData(yyscan_t yyscanner,const char *txt,yy_size_t len);
+
+#undef YY_INPUT
+#define YY_INPUT(buf,result,max_size) result=yyread(yyscanner,buf,max_size);
+
+%}
+
+NL (\r\n|\r|\n)
+SP [ \t\r\n]+
+OPEN {SP}?"<"
+OPENSPECIAL {SP}?"<?"
+CLOSE ">"{NL}?
+CLOSESPECIAL "?>"{NL}?
+NAMESTART [:A-Za-z\200-\377_]
+NAMECHAR [:A-Za-z\200-\377_0-9.-]
+NAME {NAMESTART}{NAMECHAR}*
+ESC "&#"[0-9]+";"|"&#x"[0-9a-fA-F]+";"
+COLON ":"
+PCDATA [^<]+
+COMMENT {OPEN}"!--"
+COMMENTEND "--"{CLOSE}
+STRING \"([^"&]|{ESC})*\"|\'([^'&]|{ESC})*\'
+DOCTYPE {SP}?"<!DOCTYPE"{SP}
+CDATA {SP}?"<![CDATA["
+ENDCDATA "]]>"
+
+%option noyywrap
+
+%s Initial
+%s Content
+%s CDataSection
+%s Element
+%s Attributes
+%s AttributeValue
+%s AttrValueStr
+%s Prolog
+%s Comment
+
+%%
+
+<Initial>{
+ {SP} { countLines(yyscanner,yytext,yyleng); }
+ {DOCTYPE} { countLines(yyscanner,yytext,yyleng); }
+ {OPENSPECIAL} { countLines(yyscanner,yytext,yyleng); BEGIN(Prolog); }
+ {OPEN} { countLines(yyscanner,yytext,yyleng);
+ initElement(yyscanner);
+ BEGIN(Element); }
+ {COMMENT} { yyextra->commentContext = YY_START;
+ BEGIN(Comment);
+ }
+}
+<Content>{
+ {CDATA} { countLines(yyscanner,yytext,yyleng);
+ yyextra->cdataContext = YY_START;
+ BEGIN(CDataSection);
+ }
+ {PCDATA} { yyextra->data += processData(yyscanner,yytext,yyleng); }
+ {OPEN} { countLines(yyscanner,yytext,yyleng);
+ addCharacters(yyscanner);
+ initElement(yyscanner);
+ BEGIN(Element);
+ }
+ {COMMENT} { yyextra->commentContext = YY_START;
+ countLines(yyscanner,yytext,yyleng);
+ BEGIN(Comment);
+ }
+}
+<Element>{
+ "/" { yyextra->isEnd = true; }
+ {NAME} { yyextra->name = yytext;
+ BEGIN(Attributes); }
+ {CLOSE} { addElement(yyscanner);
+ countLines(yyscanner,yytext,yyleng);
+ yyextra->data = "";
+ BEGIN(Content);
+ }
+ {SP} { countLines(yyscanner,yytext,yyleng); }
+}
+<Attributes>{
+ "/" { yyextra->selfClose = true; }
+ {NAME} { yyextra->attrName = yytext; }
+ "=" { BEGIN(AttributeValue); }
+ {CLOSE} { addElement(yyscanner);
+ countLines(yyscanner,yytext,yyleng);
+ yyextra->data = "";
+ BEGIN(Content);
+ }
+ {SP} { countLines(yyscanner,yytext,yyleng); }
+}
+<AttributeValue>{
+ {SP} { countLines(yyscanner,yytext,yyleng); }
+ ['"] { yyextra->stringChar = *yytext;
+ yyextra->attrValue = "";
+ BEGIN(AttrValueStr);
+ }
+ . { std::string msg = std::string("Missing attribute value. Unexpected character `")+yytext+"` found";
+ reportError(yyscanner,msg);
+ unput(*yytext);
+ BEGIN(Attributes);
+ }
+}
+<AttrValueStr>{
+ [^'"\n]+ { yyextra->attrValue += processData(yyscanner,yytext,yyleng); }
+ ['"] { if (*yytext==yyextra->stringChar)
+ {
+ addAttribute(yyscanner);
+ BEGIN(Attributes);
+ }
+ else
+ {
+ yyextra->attrValue += processData(yyscanner,yytext,yyleng);
+ }
+ }
+ \n { yyextra->lineNr++; yyextra->attrValue+=' '; }
+}
+<CDataSection>{
+ {ENDCDATA} { BEGIN(yyextra->cdataContext); }
+ [^]\n]+ { yyextra->data += yytext; }
+ \n { yyextra->data += yytext;
+ yyextra->lineNr++;
+ }
+ . { yyextra->data += yytext; }
+}
+<Prolog>{
+ {CLOSESPECIAL} { countLines(yyscanner,yytext,yyleng);
+ BEGIN(Initial);
+ }
+ [^?\n]+ { }
+ \n { yyextra->lineNr++; }
+ . { }
+}
+<Comment>{
+ {COMMENTEND} { countLines(yyscanner,yytext,yyleng);
+ BEGIN(yyextra->commentContext);
+ }
+ [^\n-]+ { }
+ \n { yyextra->lineNr++; }
+ . { }
+}
+\n { yyextra->lineNr++; }
+. { std::string msg = "Unexpected character `";
+ msg+=yytext;
+ msg+="` found";
+ reportError(yyscanner,msg);
+ }
+
+%%
+
+//----------------------------------------------------------------------------------------
+
+static yy_size_t yyread(yyscan_t yyscanner,char *buf,size_t max_size)
+{
+ struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
+ yy_size_t inputPosition = yyextra->inputPosition;
+ const char *s = yyextra->inputString + inputPosition;
+ yy_size_t c=0;
+ while( c < max_size && *s)
+ {
+ *buf++ = *s++;
+ c++;
+ }
+ yyextra->inputPosition += c;
+ return c;
+}
+
+static void countLines(yyscan_t yyscanner, const char *txt,yy_size_t len)
+{
+ struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
+ for (yy_size_t i=0;i<len;i++)
+ {
+ if (txt[i]=='\n') yyextra->lineNr++;
+ }
+}
+
+static void initElement(yyscan_t yyscanner)
+{
+ struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
+ yyextra->isEnd = false; // true => </tag>
+ yyextra->selfClose = false; // true => <tag/>
+ yyextra->name = "";
+ yyextra->attrs.clear();
+}
+
+static void checkAndUpdatePath(yyscan_t yyscanner)
+{
+ struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
+ if (yyextra->xpath.empty())
+ {
+ std::string msg = "found closing tag '"+yyextra->name+"' without matching opening tag";
+ reportError(yyscanner,msg);
+ }
+ else
+ {
+ std::string expectedTagName = yyextra->xpath.back();
+ if (expectedTagName!=yyextra->name)
+ {
+ std::string msg = "Found closing tag '"+yyextra->name+"' that does not match the opening tag '"+expectedTagName+"' at the same level";
+ reportError(yyscanner,msg);
+ }
+ else // matching end tag
+ {
+ yyextra->xpath.pop_back();
+ }
+ }
+}
+
+static void addElement(yyscan_t yyscanner)
+{
+ struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
+ if (!yyextra->isEnd)
+ {
+ yyextra->xpath.push_back(yyextra->name);
+ if (yyextra->handlers.startElement)
+ {
+ yyextra->handlers.startElement(yyextra->name,yyextra->attrs);
+ }
+ if (yy_flex_debug)
+ {
+ fprintf(stderr,"%d: startElement(%s,attr=[",yyextra->lineNr,yyextra->name.data());
+ for (auto attr : yyextra->attrs)
+ {
+ fprintf(stderr,"%s='%s' ",attr.first.c_str(),attr.second.c_str());
+ }
+ fprintf(stderr,"])\n");
+ }
+ }
+ if (yyextra->isEnd || yyextra->selfClose)
+ {
+ if (yy_flex_debug)
+ {
+ fprintf(stderr,"%d: endElement(%s)\n",yyextra->lineNr,yyextra->name.data());
+ }
+ checkAndUpdatePath(yyscanner);
+ if (yyextra->handlers.endElement)
+ {
+ yyextra->handlers.endElement(yyextra->name);
+ }
+ }
+}
+
+static std::string trimSpaces(const std::string &str)
+{
+ const int l = static_cast<int>(str.length());
+ int s=0, e=l-1;
+ while (s<l && isspace(str.at(s))) s++;
+ while (e>s && isspace(str.at(e))) e--;
+ return str.substr(s,1+e-s);
+}
+
+static void addCharacters(yyscan_t yyscanner)
+{
+ struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
+ std::string data = trimSpaces(yyextra->data);
+ if (yyextra->handlers.characters)
+ {
+ yyextra->handlers.characters(data);
+ }
+ if (!data.empty())
+ {
+ if (yy_flex_debug)
+ {
+ fprintf(stderr,"characters(%s)\n",data.c_str());
+ }
+ }
+}
+
+static void addAttribute(yyscan_t yyscanner)
+{
+ struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
+ yyextra->attrs.insert(std::make_pair(yyextra->attrName,yyextra->attrValue));
+}
+
+static void reportError(yyscan_t yyscanner,const std::string &msg)
+{
+ struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
+ if (yy_flex_debug)
+ {
+ fprintf(stderr,"%s:%d: Error '%s'\n",yyextra->fileName.c_str(),yyextra->lineNr,msg.c_str());
+ }
+ if (yyextra->handlers.error)
+ {
+ yyextra->handlers.error(yyextra->fileName,yyextra->lineNr,msg);
+ }
+}
+
+static const char *entities_enc[] = { "amp", "quot", "gt", "lt", "apos" };
+static const char entities_dec[] = { '&', '"', '>', '<', '\'' };
+static const int num_entities = 5;
+
+// replace character entities such as &amp; in txt and return the string where entities
+// are replaced
+static std::string processData(yyscan_t yyscanner,const char *txt,yy_size_t len)
+{
+ std::string result;
+ result.reserve(len);
+ for (yy_size_t i=0; i<len; i++)
+ {
+ char c = txt[i];
+ if (c=='&')
+ {
+ const int maxEntityLen = 10;
+ char entity[maxEntityLen+1];
+ entity[maxEntityLen]='\0';
+ for (yy_size_t j=0; j<maxEntityLen && i+j+1<len; j++)
+ {
+ if (txt[i+j+1]!=';')
+ {
+ entity[j]=txt[i+j+1];
+ }
+ else
+ {
+ entity[j]=0;
+ break;
+ }
+ }
+ bool found=false;
+ for (int e=0; !found && e<num_entities; e++)
+ {
+ if (strcmp(entity,entities_enc[e])==0)
+ {
+ result+=entities_dec[e];
+ i+=strlen(entities_enc[e])+1;
+ found=true;
+ }
+ }
+ if (!found)
+ {
+ std::string msg = std::string("Invalid character entity '&") + entity + ";' found\n";
+ reportError(yyscanner,msg);
+ }
+ }
+ else
+ {
+ result+=c;
+ }
+ }
+ return result;
+}
+
+//--------------------------------------------------------------
+
+struct XMLParser::Private
+{
+ yyscan_t yyscanner;
+ struct xmlYY_state xmlYY_extra;
+};
+
+XMLParser::XMLParser(const XMLHandlers &handlers) : p(new Private)
+{
+ xmlYYlex_init_extra(&p->xmlYY_extra,&p->yyscanner);
+ p->xmlYY_extra.handlers = handlers;
+}
+
+XMLParser::~XMLParser()
+{
+ xmlYYlex_destroy(p->yyscanner);
+}
+
+void XMLParser::parse(const char *fileName,const char *inputStr)
+{
+ yyscan_t yyscanner = p->yyscanner;
+ struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
+
+#ifdef FLEX_DEBUG
+ xmlYYset_debug(1,p->yyscanner);
+#endif
+
+ if (inputStr==nullptr || inputStr[0]=='\0') return; // empty input
+
+ printlex(yy_flex_debug, true, __FILE__, fileName);
+
+ BEGIN(Initial);
+ yyextra->fileName = fileName;
+ yyextra->lineNr = 1;
+ yyextra->inputString = inputStr;
+ yyextra->inputPosition = 0;
+
+ xmlYYrestart( 0, yyscanner );
+
+ if (yyextra->handlers.startDocument)
+ {
+ yyextra->handlers.startDocument();
+ }
+ xmlYYlex(yyscanner);
+ if (yyextra->handlers.endDocument)
+ {
+ yyextra->handlers.endDocument();
+ }
+
+ if (!yyextra->xpath.empty())
+ {
+ std::string tagName = yyextra->xpath.back();
+ std::string msg = "End of file reached while expecting closing tag '"+tagName+"'";
+ reportError(yyscanner,msg);
+ }
+
+ printlex(yy_flex_debug, false, __FILE__, fileName);
+}
+
+int XMLParser::lineNr() const
+{
+ struct yyguts_t *yyg = (struct yyguts_t*)p->yyscanner;
+ return yyextra->lineNr;
+}
+
+std::string XMLParser::fileName() const
+{
+ struct yyguts_t *yyg = (struct yyguts_t*)p->yyscanner;
+ return yyextra->fileName;
+}
+
+#if USE_STATE2STRING
+#include "xml.l.h"
+#endif