summaryrefslogtreecommitdiffstats
path: root/tkhtml1/src/htmlparse.c
diff options
context:
space:
mode:
Diffstat (limited to 'tkhtml1/src/htmlparse.c')
-rw-r--r--tkhtml1/src/htmlparse.c1181
1 files changed, 1181 insertions, 0 deletions
diff --git a/tkhtml1/src/htmlparse.c b/tkhtml1/src/htmlparse.c
new file mode 100644
index 0000000..4511005
--- /dev/null
+++ b/tkhtml1/src/htmlparse.c
@@ -0,0 +1,1181 @@
+/*
+** A tokenizer that converts raw HTML into a linked list of HTML elements.
+**
+** Copyright (C) 1997-2000 D. Richard Hipp
+**
+** This library is free software; you can redistribute it and/or
+** modify it under the terms of the GNU Library General Public
+** License as published by the Free Software Foundation; either
+** version 2 of the License, or (at your option) any later version.
+**
+** This library is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+** Library General Public License for more details.
+**
+** You should have received a copy of the GNU Library General Public
+** License along with this library; if not, write to the
+** Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+** Boston, MA 02111-1307, USA.
+**
+** Author contact information:
+** drh@acm.org
+** http://www.hwaci.com/drh/
+*/
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <tk.h>
+#include "htmlparse.h"
+
+/****************** Begin Escape Sequence Translator *************/
+/*
+** The next section of code implements routines used to translate
+** the '&' escape sequences of SGML to individual characters.
+** Examples:
+**
+** &amp; &
+** &lt; <
+** &gt; >
+** &nbsp; nonbreakable space
+*/
+
+/* Each escape sequence is recorded as an instance of the following
+** structure
+*/
+struct sgEsc {
+ char *zName; /* The name of this escape sequence. ex: "amp" */
+ char value[8]; /* The value for this sequence. ex: "&" */
+ struct sgEsc *pNext; /* Next sequence with the same hash on zName */
+};
+
+/* The following is a table of all escape sequences. Add new sequences
+** by adding entries to this table.
+*/
+static struct sgEsc esc_sequences[] = {
+ { "quot", "\"", 0 },
+ { "amp", "&", 0 },
+ { "lt", "<", 0 },
+ { "gt", ">", 0 },
+ { "nbsp", " ", 0 },
+ { "iexcl", "\241", 0 },
+ { "cent", "\242", 0 },
+ { "pound", "\243", 0 },
+ { "curren", "\244", 0 },
+ { "yen", "\245", 0 },
+ { "brvbar", "\246", 0 },
+ { "sect", "\247", 0 },
+ { "uml", "\250", 0 },
+ { "copy", "\251", 0 },
+ { "ordf", "\252", 0 },
+ { "laquo", "\253", 0 },
+ { "not", "\254", 0 },
+ { "shy", "\255", 0 },
+ { "reg", "\256", 0 },
+ { "macr", "\257", 0 },
+ { "deg", "\260", 0 },
+ { "plusmn", "\261", 0 },
+ { "sup2", "\262", 0 },
+ { "sup3", "\263", 0 },
+ { "acute", "\264", 0 },
+ { "micro", "\265", 0 },
+ { "para", "\266", 0 },
+ { "middot", "\267", 0 },
+ { "cedil", "\270", 0 },
+ { "sup1", "\271", 0 },
+ { "ordm", "\272", 0 },
+ { "raquo", "\273", 0 },
+ { "frac14", "\274", 0 },
+ { "frac12", "\275", 0 },
+ { "frac34", "\276", 0 },
+ { "iquest", "\277", 0 },
+ { "Agrave", "\300", 0 },
+ { "Aacute", "\301", 0 },
+ { "Acirc", "\302", 0 },
+ { "Atilde", "\303", 0 },
+ { "Auml", "\304", 0 },
+ { "Aring", "\305", 0 },
+ { "AElig", "\306", 0 },
+ { "Ccedil", "\307", 0 },
+ { "Egrave", "\310", 0 },
+ { "Eacute", "\311", 0 },
+ { "Ecirc", "\312", 0 },
+ { "Euml", "\313", 0 },
+ { "Igrave", "\314", 0 },
+ { "Iacute", "\315", 0 },
+ { "Icirc", "\316", 0 },
+ { "Iuml", "\317", 0 },
+ { "ETH", "\320", 0 },
+ { "Ntilde", "\321", 0 },
+ { "Ograve", "\322", 0 },
+ { "Oacute", "\323", 0 },
+ { "Ocirc", "\324", 0 },
+ { "Otilde", "\325", 0 },
+ { "Ouml", "\326", 0 },
+ { "times", "\327", 0 },
+ { "Oslash", "\330", 0 },
+ { "Ugrave", "\331", 0 },
+ { "Uacute", "\332", 0 },
+ { "Ucirc", "\333", 0 },
+ { "Uuml", "\334", 0 },
+ { "Yacute", "\335", 0 },
+ { "THORN", "\336", 0 },
+ { "szlig", "\337", 0 },
+ { "agrave", "\340", 0 },
+ { "aacute", "\341", 0 },
+ { "acirc", "\342", 0 },
+ { "atilde", "\343", 0 },
+ { "auml", "\344", 0 },
+ { "aring", "\345", 0 },
+ { "aelig", "\346", 0 },
+ { "ccedil", "\347", 0 },
+ { "egrave", "\350", 0 },
+ { "eacute", "\351", 0 },
+ { "ecirc", "\352", 0 },
+ { "euml", "\353", 0 },
+ { "igrave", "\354", 0 },
+ { "iacute", "\355", 0 },
+ { "icirc", "\356", 0 },
+ { "iuml", "\357", 0 },
+ { "eth", "\360", 0 },
+ { "ntilde", "\361", 0 },
+ { "ograve", "\362", 0 },
+ { "oacute", "\363", 0 },
+ { "ocirc", "\364", 0 },
+ { "otilde", "\365", 0 },
+ { "ouml", "\366", 0 },
+ { "divide", "\367", 0 },
+ { "oslash", "\370", 0 },
+ { "ugrave", "\371", 0 },
+ { "uacute", "\372", 0 },
+ { "ucirc", "\373", 0 },
+ { "uuml", "\374", 0 },
+ { "yacute", "\375", 0 },
+ { "thorn", "\376", 0 },
+ { "yuml", "\377", 0 },
+};
+
+/* The size of the handler hash table. For best results this should
+** be a prime number which is about the same size as the number of
+** escape sequences known to the system. */
+#define ESC_HASH_SIZE (sizeof(esc_sequences)/sizeof(esc_sequences[0])+7)
+
+/* The hash table
+**
+** If the name of an escape sequences hashes to the value H, then
+** apEscHash[H] will point to a linked list of Esc structures, one of
+** which will be the Esc structure for that escape sequence.
+*/
+static struct sgEsc *apEscHash[ESC_HASH_SIZE];
+
+/* Hash a escape sequence name. The value returned is an integer
+** between 0 and ESC_HASH_SIZE-1, inclusive.
+*/
+static int EscHash(const char *zName){
+ int h = 0; /* The hash value to be returned */
+ char c; /* The next character in the name being hashed */
+
+ while( (c=*zName)!=0 ){
+ h = h<<5 ^ h ^ c;
+ zName++;
+ TestPoint(0);
+ }
+ if( h<0 ){
+ h = -h;
+ TestPoint(0);
+ }else{
+ TestPoint(0);
+ }
+ return h % ESC_HASH_SIZE;
+}
+
+#ifdef TEST
+/*
+** Compute the longest and average collision chain length for the
+** escape sequence hash table
+*/
+static void EscHashStats(void){
+ int i;
+ int sum = 0;
+ int max = 0;
+ int cnt;
+ int notempty = 0;
+ struct sgEsc *p;
+
+ for(i=0; i<sizeof(esc_sequences)/sizeof(esc_sequences[0]); i++){
+ cnt = 0;
+ p = apEscHash[i];
+ if( p ) notempty++;
+ while( p ){
+ cnt++;
+ p = p->pNext;
+ }
+ sum += cnt;
+ if( cnt>max ) max = cnt;
+ }
+ printf("Longest chain=%d avg=%g slots=%d empty=%d (%g%%)\n",
+ max,(double)sum/(double)notempty, i, i-notempty,
+ 100.0*(i-notempty)/(double)i);
+}
+#endif
+
+/* Initialize the escape sequence hash table
+*/
+static void EscInit(void){
+ int i; /* For looping thru the list of escape sequences */
+ int h; /* The hash on a sequence */
+
+ for(i=0; i<sizeof(esc_sequences)/sizeof(esc_sequences[i]); i++){
+/* #ifdef TCL_UTF_MAX */
+#if 0
+ {
+ int c = esc_sequences[i].value[0];
+ Tcl_UniCharToUtf(c, esc_sequences[i].value);
+ }
+#endif
+ h = EscHash(esc_sequences[i].zName);
+ esc_sequences[i].pNext = apEscHash[h];
+ apEscHash[h] = &esc_sequences[i];
+ TestPoint(0);
+ }
+#ifdef TEST
+ EscHashStats();
+#endif
+}
+
+/*
+** This table translates the non-standard microsoft characters between
+** 0x80 and 0x9f into plain ASCII so that the characters will be visible
+** on Unix systems. Care is taken to translate the characters
+** into values less than 0x80, to avoid UTF-8 problems.
+*/
+#ifndef __WIN32__
+static char acMsChar[] = {
+ /* 0x80 */ 'C',
+ /* 0x81 */ ' ',
+ /* 0x82 */ ',',
+ /* 0x83 */ 'f',
+ /* 0x84 */ '"',
+ /* 0x85 */ '.',
+ /* 0x86 */ '*',
+ /* 0x87 */ '*',
+ /* 0x88 */ '^',
+ /* 0x89 */ '%',
+ /* 0x8a */ 'S',
+ /* 0x8b */ '<',
+ /* 0x8c */ 'O',
+ /* 0x8d */ ' ',
+ /* 0x8e */ 'Z',
+ /* 0x8f */ ' ',
+ /* 0x90 */ ' ',
+ /* 0x91 */ '\'',
+ /* 0x92 */ '\'',
+ /* 0x93 */ '"',
+ /* 0x94 */ '"',
+ /* 0x95 */ '*',
+ /* 0x96 */ '-',
+ /* 0x97 */ '-',
+ /* 0x98 */ '~',
+ /* 0x99 */ '@',
+ /* 0x9a */ 's',
+ /* 0x9b */ '>',
+ /* 0x9c */ 'o',
+ /* 0x9d */ ' ',
+ /* 0x9e */ 'z',
+ /* 0x9f */ 'Y',
+};
+#endif
+
+/* Translate escape sequences in the string "z". "z" is overwritten
+** with the translated sequence.
+**
+** Unrecognized escape sequences are unaltered.
+**
+** Example:
+**
+** input = "AT&amp;T &gt MCI"
+** output = "AT&T > MCI"
+*/
+LOCAL void HtmlTranslateEscapes(char *z){
+ int from; /* Read characters from this position in z[] */
+ int to; /* Write characters into this position in z[] */
+ int h; /* A hash on the escape sequence */
+ struct sgEsc *p; /* For looping down the escape sequence collision chain */
+ static int isInit = 0; /* True after initialization */
+
+ from = to = 0;
+ if( !isInit ){
+ EscInit();
+ isInit = 1;
+ }
+ while( z[from] ){
+ if( z[from]=='&' ){
+ if( z[from+1]=='#' ){
+ int i = from + 2;
+ int v = 0;
+ while( isdigit(z[i]) ){
+ v = v*10 + z[i] - '0';
+ i++;
+ }
+ if( z[i]==';' ){ i++; }
+
+ /* On Unix systems, translate the non-standard microsoft
+ ** characters in the range of 0x80 to 0x9f into something
+ ** we can see.
+ */
+#ifndef __WIN32__
+ if( v>=0x80 && v<0xa0 ){
+ v = acMsChar[v&0x1f];
+ }
+#endif
+ /* Put the character in the output stream in place of
+ ** the "&#000;". How we do this depends on whether or
+ ** not we are using UTF-8.
+ */
+#ifdef TCL_UTF_MAX
+ {
+ int j, n;
+ char value[8];
+ n = Tcl_UniCharToUtf(v,value);
+ for(j=0; j<n; j++){
+ z[to++] = value[j];
+ }
+ }
+#else
+ z[to++] = v;
+#endif
+ from = i;
+ }else{
+ int i = from+1;
+ int c;
+ while( z[i] && isalnum(z[i]) ){ TestPoint(0); i++; }
+ c = z[i];
+ z[i] = 0;
+ h = EscHash(&z[from+1]);
+ p = apEscHash[h];
+ while( p && strcmp(p->zName,&z[from+1])!=0 ){
+ p = p->pNext;
+ }
+ z[i] = c;
+ if( p ){
+ int j;
+ for(j=0; p->value[j]; j++){
+ z[to++] = p->value[j];
+ }
+ from = i;
+ if( c==';' ){
+ from++;
+ }
+ }else{
+ z[to++] = z[from++];
+ }
+ }
+
+ /* On UNIX systems, look for the non-standard microsoft characters
+ ** between 0x80 and 0x9f and translate them into printable ASCII
+ ** codes. Separate algorithms are required to do this for plain
+ ** ascii and for utf-8.
+ */
+#ifndef __WIN32__
+#ifdef TCL_UTF_MAX
+ }else if( (z[from]&0x80)!=0 ){
+ Tcl_UniChar c;
+ int n;
+ n = Tcl_UtfToUniChar(&z[from], &c);
+ if( c>=0x80 && c<0xa0 ){
+ z[to++] = acMsChar[c & 0x1f];
+ from += n;
+ }else{
+ while( n-- ) z[to++] = z[from++];
+ }
+#else /* if !defined(TCL_UTF_MAX) */
+ }else if( ((unsigned char)z[from])>=0x80 && ((unsigned char)z[from])<0xa0 ){
+ z[to++] = acMsChar[z[from++]&0x1f];
+#endif /* TCL_UTF_MAX */
+#endif /* __WIN32__ */
+ }else{
+ z[to++] = z[from++];
+ TestPoint(0);
+ }
+ }
+ z[to] = 0;
+}
+/******************* End Escape Sequence Translator ***************/
+
+/******************* Begin HTML tokenizer code *******************/
+/*
+** The following variable becomes TRUE when the markup hash table
+** (stored in HtmlMarkupMap[]) is initialized.
+*/
+static int isInit = 0;
+
+/* The hash table for HTML markup names.
+**
+** If an HTML markup name hashes to H, then apMap[H] will point to
+** a linked list of sgMap structure, one of which will describe the
+** the particular markup (if it exists.)
+*/
+static HtmlTokenMap *apMap[HTML_MARKUP_HASH_SIZE];
+
+/* Hash a markup name
+**
+** HTML markup is case insensitive, so this function will give the
+** same hash regardless of the case of the markup name.
+**
+** The value returned is an integer between 0 and HTML_MARKUP_HASH_SIZE-1,
+** inclusive.
+*/
+static int HtmlHash(const char *zName){
+ int h = 0;
+ char c;
+ while( (c=*zName)!=0 ){
+ if( isupper(c) ){
+ c = tolower(c);
+ }
+ h = h<<5 ^ h ^ c;
+ zName++;
+ }
+ if( h<0 ){
+ h = -h;
+ }
+ return h % HTML_MARKUP_HASH_SIZE;
+}
+
+#ifdef TEST
+/*
+** Compute the longest and average collision chain length for the
+** markup hash table
+*/
+static void HtmlHashStats(void){
+ int i;
+ int sum = 0;
+ int max = 0;
+ int cnt;
+ int notempty = 0;
+ struct sgMap *p;
+
+ for(i=0; i<HTML_MARKUP_COUNT; i++){
+ cnt = 0;
+ p = apMap[i];
+ if( p ) notempty++;
+ while( p ){
+ cnt++;
+ p = p->pCollide;
+ }
+ sum += cnt;
+ if( cnt>max ) max = cnt;
+
+ }
+ printf("longest chain=%d avg=%g slots=%d empty=%d (%g%%)\n",
+ max, (double)sum/(double)notempty, i, i-notempty,
+ 100.0*(i-notempty)/(double)i);
+}
+#endif
+
+/* Initialize the escape sequence hash table
+*/
+static void HtmlHashInit(void){
+ int i; /* For looping thru the list of markup names */
+ int h; /* The hash on a markup name */
+
+ for(i=0; i<HTML_MARKUP_COUNT; i++){
+ h = HtmlHash(HtmlMarkupMap[i].zName);
+ HtmlMarkupMap[i].pCollide = apMap[h];
+ apMap[h] = &HtmlMarkupMap[i];
+ }
+#ifdef TEST
+ HtmlHashStats();
+#endif
+}
+
+/*
+** Append the given HtmlElement to the tokenizers list of elements
+*/
+static void AppendElement(HtmlWidget *p, HtmlElement *pElem){
+ pElem->base.pNext = 0;
+ pElem->base.pPrev = p->pLast;
+ if( p->pFirst==0 ){
+ p->pFirst = pElem;
+ }else{
+ p->pLast->base.pNext = pElem;
+ }
+ p->pLast = pElem;
+ p->nToken++;
+}
+
+/*
+** Compute the new column index following the given character.
+*/
+static int NextColumn(int iCol, char c){
+ switch( c ){
+ case '\n': return 0;
+ case '\t': return (iCol | 7) + 1;
+ default: return iCol+1;
+ }
+ /* NOT REACHED */
+}
+
+/*
+** Convert a string to all lower-case letters.
+*/
+static void ToLower(char *z){
+ while( *z ){
+ if( isupper(*z) ) *z = tolower(*z);
+ z++;
+ }
+}
+
+/* Process as much of the input HTML as possible. Construct new
+** HtmlElement structures and appended them to the list. Return
+** the number of characters actually processed.
+**
+** This routine may invoke a callback procedure which could delete
+** the HTML widget.
+**
+** This routine is not reentrant for the same HTML widget. To
+** prevent reentrancy (during a callback), the p->iCol field is
+** set to a negative number. This is a flag to future invocations
+** not to reentry this routine. The p->iCol field is restored
+** before exiting, of course.
+*/
+static int Tokenize(
+ HtmlWidget *p /* The HTML widget doing the parsing */
+){
+ char *z; /* The input HTML text */
+ int c; /* The next character of input */
+ int n; /* Number of characters processed so far */
+ int iCol; /* Column of input */
+ int i, j; /* Loop counters */
+ int h; /* Result from HtmlHash() */
+ int nByte; /* Space allocated for a single HtmlElement */
+ HtmlElement *pElem; /* A new HTML element */
+ int selfClose; /* True for content free elements. Ex: <br/> */
+ int argc; /* The number of arguments on a markup */
+ HtmlTokenMap *pMap; /* For searching the markup name hash table */
+ char *zBuf; /* For handing out buffer space */
+# define mxARG 200 /* Maximum number of parameters in a single markup */
+ char *argv[mxARG]; /* Pointers to each markup argument. */
+ int arglen[mxARG]; /* Length of each markup argument */
+
+ iCol = p->iCol;
+ n = p->nComplete;
+ z = p->zText;
+ if( iCol<0 ){ TestPoint(0); return n; } /* Prevents recursion */
+ p->iCol = -1;
+ while( (c=z[n])!=0 ){
+ if( p->pScript ){
+ /* We are in the middle of <SCRIPT>...</SCRIPT>. Just look for
+ ** the </SCRIPT> markup. (later:) Treat <STYLE>...</STYLE> the
+ ** same way. */
+ HtmlScript *pScript = p->pScript;
+ char *zEnd;
+ int nEnd;
+ if( pScript->markup.base.type==Html_SCRIPT ){
+ zEnd = "</script>";
+ nEnd = 9;
+ }else{
+ zEnd = "</style>";
+ nEnd = 8;
+ }
+ if( pScript->zScript==0 ){
+ pScript->zScript = &z[n];
+ pScript->nScript = 0;
+ }
+ for(i=n+pScript->nScript; z[i]; i++){
+ if( z[i]=='<' && z[i+1]=='/' && strnicmp(&z[i],zEnd,nEnd)==0 ){
+ pScript->nScript = i - n;
+ p->pScript = 0;
+ n = i+nEnd;
+ break;
+ }
+ }
+ if( p->pScript ){
+ pScript->nScript = i - n;
+ }
+ continue;
+ }else if( isspace(c) ){
+ /* White space */
+ for(i=0; (c=z[n+i])!=0 && isspace(c) && c!='\n' && c!='\r'; i++){}
+ if( c=='\r' && z[n+i+1]=='\n' ){ i++; }
+ pElem = HtmlAlloc( sizeof(HtmlSpaceElement) );
+ if( pElem==0 ){ goto incomplete; }
+ pElem->base.type = Html_Space;
+ if( c=='\n' || c=='\r' ){
+ pElem->base.flags = HTML_NewLine;
+ pElem->base.count = 1;
+ i++;
+ iCol = 0;
+ TestPoint(0);
+ }else{
+ int iColStart = iCol;
+ pElem->base.flags = 0;
+ for(j=0; j<i; j++){
+ iCol = NextColumn(iCol, z[n+j]);
+ TestPoint(0);
+ }
+ pElem->base.count = iCol - iColStart;
+ }
+ AppendElement(p,pElem);
+ n += i;
+ }else if( c!='<' || p->iPlaintext!=0 ||
+ (!isalpha(z[n+1]) && z[n+1]!='/' && z[n+1]!='!' && z[n+1]!='?') ){
+ /* Ordinary text */
+ for(i=1; (c=z[n+i])!=0 && !isspace(c) && c!='<'; i++){}
+ if( c==0 ){ TestPoint(0); goto incomplete; }
+ if( p->iPlaintext!=0 && z[n]=='<' ){
+ switch( p->iPlaintext ){
+ case Html_LISTING:
+ if( i>=10 && strnicmp(&z[n],"</listing>",10)==0 ){
+ p->iPlaintext = 0;
+ goto doMarkup;
+ }
+ break;
+ case Html_XMP:
+ if( i>=6 && strnicmp(&z[n],"</xmp>",6)==0 ){
+ p->iPlaintext = 0;
+ goto doMarkup;
+ }
+ break;
+ case Html_TEXTAREA:
+ if( i>=11 && strnicmp(&z[n],"</textarea>",11)==0 ){
+ p->iPlaintext = 0;
+ goto doMarkup;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ nByte = sizeof(HtmlTextElement) + i;
+ pElem = HtmlAlloc( nByte );
+ if( pElem==0 ){ goto incomplete; }
+ memset(pElem,0,nByte);
+ pElem->base.type = Html_Text;
+ sprintf(pElem->text.zText,"%.*s",i,&z[n]);
+ AppendElement(p,pElem);
+ if( p->iPlaintext==0 || p->iPlaintext==Html_TEXTAREA ){
+ HtmlTranslateEscapes(pElem->text.zText);
+ }
+ pElem->base.count = strlen(pElem->text.zText);
+ n += i;
+ iCol += i;
+ }else if( strncmp(&z[n],"<!--",4)==0 ){
+ /* An HTML comment. Just skip it. */
+ for(i=4; z[n+i]; i++){
+ if( z[n+i]=='-' && strncmp(&z[n+i],"-->",3)==0 ){ break; }
+ }
+ if( z[n+i]==0 ){ TestPoint(0); goto incomplete; }
+ for(j=0; j<i+3; j++){
+ iCol = NextColumn(iCol, z[n+j]);
+ }
+ n += i + 3;
+ }else{
+ /* Markup.
+ **
+ ** First get the name of the markup
+ */
+doMarkup:
+ argc = 1;
+ argv[0] = &z[n+1];
+ for(i=1; (c=z[n+i])!=0 && !isspace(c) && c!='>' && (i<2 || c!='/'); i++){}
+ arglen[0] = i - 1;
+ if( c==0 ){ goto incomplete; }
+
+ /*
+ ** Now parse up the arguments
+ */
+ while( isspace(z[n+i]) ){ i++; }
+ while( (c=z[n+i])!=0 && c!='>' && (c!='/' || z[n+i+1]!='>') ){
+ if( argc>mxARG-3 ){
+ argc = mxARG-3;
+ }
+ argv[argc] = &z[n+i];
+ j = 0;
+ while( (c=z[n+i+j])!=0 && !isspace(c) && c!='>'
+ && c!='=' && (c!='/' || z[n+i+j+1]!='>') ){
+ j++;
+ }
+ arglen[argc] = j;
+ if( c==0 ){ goto incomplete; }
+ i += j;
+ while( isspace(c) ){
+ i++;
+ c = z[n+i];
+ }
+ if( c==0 ){ goto incomplete; }
+ argc++;
+ if( c!='=' ){
+ argv[argc] = "";
+ arglen[argc] = 0;
+ argc++;
+ continue;
+ }
+ i++;
+ c = z[n+i];
+ while( isspace(c) ){
+ i++;
+ c = z[n+i];
+ }
+ if( c==0 ){ goto incomplete; }
+ if( c=='\'' || c=='"' ){
+ int cQuote = c;
+ i++;
+ argv[argc] = &z[n+i];
+ for(j=0; (c=z[n+i+j])!=0 && c!=cQuote; j++){}
+ if( c==0 ){ goto incomplete; }
+ arglen[argc] = j;
+ i += j+1;
+ TestPoint(0);
+ }else{
+ argv[argc] = &z[n+i];
+ for(j=0; (c=z[n+i+j])!=0 && !isspace(c) && c!='>'; j++){}
+ if( c==0 ){ goto incomplete; }
+ arglen[argc] = j;
+ i += j;
+ }
+ argc++;
+ while( isspace(z[n+i]) ){ i++; }
+ }
+ if( c=='/' ){
+ i++;
+ c = z[n+i];
+ selfClose = 1;
+ }else{
+ selfClose = 0;
+ }
+ if( c==0 ){ goto incomplete; }
+ for(j=0; j<i+1; j++){
+ iCol = NextColumn(iCol, z[n+j]);
+ }
+ n += i + 1;
+
+ /* Lookup the markup name in the hash table
+ */
+ if( !isInit ){
+ HtmlHashInit();
+ isInit = 1;
+ }
+ c = argv[0][arglen[0]];
+ argv[0][arglen[0]] = 0;
+ h = HtmlHash(argv[0]);
+ for(pMap = apMap[h]; pMap; pMap=pMap->pCollide){
+ if( stricmp(pMap->zName,argv[0])==0 ){ break; }
+ TestPoint(0);
+ }
+ argv[0][arglen[0]] = c;
+ if( pMap==0 ){ continue; } /* Ignore unknown markup */
+
+makeMarkupEntry:
+ /* Construct a HtmlMarkup entry for this markup.
+ */
+ if( pMap->extra ){
+ nByte = pMap->extra;
+ }else if( argc==1 ){
+ nByte = sizeof(HtmlBaseElement);
+ }else{
+ nByte = sizeof(HtmlMarkupElement);
+ }
+ if( argc>1 ){
+ nByte += sizeof(char*) * argc;
+ for(j=1; j<argc; j++){
+ nByte += arglen[j] + 1;
+ }
+ }
+ pElem = HtmlAlloc( nByte );
+ if( pElem==0 ){ goto incomplete; }
+ memset(pElem,0,nByte);
+ pElem->base.type = pMap->type;
+ pElem->base.count = argc - 1;
+ if( argc>1 ){
+ if( pMap->extra ){
+ pElem->markup.argv = (char**)&((char*)pElem)[pMap->extra];
+ }else{
+ pElem->markup.argv = (char**)&((HtmlMarkupElement*)pElem)[1];
+ }
+ zBuf = (char*)&pElem->markup.argv[argc];
+ for(j=1; j<argc; j++){
+ pElem->markup.argv[j-1] = zBuf;
+ zBuf += arglen[j] + 1;
+ sprintf(pElem->markup.argv[j-1],"%.*s",arglen[j],argv[j]);
+ HtmlTranslateEscapes(pElem->markup.argv[j-1]);
+ if( (j&1)==1 ){
+ ToLower(pElem->markup.argv[j-1]);
+ }
+ }
+ pElem->markup.argv[argc-1] = 0;
+ }
+
+ /* The new markup has now be constructed in pElem. But before
+ ** appending to the list, check to see if there is a special
+ ** handler for this markup type.
+ */
+ if( p->zHandler[pMap->type] ){
+ Tcl_DString str;
+ Tcl_DStringInit(&str);
+ Tcl_DStringAppend(&str, p->zHandler[pMap->type], -1);
+ Tcl_DStringAppendElement(&str, pMap->zName);
+ Tcl_DStringStartSublist(&str);
+ for(j=0; j<argc-1; j++){
+ Tcl_DStringAppendElement(&str, pElem->markup.argv[j]);
+ }
+ Tcl_DStringEndSublist(&str);
+ HtmlFree(pElem);
+ HtmlLock(p);
+ Tcl_GlobalEval(p->interp, Tcl_DStringValue(&str));
+ Tcl_DStringFree(&str);
+ if( HtmlUnlock(p) ){ return 0; }
+
+ /* Tricky, tricky. The callback might have caused the p->zText
+ ** pointer to change, so renew our copy of that pointer. The
+ ** callback might also have cleared or destroyed the widget.
+ ** If so, abort this routine.
+ */
+ z = p->zText;
+ if( z==0 || p->tkwin==0 ){
+ n = 0;
+ iCol = 0;
+ goto incomplete;
+ }
+ continue;
+ }
+
+ /* No special handler for this markup. Just append it to the
+ ** list of all tokens.
+ */
+ AppendElement(p,pElem);
+ switch( pMap->type ){
+ case Html_PLAINTEXT:
+ case Html_LISTING:
+ case Html_XMP:
+ case Html_TEXTAREA:
+ p->iPlaintext = pMap->type;
+ break;
+ case Html_STYLE:
+ case Html_SCRIPT:
+ p->pScript = (HtmlScript*)pElem;
+ break;
+ default:
+ break;
+ }
+
+ /* If this is self-closing markup (ex: <br/> or <img/>) then
+ ** synthesize a closing token.
+ */
+ if( selfClose && argv[0][0]!='/'
+ && strcmp(&pMap[1].zName[1],pMap->zName)==0 ){
+ selfClose = 0;
+ pMap++;
+ argc = 1;
+ goto makeMarkupEntry;
+ }
+ }
+ }
+incomplete:
+ p->iCol = iCol;
+ return n;
+}
+/************************** End HTML Tokenizer Code ***************************/
+
+/*
+** Append text to the tokenizer engine.
+**
+** This routine (actually the Tokenize() subroutine that is called
+** by this routine) may invoke a callback procedure which could delete
+** the HTML widget.
+*/
+void HtmlTokenizerAppend(HtmlWidget *htmlPtr, const char *zText){
+ int len = strlen(zText);
+ if( htmlPtr->nText==0 ){
+ htmlPtr->nAlloc = len + 100;
+ htmlPtr->zText = HtmlAlloc( htmlPtr->nAlloc );
+ TestPoint(0);
+ }else if( htmlPtr->nText + len >= htmlPtr->nAlloc ){
+ htmlPtr->nAlloc += len + 100;
+ htmlPtr->zText = HtmlRealloc( htmlPtr->zText, htmlPtr->nAlloc );
+ TestPoint(0);
+ }
+ if( htmlPtr->zText==0 ){
+ htmlPtr->nText = 0;
+ UNTESTED;
+ return;
+ }
+ strcpy(&htmlPtr->zText[htmlPtr->nText], zText);
+ htmlPtr->nText += len;
+ htmlPtr->nComplete = Tokenize(htmlPtr);
+}
+
+/*
+** This routine takes a text representation of a token, converts
+** it into an HtmlElement structure and inserts it immediately
+** prior to pToken. If pToken==0, then the newly created HtmlElement
+** is appended.
+**
+** This routine does nothing to resize, restyle, relayout or redisplay
+** the HTML. That is the calling routines responsibility.
+**
+** Return 0 if successful. Return non-zero if zType is not a known
+** markup name.
+*/
+int HtmlInsertToken(
+ HtmlWidget *htmlPtr, /* The widget into which the token is inserted */
+ HtmlElement *pToken, /* Insert before this. Append if pToken==0 */
+ char *zType, /* Type of markup. Ex: "/a" or "table" */
+ char *zArgs /* List of arguments */
+){
+ HtmlTokenMap *pMap; /* For searching the markup name hash table */
+ int h; /* The hash on zType */
+ HtmlElement *pElem; /* The new element */
+ int nByte; /* How many bytes to allocate */
+ int i; /* Loop counter */
+
+ if( !isInit ){
+ HtmlHashInit();
+ isInit = 1;
+ TestPoint(0);
+ }else{
+ TestPoint(0);
+ }
+ h = HtmlHash(zType);
+ for(pMap = apMap[h]; pMap; pMap=pMap->pCollide){
+ if( stricmp(pMap->zName,zType)==0 ){ TestPoint(0); break; }
+ TestPoint(0);
+ }
+ if( pMap==0 ){ TestPoint(0); return 1; }
+
+ if( zArgs==0 || *zArgs==0 ){
+ /* Special case of no arguments. This is a lot easier... */
+ nByte = pMap->extra ? pMap->extra : sizeof(HtmlBaseElement);
+ nByte += strlen(zType);
+ pElem = HtmlAlloc( nByte );
+ if( pElem==0 ){ TestPoint(0); return 1; }
+ memset(pElem,0,nByte);
+ pElem->base.type = pMap->type;
+ TestPoint(0);
+ }else{
+ /* The general case. There are arguments that need to be parsed
+ ** up. This is slower, but we gotta do it.
+ */
+ int argc;
+ const char **argv;
+ char *zBuf;
+
+ if( Tcl_SplitList(htmlPtr->interp, zArgs, &argc, (const char***)&argv)!=TCL_OK ){
+ TestPoint(0);
+ return 1;
+ }
+ if( pMap->extra ){
+ nByte = pMap->extra;
+ TestPoint(0);
+ }else{
+ nByte = sizeof(HtmlMarkupElement);
+ TestPoint(0);
+ }
+ nByte += sizeof(char*)*(argc+1) + strlen(zArgs) + argc + 2;
+ pElem = HtmlAlloc( nByte );
+ if( pElem==0 ){
+ HtmlFree(argv);
+ TestPoint(0);
+ return 1;
+ }
+ memset(pElem,0,nByte);
+ pElem->base.type = pMap->type;
+ pElem->base.count = argc;
+ if( pMap->extra ){
+ pElem->markup.argv = (char**)&((char*)pElem)[pMap->extra];
+ TestPoint(0);
+ }else{
+ pElem->markup.argv = (char**)&((HtmlMarkupElement*)pElem)[1];
+ TestPoint(0);
+ }
+ zBuf = (char*)&pElem->markup.argv[argc];
+ for(i=1; i<argc; i++){
+ pElem->markup.argv[i-1] = zBuf;
+ zBuf += strlen(argv[i]) + 1;
+ strcpy(pElem->markup.argv[i-1],argv[i]);
+ TestPoint(0);
+ }
+ pElem->markup.argv[argc-1] = 0;
+ HtmlFree(argv);
+ TestPoint(0);
+ }
+ if( pToken ){
+ pElem->base.pNext = pToken;
+ pElem->base.pPrev = pToken->base.pPrev;
+ if( pToken->base.pPrev ){
+ pToken->base.pPrev->pNext = pElem;
+ TestPoint(0);
+ }else{
+ htmlPtr->pFirst = pElem;
+ TestPoint(0);
+ }
+ pToken->base.pPrev = pElem;
+ htmlPtr->nToken++;
+ }else{
+ AppendElement(htmlPtr,pElem);
+ TestPoint(0);
+ }
+ return 0;
+}
+
+/*
+** Convert a markup name into a type integer
+*/
+int HtmlNameToType(const char *zType){
+ HtmlTokenMap *pMap; /* For searching the markup name hash table */
+ int h; /* The hash on zType */
+
+ if( !isInit ){
+ HtmlHashInit();
+ isInit = 1;
+ TestPoint(0);
+ }else{
+ TestPoint(0);
+ }
+ h = HtmlHash(zType);
+ for(pMap = apMap[h]; pMap; pMap=pMap->pCollide){
+ if( stricmp(pMap->zName,zType)==0 ){ TestPoint(0); break; }
+ TestPoint(0);
+ }
+ return pMap ? pMap->type : Html_Unknown;
+}
+
+/*
+** Convert a type into a symbolic name
+*/
+const char *HtmlTypeToName(int type){
+ if( type>=Html_A && type<=Html_EndXMP ){
+ HtmlTokenMap *pMap = apMap[type - Html_A];
+ TestPoint(0);
+ return pMap->zName;
+ }else{
+ TestPoint(0);
+ return "???";
+ }
+}
+
+/*
+** For debugging purposes, print information about a token
+*/
+char *HtmlTokenName(HtmlElement *p){
+#ifdef DEBUG
+ static char zBuf[200];
+ int j;
+ char *zName;
+
+ if( p==0 ) return "NULL";
+ switch( p->base.type ){
+ case Html_Text:
+ sprintf(zBuf,"\"%.*s\"",p->base.count,p->text.zText);
+ break;
+ case Html_Space:
+ if( p->base.flags & HTML_NewLine ){
+ sprintf(zBuf,"\"\\n\"");
+ }else{
+ sprintf(zBuf,"\" \"");
+ }
+ break;
+ case Html_Block:
+ if( p->block.n>0 ){
+ int n = p->block.n;
+ if( n>150 ) n = 150;
+ sprintf(zBuf,"<Block z=\"%.*s\">", n, p->block.z);
+ }else{
+ sprintf(zBuf,"<Block>");
+ }
+ break;
+ default:
+ if( p->base.type >= HtmlMarkupMap[0].type
+ && p->base.type <= HtmlMarkupMap[HTML_MARKUP_COUNT-1].type ){
+ zName = HtmlMarkupMap[p->base.type - HtmlMarkupMap[0].type].zName;
+ }else{
+ zName = "Unknown";
+ }
+ sprintf(zBuf,"<%s",zName);
+ for(j=1; j<p->base.count; j += 2){
+ sprintf(&zBuf[strlen(zBuf)]," %s=%s",
+ p->markup.argv[j-1],p->markup.argv[j]);
+ }
+ strcat(zBuf,">");
+ break;
+ }
+ return zBuf;
+#else
+ return 0;
+#endif
+}
+
+/*
+** Return all tokens between the two elements as a Tcl list.
+*/
+void HtmlTclizeList(Tcl_Interp *interp, HtmlElement *p, HtmlElement *pEnd){
+ Tcl_DString str;
+ int i;
+ char *zName;
+ char zLine[100];
+
+ Tcl_DStringInit(&str);
+ while( p && p!=pEnd ){
+ switch( p->base.type ){
+ case Html_Block:
+ break;
+ case Html_Text:
+ Tcl_DStringStartSublist(&str);
+ Tcl_DStringAppendElement(&str,"Text");
+ Tcl_DStringAppendElement(&str, p->text.zText);
+ Tcl_DStringEndSublist(&str);
+ break;
+ case Html_Space:
+ sprintf(zLine,"Space %d %d",
+ p->base.count, (p->base.flags & HTML_NewLine)!=0);
+ Tcl_DStringAppendElement(&str,zLine);
+ break;
+ case Html_Unknown:
+ Tcl_DStringAppendElement(&str,"Unknown");
+ break;
+ default:
+ Tcl_DStringStartSublist(&str);
+ Tcl_DStringAppendElement(&str,"Markup");
+ if( p->base.type >= HtmlMarkupMap[0].type
+ && p->base.type <= HtmlMarkupMap[HTML_MARKUP_COUNT-1].type ){
+ zName = HtmlMarkupMap[p->base.type - HtmlMarkupMap[0].type].zName;
+ }else{
+ zName = "Unknown";
+ }
+ Tcl_DStringAppendElement(&str, zName);
+ for(i=0; i<p->base.count; i++){
+ Tcl_DStringAppendElement(&str, p->markup.argv[i]);
+ }
+ Tcl_DStringEndSublist(&str);
+ break;
+ }
+ p = p->pNext;
+ }
+ Tcl_DStringResult(interp, &str);
+}
+
+/*
+** Print a list of tokens
+*/
+#ifdef DEBUG
+void HtmlPrintList(HtmlElement *p, HtmlElement *pEnd){
+ while( p && p!=pEnd ){
+ if( p->base.type==Html_Block ){
+ char *z = p->block.z;
+ int n = p->block.n;
+ if( n==0 || z==0 ){
+ n = 1;
+ z = "";
+ }
+ printf("Block 0x%08x flags=%02x cnt=%d x=%d..%d y=%d..%d z=\"%.*s\"\n",
+ (int)p, p->base.flags, p->base.count, p->block.left, p->block.right,
+ p->block.top, p->block.bottom, n, z);
+ }else{
+ printf("Token 0x%08x font=%2d color=%2d align=%d flags=0x%04x name=%s\n",
+ (int)p, p->base.style.font, p->base.style.color,
+ p->base.style.align, p->base.style.flags, HtmlTokenName(p));
+ }
+ p = p->pNext;
+ }
+}
+#endif