summaryrefslogtreecommitdiffstats
path: root/tkhtml1/src/htmlparse.c
diff options
context:
space:
mode:
Diffstat (limited to 'tkhtml1/src/htmlparse.c')
-rw-r--r--tkhtml1/src/htmlparse.c1181
1 files changed, 0 insertions, 1181 deletions
diff --git a/tkhtml1/src/htmlparse.c b/tkhtml1/src/htmlparse.c
deleted file mode 100644
index 4511005..0000000
--- a/tkhtml1/src/htmlparse.c
+++ /dev/null
@@ -1,1181 +0,0 @@
-/*
-** A tokenizer that converts raw HTML into a linked list of HTML elements.
-**
-** Copyright (C) 1997-2000 D. Richard Hipp
-**
-** This library is free software; you can redistribute it and/or
-** modify it under the terms of the GNU Library General Public
-** License as published by the Free Software Foundation; either
-** version 2 of the License, or (at your option) any later version.
-**
-** This library is distributed in the hope that it will be useful,
-** but WITHOUT ANY WARRANTY; without even the implied warranty of
-** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-** Library General Public License for more details.
-**
-** You should have received a copy of the GNU Library General Public
-** License along with this library; if not, write to the
-** Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-** Boston, MA 02111-1307, USA.
-**
-** Author contact information:
-** drh@acm.org
-** http://www.hwaci.com/drh/
-*/
-#include <string.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <ctype.h>
-#include <tk.h>
-#include "htmlparse.h"
-
-/****************** Begin Escape Sequence Translator *************/
-/*
-** The next section of code implements routines used to translate
-** the '&' escape sequences of SGML to individual characters.
-** Examples:
-**
-** &amp; &
-** &lt; <
-** &gt; >
-** &nbsp; nonbreakable space
-*/
-
-/* Each escape sequence is recorded as an instance of the following
-** structure. The table below initializes the fields positionally, so
-** the field order must not change.
-*/
-struct sgEsc {
- char *zName; /* The name of this escape sequence, without '&' or ';'. ex: "amp" */
- char value[8]; /* NUL-terminated replacement text for this sequence. ex: "&" */
- struct sgEsc *pNext; /* Next sequence with the same hash on zName */
-};
-
-/* Table of every named escape sequence understood by the translator.
-** To teach the translator a new sequence, append an entry here; the
-** pNext link of each entry is filled in later by EscInit().
-*/
-static struct sgEsc esc_sequences[] = {
-  /* Markup-significant characters and the non-breaking space */
-  { "quot", "\"", 0 }, { "amp", "&", 0 }, { "lt", "<", 0 },
-  { "gt", ">", 0 }, { "nbsp", " ", 0 },
-
-  /* Symbols, octal 241 through 277 */
-  { "iexcl", "\241", 0 }, { "cent", "\242", 0 }, { "pound", "\243", 0 },
-  { "curren", "\244", 0 }, { "yen", "\245", 0 }, { "brvbar", "\246", 0 },
-  { "sect", "\247", 0 }, { "uml", "\250", 0 }, { "copy", "\251", 0 },
-  { "ordf", "\252", 0 }, { "laquo", "\253", 0 }, { "not", "\254", 0 },
-  { "shy", "\255", 0 }, { "reg", "\256", 0 }, { "macr", "\257", 0 },
-  { "deg", "\260", 0 }, { "plusmn", "\261", 0 }, { "sup2", "\262", 0 },
-  { "sup3", "\263", 0 }, { "acute", "\264", 0 }, { "micro", "\265", 0 },
-  { "para", "\266", 0 }, { "middot", "\267", 0 }, { "cedil", "\270", 0 },
-  { "sup1", "\271", 0 }, { "ordm", "\272", 0 }, { "raquo", "\273", 0 },
-  { "frac14", "\274", 0 }, { "frac12", "\275", 0 }, { "frac34", "\276", 0 },
-  { "iquest", "\277", 0 },
-
-  /* Upper-case letters, octal 300 through 337 */
-  { "Agrave", "\300", 0 }, { "Aacute", "\301", 0 }, { "Acirc", "\302", 0 },
-  { "Atilde", "\303", 0 }, { "Auml", "\304", 0 }, { "Aring", "\305", 0 },
-  { "AElig", "\306", 0 }, { "Ccedil", "\307", 0 }, { "Egrave", "\310", 0 },
-  { "Eacute", "\311", 0 }, { "Ecirc", "\312", 0 }, { "Euml", "\313", 0 },
-  { "Igrave", "\314", 0 }, { "Iacute", "\315", 0 }, { "Icirc", "\316", 0 },
-  { "Iuml", "\317", 0 }, { "ETH", "\320", 0 }, { "Ntilde", "\321", 0 },
-  { "Ograve", "\322", 0 }, { "Oacute", "\323", 0 }, { "Ocirc", "\324", 0 },
-  { "Otilde", "\325", 0 }, { "Ouml", "\326", 0 }, { "times", "\327", 0 },
-  { "Oslash", "\330", 0 }, { "Ugrave", "\331", 0 }, { "Uacute", "\332", 0 },
-  { "Ucirc", "\333", 0 }, { "Uuml", "\334", 0 }, { "Yacute", "\335", 0 },
-  { "THORN", "\336", 0 }, { "szlig", "\337", 0 },
-
-  /* Lower-case letters, octal 340 through 377 */
-  { "agrave", "\340", 0 }, { "aacute", "\341", 0 }, { "acirc", "\342", 0 },
-  { "atilde", "\343", 0 }, { "auml", "\344", 0 }, { "aring", "\345", 0 },
-  { "aelig", "\346", 0 }, { "ccedil", "\347", 0 }, { "egrave", "\350", 0 },
-  { "eacute", "\351", 0 }, { "ecirc", "\352", 0 }, { "euml", "\353", 0 },
-  { "igrave", "\354", 0 }, { "iacute", "\355", 0 }, { "icirc", "\356", 0 },
-  { "iuml", "\357", 0 }, { "eth", "\360", 0 }, { "ntilde", "\361", 0 },
-  { "ograve", "\362", 0 }, { "oacute", "\363", 0 }, { "ocirc", "\364", 0 },
-  { "otilde", "\365", 0 }, { "ouml", "\366", 0 }, { "divide", "\367", 0 },
-  { "oslash", "\370", 0 }, { "ugrave", "\371", 0 }, { "uacute", "\372", 0 },
-  { "ucirc", "\373", 0 }, { "uuml", "\374", 0 }, { "yacute", "\375", 0 },
-  { "thorn", "\376", 0 }, { "yuml", "\377", 0 },
-};
-
-/* The number of slots in the escape sequence hash table. For best
-** results this should be a prime number which is about the same size as
-** the number of escape sequences known to the system. With the 100
-** entries currently in esc_sequences[] this evaluates to 107. */
-#define ESC_HASH_SIZE (sizeof(esc_sequences)/sizeof(esc_sequences[0])+7)
-
-/* The hash table
-**
-** If the name of an escape sequence hashes to the value H, then
-** apEscHash[H] will point to a linked list of sgEsc structures (chained
-** through pNext), one of which will be the sgEsc structure for that
-** escape sequence. The table is filled in by EscInit().
-*/
-static struct sgEsc *apEscHash[ESC_HASH_SIZE];
-
-/* Hash an escape sequence name. The value returned is an integer
-** between 0 and ESC_HASH_SIZE-1, inclusive.
-**
-** The hash is accumulated in an unsigned integer. The shift-and-xor
-** step can carry into the top bit for names of six or more characters;
-** with a signed accumulator that is undefined behavior, and negating
-** the most negative int does not yield a positive value. Unsigned
-** arithmetic wraps in a well defined way, so no sign fix-up is needed.
-**
-** zName must be a NUL-terminated string.
-*/
-static int EscHash(const char *zName){
-  unsigned int h = 0;   /* The hash value to be returned */
-  unsigned char c;      /* The next character in the name being hashed */
-
-  while( (c = (unsigned char)*zName)!=0 ){
-    h = (h<<5) ^ h ^ c;
-    zName++;
-    TestPoint(0);
-  }
-  return (int)(h % ESC_HASH_SIZE);
-}
-
-#ifdef TEST
-/*
-** Compute the longest and average collision chain length for the
-** escape sequence hash table and print them on standard output.
-**
-** Every one of the ESC_HASH_SIZE slots of apEscHash[] is visited. The
-** table has 7 more slots than there are escape sequences, so looping
-** over the sequence count alone would leave the last slots out of the
-** statistics.
-*/
-static void EscHashStats(void){
-  int i;
-  int nSlot = (int)ESC_HASH_SIZE;   /* Number of slots in apEscHash[] */
-  int sum = 0;
-  int max = 0;
-  int cnt;
-  int notempty = 0;
-  struct sgEsc *p;
-
-  for(i=0; i<nSlot; i++){
-    cnt = 0;
-    p = apEscHash[i];
-    if( p ) notempty++;
-    while( p ){
-      cnt++;
-      p = p->pNext;
-    }
-    sum += cnt;
-    if( cnt>max ) max = cnt;
-  }
-  /* An empty table would otherwise compute 0/0 for the average. */
-  printf("Longest chain=%d avg=%g slots=%d empty=%d (%g%%)\n",
-    max, notempty ? (double)sum/(double)notempty : 0.0, i, i-notempty,
-    100.0*(i-notempty)/(double)i);
-}
-#endif
-
-/* Build the escape sequence hash table: every entry of esc_sequences[]
-** is pushed onto the front of the apEscHash[] chain selected by the hash
-** of its name. When TEST is defined the chain statistics are printed
-** afterwards.
-*/
-static void EscInit(void){
-  struct sgEsc *pEsc;     /* The table entry being linked in */
-  struct sgEsc *pEnd;     /* One past the last entry of the table */
-  struct sgEsc **ppHead;  /* Head of the chain that receives pEsc */
-
-  pEnd = &esc_sequences[sizeof(esc_sequences)/sizeof(esc_sequences[0])];
-  for(pEsc=esc_sequences; pEsc<pEnd; pEsc++){
-/* #ifdef TCL_UTF_MAX */
-#if 0
-    {
-      int c = pEsc->value[0];
-      Tcl_UniCharToUtf(c, pEsc->value);
-    }
-#endif
-    ppHead = &apEscHash[EscHash(pEsc->zName)];
-    pEsc->pNext = *ppHead;
-    *ppHead = pEsc;
-    TestPoint(0);
-  }
-#ifdef TEST
-  EscHashStats();
-#endif
-}
-
-/*
-** Replacement table for the non-standard microsoft characters 0x80
-** through 0x9f. Entry [ch & 0x1f] is a plain ASCII stand-in for ch so
-** that the character is visible on Unix systems. Every replacement is
-** below 0x80, which avoids UTF-8 problems.
-*/
-#ifndef __WIN32__
-static char acMsChar[] = {
-  /* 0x80 - 0x87 */  'C',  ' ',  ',',  'f',  '"',  '.',  '*',  '*',
-  /* 0x88 - 0x8f */  '^',  '%',  'S',  '<',  'O',  ' ',  'Z',  ' ',
-  /* 0x90 - 0x97 */  ' ',  '\'', '\'', '"',  '"',  '*',  '-',  '-',
-  /* 0x98 - 0x9f */  '~',  '@',  's',  '>',  'o',  ' ',  'z',  'Y',
-};
-#endif
-
-/* Translate escape sequences in the string "z". "z" is overwritten
-** with the translated sequence. The translation never makes the string
-** longer, so it is done in place with a read index and a write index.
-**
-** Two forms are recognized, each with an optional trailing ';':
-**
-**    &name     looked up in the escape sequence hash table
-**    &#ddd     decimal character number
-**
-** Unrecognized escape sequences are unaltered. That includes "&#" that
-** is not followed by at least one decimal digit (for example "&#x41;")
-** and a numeric reference whose value is zero: writing the zero would
-** put a NUL into the middle of the text and truncate it in the build
-** without TCL_UTF_MAX.
-**
-** On systems other than __WIN32__ the non-standard microsoft characters
-** 0x80 through 0x9f, whether written literally or as &#ddd, are replaced
-** by the ASCII stand-ins of acMsChar[].
-**
-** Example:
-**
-** input = "AT&amp;T &gt MCI"
-** output = "AT&T > MCI"
-*/
-LOCAL void HtmlTranslateEscapes(char *z){
-  int from;               /* Read characters from this position in z[] */
-  int to;                 /* Write characters into this position in z[] */
-  int h;                  /* A hash on the escape sequence */
-  struct sgEsc *p;        /* For looping down the escape sequence collision chain */
-  static int isInit = 0;  /* True after initialization */
-
-  from = to = 0;
-  if( !isInit ){
-    EscInit();
-    isInit = 1;
-  }
-  while( z[from] ){
-    if( z[from]=='&' ){
-      if( z[from+1]=='#' ){
-        int i = from + 2;
-        int v = 0;
-        /* The <ctype.h> functions require an unsigned char value. */
-        while( isdigit((unsigned char)z[i]) ){
-          /* Stop accumulating once the value is past the last Unicode
-          ** code point so that a long run of digits cannot overflow v.
-          ** The digits themselves are still consumed. */
-          if( v<0x110000 ){
-            v = v*10 + z[i] - '0';
-          }
-          i++;
-        }
-        if( i==from+2 || v==0 ){
-          /* No digits, or character number zero: not a usable escape.
-          ** Copy the '&' and let the rest pass through as ordinary text. */
-          z[to++] = z[from++];
-          continue;
-        }
-        if( z[i]==';' ){ i++; }
-
-        /* On Unix systems, translate the non-standard microsoft
-        ** characters in the range of 0x80 to 0x9f into something
-        ** we can see.
-        */
-#ifndef __WIN32__
-        if( v>=0x80 && v<0xa0 ){
-          v = acMsChar[v&0x1f];
-        }
-#endif
-        /* Put the character in the output stream in place of
-        ** the "&#000;". How we do this depends on whether or
-        ** not we are using UTF-8.
-        */
-#ifdef TCL_UTF_MAX
-        {
-          int j, n;
-          char value[8];
-          n = Tcl_UniCharToUtf(v,value);
-          for(j=0; j<n; j++){
-            z[to++] = value[j];
-          }
-        }
-#else
-        z[to++] = v;
-#endif
-        from = i;
-      }else{
-        int i = from+1;
-        int c;
-        while( z[i] && isalnum((unsigned char)z[i]) ){ TestPoint(0); i++; }
-        /* Temporarily terminate the name in place so that it can be
-        ** hashed and compared, then put the saved character back. */
-        c = z[i];
-        z[i] = 0;
-        h = EscHash(&z[from+1]);
-        p = apEscHash[h];
-        while( p && strcmp(p->zName,&z[from+1])!=0 ){
-          p = p->pNext;
-        }
-        z[i] = c;
-        if( p ){
-          int j;
-          for(j=0; p->value[j]; j++){
-            z[to++] = p->value[j];
-          }
-          from = i;
-          if( c==';' ){
-            from++;
-          }
-        }else{
-          /* Unknown name: keep the '&' and carry on after it. */
-          z[to++] = z[from++];
-        }
-      }
-
-    /* On UNIX systems, look for the non-standard microsoft characters
-    ** between 0x80 and 0x9f and translate them into printable ASCII
-    ** codes. Separate algorithms are required to do this for plain
-    ** ascii and for utf-8.
-    */
-#ifndef __WIN32__
-#ifdef TCL_UTF_MAX
-    }else if( (z[from]&0x80)!=0 ){
-      Tcl_UniChar c;
-      int n;
-      n = Tcl_UtfToUniChar(&z[from], &c);
-      if( c>=0x80 && c<0xa0 ){
-        z[to++] = acMsChar[c & 0x1f];
-        from += n;
-      }else{
-        while( n-- ) z[to++] = z[from++];
-      }
-#else /* if !defined(TCL_UTF_MAX) */
-    }else if( ((unsigned char)z[from])>=0x80 && ((unsigned char)z[from])<0xa0 ){
-      z[to++] = acMsChar[z[from++]&0x1f];
-#endif /* TCL_UTF_MAX */
-#endif /* __WIN32__ */
-    }else{
-      z[to++] = z[from++];
-      TestPoint(0);
-    }
-  }
-  z[to] = 0;
-}
-/******************* End Escape Sequence Translator ***************/
-
-/******************* Begin HTML tokenizer code *******************/
-/*
-** The following variable becomes TRUE when the markup hash table
-** (apMap[], built by HtmlHashInit() from the entries of HtmlMarkupMap[])
-** is initialized.
-*/
-static int isInit = 0;
-
-/* The hash table for HTML markup names.
-**
-** If an HTML markup name hashes to H, then apMap[H] will point to
-** a linked list of HtmlTokenMap structures (chained through pCollide),
-** one of which will describe the particular markup (if it exists.)
-*/
-static HtmlTokenMap *apMap[HTML_MARKUP_HASH_SIZE];
-
-/* Hash a markup name
-**
-** HTML markup is case insensitive, so this function will give the
-** same hash regardless of the case of the markup name.
-**
-** The value returned is an integer between 0 and HTML_MARKUP_HASH_SIZE-1,
-** inclusive.
-**
-** The hash is accumulated in an unsigned integer. With a signed
-** accumulator the shift can overflow (undefined behavior), and the most
-** negative int cannot be negated, so the remainder could come out
-** negative and index in front of apMap[]. Characters are widened
-** through unsigned char because the <ctype.h> functions are undefined
-** for negative arguments.
-*/
-static int HtmlHash(const char *zName){
-  unsigned int h = 0;   /* The hash value being accumulated */
-  unsigned char c;      /* The next character of the name */
-
-  while( (c = (unsigned char)*zName)!=0 ){
-    if( isupper(c) ){
-      c = (unsigned char)tolower(c);
-    }
-    h = (h<<5) ^ h ^ c;
-    zName++;
-  }
-  return (int)(h % HTML_MARKUP_HASH_SIZE);
-}
-
-#ifdef TEST
-/*
-** Compute the longest and average collision chain length for the
-** markup hash table and print them on standard output.
-**
-** The loop covers all HTML_MARKUP_HASH_SIZE slots of apMap[], which is
-** how the table is declared; HTML_MARKUP_COUNT is the number of markup
-** names stored in it, not the number of slots. The chain pointer uses
-** the HtmlTokenMap type that apMap[] is declared with.
-*/
-static void HtmlHashStats(void){
-  int i;
-  int sum = 0;
-  int max = 0;
-  int cnt;
-  int notempty = 0;
-  HtmlTokenMap *p;
-
-  for(i=0; i<HTML_MARKUP_HASH_SIZE; i++){
-    cnt = 0;
-    p = apMap[i];
-    if( p ) notempty++;
-    while( p ){
-      cnt++;
-      p = p->pCollide;
-    }
-    sum += cnt;
-    if( cnt>max ) max = cnt;
-  }
-  /* An empty table would otherwise compute 0/0 for the average. */
-  printf("longest chain=%d avg=%g slots=%d empty=%d (%g%%)\n",
-    max, notempty ? (double)sum/(double)notempty : 0.0, i, i-notempty,
-    100.0*(i-notempty)/(double)i);
-}
-#endif
-
-/* Build the markup name hash table: each of the HTML_MARKUP_COUNT
-** entries of HtmlMarkupMap[] is pushed onto the front of the apMap[]
-** chain selected by the hash of its name. When TEST is defined the
-** chain statistics are printed afterwards.
-*/
-static void HtmlHashInit(void){
-  int iEntry;              /* Index into HtmlMarkupMap[] */
-  HtmlTokenMap **ppHead;   /* Head of the chain that receives the entry */
-
-  iEntry = 0;
-  while( iEntry<HTML_MARKUP_COUNT ){
-    ppHead = &apMap[HtmlHash(HtmlMarkupMap[iEntry].zName)];
-    HtmlMarkupMap[iEntry].pCollide = *ppHead;
-    *ppHead = &HtmlMarkupMap[iEntry];
-    iEntry++;
-  }
-#ifdef TEST
-  HtmlHashStats();
-#endif
-}
-
-/*
-** Link pElem onto the tail of the widget's doubly linked token list
-** (p->pFirst .. p->pLast) and count it in p->nToken.
-*/
-static void AppendElement(HtmlWidget *p, HtmlElement *pElem){
-  HtmlElement *pTail = p->pLast;   /* The tail before this append */
-
-  pElem->base.pNext = 0;
-  pElem->base.pPrev = pTail;
-  if( p->pFirst ){
-    pTail->base.pNext = pElem;
-  }else{
-    /* Empty list: the new element is also the head. */
-    p->pFirst = pElem;
-  }
-  p->pLast = pElem;
-  p->nToken += 1;
-}
-
-/*
-** Return the column that follows column iCol once character c has been
-** output: a newline goes back to column 0, a tab advances to the next
-** multiple of 8, and anything else advances by one.
-*/
-static int NextColumn(int iCol, char c){
-  if( c=='\n' ){
-    return 0;
-  }
-  if( c=='\t' ){
-    return (iCol | 7) + 1;
-  }
-  return iCol + 1;
-}
-
-/*
-** Convert a string to all lower-case letters, in place.
-**
-** Each character is widened through unsigned char before it is handed
-** to isupper()/tolower(): those functions are undefined for negative
-** arguments, which is what a plain char holding a byte of 0x80 or more
-** becomes on platforms where char is signed.
-*/
-static void ToLower(char *z){
-  while( *z ){
-    unsigned char c = (unsigned char)*z;
-    if( isupper(c) ) *z = (char)tolower(c);
-    z++;
-  }
-}
-
-/* Process as much of the input HTML as possible. Construct new
-** HtmlElement structures and append them to the list. Return
-** the number of characters actually processed.
-**
-** Parsing starts at offset p->nComplete of p->zText. Whenever the end
-** of the text is reached in the middle of a token (a word, a comment,
-** a markup, or the body of <SCRIPT> or <STYLE>) the routine stops in
-** front of that token, so that it is examined again after more text
-** has been appended.
-**
-** This routine may invoke a callback procedure which could delete
-** the HTML widget. In that case 0 is returned.
-**
-** This routine is not reentrant for the same HTML widget. To
-** prevent reentrancy (during a callback), the p->iCol field is
-** set to a negative number. This is a flag to future invocations
-** not to reenter this routine. The p->iCol field is restored
-** before exiting, of course.
-**
-** Characters are passed to the <ctype.h> functions as unsigned char,
-** because those functions are undefined for the negative values that a
-** plain char holds for bytes of 0x80 and above.
-*/
-static int Tokenize(
-  HtmlWidget *p        /* The HTML widget doing the parsing */
-){
-  char *z;             /* The input HTML text */
-  int c;               /* The next character of input */
-  int n;               /* Number of characters processed so far */
-  int iCol;            /* Column of input */
-  int i, j;            /* Loop counters */
-  int h;               /* Result from HtmlHash() */
-  int nByte;           /* Space allocated for a single HtmlElement */
-  HtmlElement *pElem;  /* A new HTML element */
-  int selfClose;       /* True for content free elements. Ex: <br/> */
-  int argc;            /* The number of arguments on a markup */
-  HtmlTokenMap *pMap;  /* For searching the markup name hash table */
-  char *zBuf;          /* For handing out buffer space */
-# define mxARG 200     /* Maximum number of parameters in a single markup */
-  char *argv[mxARG];   /* Pointers to each markup argument. */
-  int arglen[mxARG];   /* Length of each markup argument */
-
-  iCol = p->iCol;
-  n = p->nComplete;
-  z = p->zText;
-  if( iCol<0 ){ TestPoint(0); return n; }   /* Prevents recursion */
-  p->iCol = -1;
-  while( (c=z[n])!=0 ){
-    if( p->pScript ){
-      /* We are in the middle of <SCRIPT>...</SCRIPT>. Just look for
-      ** the </SCRIPT> markup. (later:) Treat <STYLE>...</STYLE> the
-      ** same way. */
-      HtmlScript *pScript = p->pScript;
-      char *zEnd;
-      int nEnd;
-      if( pScript->markup.base.type==Html_SCRIPT ){
-        zEnd = "</script>";
-        nEnd = 9;
-      }else{
-        zEnd = "</style>";
-        nEnd = 8;
-      }
-      if( pScript->zScript==0 ){
-        pScript->nScript = 0;
-      }
-      /* n stays at the first character of the script body until the end
-      ** tag is found, so the body always begins at &z[n]. Store that
-      ** address on every visit rather than only on the first one. */
-      pScript->zScript = &z[n];
-
-      /* Resume the search where the previous visit stopped, backed up by
-      ** nEnd-1 characters in case the end tag was only partly present
-      ** the last time. */
-      i = n + pScript->nScript - (nEnd-1);
-      if( i<n ){ i = n; }
-      for(; z[i]; i++){
-        if( z[i]=='<' && z[i+1]=='/' && strnicmp(&z[i],zEnd,nEnd)==0 ){
-          pScript->nScript = i - n;
-          p->pScript = 0;
-          n = i+nEnd;
-          break;
-        }
-      }
-      if( p->pScript ){
-        /* No end tag yet. Remember how far the search got and wait for
-        ** more text. Going around the main loop again instead would spin
-        ** forever, because n has not advanced. */
-        pScript->nScript = i - n;
-        goto incomplete;
-      }
-      continue;
-    }else if( isspace((unsigned char)c) ){
-      /* White space */
-      for(i=0; (c=z[n+i])!=0 && isspace((unsigned char)c)
-               && c!='\n' && c!='\r'; i++){}
-      if( c=='\r' && z[n+i+1]=='\n' ){ i++; }
-      pElem = HtmlAlloc( sizeof(HtmlSpaceElement) );
-      if( pElem==0 ){ goto incomplete; }
-      pElem->base.type = Html_Space;
-      if( c=='\n' || c=='\r' ){
-        pElem->base.flags = HTML_NewLine;
-        pElem->base.count = 1;
-        i++;
-        iCol = 0;
-        TestPoint(0);
-      }else{
-        int iColStart = iCol;
-        pElem->base.flags = 0;
-        for(j=0; j<i; j++){
-          iCol = NextColumn(iCol, z[n+j]);
-          TestPoint(0);
-        }
-        pElem->base.count = iCol - iColStart;
-      }
-      AppendElement(p,pElem);
-      n += i;
-    }else if( c!='<' || p->iPlaintext!=0 ||
-      (!isalpha((unsigned char)z[n+1]) && z[n+1]!='/' && z[n+1]!='!'
-        && z[n+1]!='?') ){
-      /* Ordinary text */
-      for(i=1; (c=z[n+i])!=0 && !isspace((unsigned char)c) && c!='<'; i++){}
-      if( c==0 ){ TestPoint(0); goto incomplete; }
-      if( p->iPlaintext!=0 && z[n]=='<' ){
-        /* Inside plain text only the matching end tag counts as markup. */
-        switch( p->iPlaintext ){
-          case Html_LISTING:
-            if( i>=10 && strnicmp(&z[n],"</listing>",10)==0 ){
-              p->iPlaintext = 0;
-              goto doMarkup;
-            }
-            break;
-          case Html_XMP:
-            if( i>=6 && strnicmp(&z[n],"</xmp>",6)==0 ){
-              p->iPlaintext = 0;
-              goto doMarkup;
-            }
-            break;
-          case Html_TEXTAREA:
-            if( i>=11 && strnicmp(&z[n],"</textarea>",11)==0 ){
-              p->iPlaintext = 0;
-              goto doMarkup;
-            }
-            break;
-          default:
-            break;
-        }
-      }
-      nByte = sizeof(HtmlTextElement) + i;
-      pElem = HtmlAlloc( nByte );
-      if( pElem==0 ){ goto incomplete; }
-      memset(pElem,0,nByte);
-      pElem->base.type = Html_Text;
-      sprintf(pElem->text.zText,"%.*s",i,&z[n]);
-      AppendElement(p,pElem);
-      if( p->iPlaintext==0 || p->iPlaintext==Html_TEXTAREA ){
-        HtmlTranslateEscapes(pElem->text.zText);
-      }
-      pElem->base.count = strlen(pElem->text.zText);
-      n += i;
-      iCol += i;
-    }else if( strncmp(&z[n],"<!--",4)==0 ){
-      /* An HTML comment. Just skip it. */
-      for(i=4; z[n+i]; i++){
-        if( z[n+i]=='-' && strncmp(&z[n+i],"-->",3)==0 ){ break; }
-      }
-      if( z[n+i]==0 ){ TestPoint(0); goto incomplete; }
-      for(j=0; j<i+3; j++){
-        iCol = NextColumn(iCol, z[n+j]);
-      }
-      n += i + 3;
-    }else{
-      /* Markup.
-      **
-      ** First get the name of the markup
-      */
-doMarkup:
-      argc = 1;
-      argv[0] = &z[n+1];
-      for(i=1; (c=z[n+i])!=0 && !isspace((unsigned char)c) && c!='>'
-               && (i<2 || c!='/'); i++){}
-      arglen[0] = i - 1;
-      if( c==0 ){ goto incomplete; }
-
-      /*
-      ** Now parse up the arguments. Names and values are stored in
-      ** alternating argv[] slots; a name without "=value" gets an empty
-      ** string for its value.
-      */
-      while( isspace((unsigned char)z[n+i]) ){ i++; }
-      while( (c=z[n+i])!=0 && c!='>' && (c!='/' || z[n+i+1]!='>') ){
-        if( argc>mxARG-3 ){
-          /* Too many arguments: keep overwriting the last pair. */
-          argc = mxARG-3;
-        }
-        argv[argc] = &z[n+i];
-        j = 0;
-        while( (c=z[n+i+j])!=0 && !isspace((unsigned char)c) && c!='>'
-               && c!='=' && (c!='/' || z[n+i+j+1]!='>') ){
-          j++;
-        }
-        arglen[argc] = j;
-        if( c==0 ){ goto incomplete; }
-        i += j;
-        while( isspace((unsigned char)c) ){
-          i++;
-          c = z[n+i];
-        }
-        if( c==0 ){ goto incomplete; }
-        argc++;
-        if( c!='=' ){
-          argv[argc] = "";
-          arglen[argc] = 0;
-          argc++;
-          continue;
-        }
-        i++;
-        c = z[n+i];
-        while( isspace((unsigned char)c) ){
-          i++;
-          c = z[n+i];
-        }
-        if( c==0 ){ goto incomplete; }
-        if( c=='\'' || c=='"' ){
-          int cQuote = c;
-          i++;
-          argv[argc] = &z[n+i];
-          for(j=0; (c=z[n+i+j])!=0 && c!=cQuote; j++){}
-          if( c==0 ){ goto incomplete; }
-          arglen[argc] = j;
-          i += j+1;
-          TestPoint(0);
-        }else{
-          argv[argc] = &z[n+i];
-          for(j=0; (c=z[n+i+j])!=0 && !isspace((unsigned char)c)
-                   && c!='>'; j++){}
-          if( c==0 ){ goto incomplete; }
-          arglen[argc] = j;
-          i += j;
-        }
-        argc++;
-        while( isspace((unsigned char)z[n+i]) ){ i++; }
-      }
-      if( c=='/' ){
-        i++;
-        c = z[n+i];
-        selfClose = 1;
-      }else{
-        selfClose = 0;
-      }
-      if( c==0 ){ goto incomplete; }
-      for(j=0; j<i+1; j++){
-        iCol = NextColumn(iCol, z[n+j]);
-      }
-      n += i + 1;
-
-      /* Lookup the markup name in the hash table. The name is
-      ** terminated in place for the lookup and restored afterwards.
-      */
-      if( !isInit ){
-        HtmlHashInit();
-        isInit = 1;
-      }
-      c = argv[0][arglen[0]];
-      argv[0][arglen[0]] = 0;
-      h = HtmlHash(argv[0]);
-      for(pMap = apMap[h]; pMap; pMap=pMap->pCollide){
-        if( stricmp(pMap->zName,argv[0])==0 ){ break; }
-        TestPoint(0);
-      }
-      argv[0][arglen[0]] = c;
-      if( pMap==0 ){ continue; }  /* Ignore unknown markup */
-
-makeMarkupEntry:
-      /* Construct a HtmlMarkup entry for this markup. The element, its
-      ** argv[] array and the argument strings share one allocation.
-      */
-      if( pMap->extra ){
-        nByte = pMap->extra;
-      }else if( argc==1 ){
-        nByte = sizeof(HtmlBaseElement);
-      }else{
-        nByte = sizeof(HtmlMarkupElement);
-      }
-      if( argc>1 ){
-        nByte += sizeof(char*) * argc;
-        for(j=1; j<argc; j++){
-          nByte += arglen[j] + 1;
-        }
-      }
-      pElem = HtmlAlloc( nByte );
-      if( pElem==0 ){ goto incomplete; }
-      memset(pElem,0,nByte);
-      pElem->base.type = pMap->type;
-      pElem->base.count = argc - 1;
-      if( argc>1 ){
-        if( pMap->extra ){
-          pElem->markup.argv = (char**)&((char*)pElem)[pMap->extra];
-        }else{
-          pElem->markup.argv = (char**)&((HtmlMarkupElement*)pElem)[1];
-        }
-        zBuf = (char*)&pElem->markup.argv[argc];
-        for(j=1; j<argc; j++){
-          pElem->markup.argv[j-1] = zBuf;
-          zBuf += arglen[j] + 1;
-          sprintf(pElem->markup.argv[j-1],"%.*s",arglen[j],argv[j]);
-          HtmlTranslateEscapes(pElem->markup.argv[j-1]);
-          if( (j&1)==1 ){
-            /* Odd slots hold argument names, which are folded to lower
-            ** case. */
-            ToLower(pElem->markup.argv[j-1]);
-          }
-        }
-        pElem->markup.argv[argc-1] = 0;
-      }
-
-      /* The new markup has now been constructed in pElem. But before
-      ** appending to the list, check to see if there is a special
-      ** handler for this markup type.
-      */
-      if( p->zHandler[pMap->type] ){
-        Tcl_DString str;
-        Tcl_DStringInit(&str);
-        Tcl_DStringAppend(&str, p->zHandler[pMap->type], -1);
-        Tcl_DStringAppendElement(&str, pMap->zName);
-        Tcl_DStringStartSublist(&str);
-        for(j=0; j<argc-1; j++){
-          Tcl_DStringAppendElement(&str, pElem->markup.argv[j]);
-        }
-        Tcl_DStringEndSublist(&str);
-        HtmlFree(pElem);
-        HtmlLock(p);
-        Tcl_GlobalEval(p->interp, Tcl_DStringValue(&str));
-        Tcl_DStringFree(&str);
-        if( HtmlUnlock(p) ){ return 0; }
-
-        /* Tricky, tricky. The callback might have caused the p->zText
-        ** pointer to change, so renew our copy of that pointer. The
-        ** callback might also have cleared or destroyed the widget.
-        ** If so, abort this routine.
-        */
-        z = p->zText;
-        if( z==0 || p->tkwin==0 ){
-          n = 0;
-          iCol = 0;
-          goto incomplete;
-        }
-        continue;
-      }
-
-      /* No special handler for this markup. Just append it to the
-      ** list of all tokens.
-      */
-      AppendElement(p,pElem);
-      switch( pMap->type ){
-        case Html_PLAINTEXT:
-        case Html_LISTING:
-        case Html_XMP:
-        case Html_TEXTAREA:
-          p->iPlaintext = pMap->type;
-          break;
-        case Html_STYLE:
-        case Html_SCRIPT:
-          p->pScript = (HtmlScript*)pElem;
-          break;
-        default:
-          break;
-      }
-
-      /* If this is self-closing markup (ex: <br/> or <img/>) then
-      ** synthesize a closing token. The test relies on the entry for
-      ** the end tag, when there is one, directly following the entry
-      ** for the start tag in the markup table.
-      */
-      if( selfClose && argv[0][0]!='/'
-          && strcmp(&pMap[1].zName[1],pMap->zName)==0 ){
-        selfClose = 0;
-        pMap++;
-        argc = 1;
-        goto makeMarkupEntry;
-      }
-    }
-  }
-incomplete:
-  p->iCol = iCol;
-  return n;
-}
-/************************** End HTML Tokenizer Code ***************************/
-
-/*
-** Append text to the tokenizer engine.
-**
-** zText is copied onto the end of htmlPtr->zText, growing that buffer
-** when necessary, and then as much of the accumulated text as possible
-** is tokenized. htmlPtr->nComplete records how far tokenizing got.
-**
-** If the buffer cannot be grown the new text is dropped and the widget
-** is left as it was: the result of HtmlRealloc() is kept in a temporary
-** until it is known to be usable, so the existing buffer is not lost
-** and nAlloc never describes memory that was not obtained.
-**
-** This routine (actually the Tokenize() subroutine that is called
-** by this routine) may invoke a callback procedure which could delete
-** the HTML widget.
-*/
-void HtmlTokenizerAppend(HtmlWidget *htmlPtr, const char *zText){
-  int len = strlen(zText);
-  if( htmlPtr->nText==0 ){
-    /* NOTE(review): this assumes no buffer is held while nText==0 -
-    ** confirm against the code that clears the widget. */
-    htmlPtr->nAlloc = len + 100;
-    htmlPtr->zText = HtmlAlloc( htmlPtr->nAlloc );
-    TestPoint(0);
-  }else if( htmlPtr->nText + len >= htmlPtr->nAlloc ){
-    int nNew = htmlPtr->nAlloc + len + 100;
-    char *zNew = HtmlRealloc( htmlPtr->zText, nNew );
-    TestPoint(0);
-    if( zNew==0 ){
-      UNTESTED;
-      return;
-    }
-    htmlPtr->zText = zNew;
-    htmlPtr->nAlloc = nNew;
-  }
-  if( htmlPtr->zText==0 ){
-    htmlPtr->nText = 0;
-    UNTESTED;
-    return;
-  }
-  strcpy(&htmlPtr->zText[htmlPtr->nText], zText);
-  htmlPtr->nText += len;
-  htmlPtr->nComplete = Tokenize(htmlPtr);
-}
-
-/*
-** This routine takes a text representation of a token, converts
-** it into an HtmlElement structure and inserts it immediately
-** prior to pToken. If pToken==0, then the newly created HtmlElement
-** is appended.
-**
-** zArgs is split as a Tcl list. Every element of the list is copied
-** into the markup.argv[] array of the new element, base.count is set to
-** the number of elements, and the array is terminated with a null
-** pointer.
-**
-** This routine does nothing to resize, restyle, relayout or redisplay
-** the HTML. That is the calling routine's responsibility.
-**
-** Return 0 if successful. Return non-zero if zType is not a known
-** markup name, if zArgs is not a well formed list, or if memory for
-** the element cannot be obtained.
-*/
-int HtmlInsertToken(
-  HtmlWidget *htmlPtr,   /* The widget into which the token is inserted */
-  HtmlElement *pToken,   /* Insert before this. Append if pToken==0 */
-  char *zType,           /* Type of markup. Ex: "/a" or "table" */
-  char *zArgs            /* List of arguments */
-){
-  HtmlTokenMap *pMap;    /* For searching the markup name hash table */
-  int h;                 /* The hash on zType */
-  HtmlElement *pElem;    /* The new element */
-  int nByte;             /* How many bytes to allocate */
-  int i;                 /* Loop counter */
-
-  if( !isInit ){
-    HtmlHashInit();
-    isInit = 1;
-    TestPoint(0);
-  }else{
-    TestPoint(0);
-  }
-  h = HtmlHash(zType);
-  for(pMap = apMap[h]; pMap; pMap=pMap->pCollide){
-    if( stricmp(pMap->zName,zType)==0 ){ TestPoint(0); break; }
-    TestPoint(0);
-  }
-  if( pMap==0 ){ TestPoint(0); return 1; }
-
-  if( zArgs==0 || *zArgs==0 ){
-    /* Special case of no arguments. This is a lot easier... */
-    nByte = pMap->extra ? pMap->extra : sizeof(HtmlBaseElement);
-    nByte += strlen(zType);
-    pElem = HtmlAlloc( nByte );
-    if( pElem==0 ){ TestPoint(0); return 1; }
-    memset(pElem,0,nByte);
-    pElem->base.type = pMap->type;
-    TestPoint(0);
-  }else{
-    /* The general case. There are arguments that need to be parsed
-    ** up. This is slower, but we gotta do it.
-    */
-    int argc;
-    const char **argv;
-    char *zBuf;
-
-    if( Tcl_SplitList(htmlPtr->interp, zArgs, &argc, (const char***)&argv)!=TCL_OK ){
-      TestPoint(0);
-      return 1;
-    }
-    if( pMap->extra ){
-      nByte = pMap->extra;
-      TestPoint(0);
-    }else{
-      nByte = sizeof(HtmlMarkupElement);
-      TestPoint(0);
-    }
-    /* Room for argc+1 pointers followed by the strings. The strings,
-    ** terminators included, need at most strlen(zArgs)+argc bytes. */
-    nByte += sizeof(char*)*(argc+1) + strlen(zArgs) + argc + 2;
-    pElem = HtmlAlloc( nByte );
-    if( pElem==0 ){
-      HtmlFree(argv);
-      TestPoint(0);
-      return 1;
-    }
-    memset(pElem,0,nByte);
-    pElem->base.type = pMap->type;
-    pElem->base.count = argc;
-    if( pMap->extra ){
-      pElem->markup.argv = (char**)&((char*)pElem)[pMap->extra];
-      TestPoint(0);
-    }else{
-      pElem->markup.argv = (char**)&((HtmlMarkupElement*)pElem)[1];
-      TestPoint(0);
-    }
-    /* Unlike the argv[] that Tokenize() builds, the list produced by
-    ** Tcl_SplitList() has no markup name in slot 0, so the copy starts
-    ** with the first element. Starting at the second one would drop an
-    ** argument and leave a null pointer inside the first base.count
-    ** slots. */
-    zBuf = (char*)&pElem->markup.argv[argc+1];
-    for(i=0; i<argc; i++){
-      pElem->markup.argv[i] = zBuf;
-      zBuf += strlen(argv[i]) + 1;
-      strcpy(pElem->markup.argv[i],argv[i]);
-      TestPoint(0);
-    }
-    pElem->markup.argv[argc] = 0;
-    HtmlFree(argv);
-    TestPoint(0);
-  }
-  if( pToken ){
-    /* Splice the new element into the list in front of pToken. */
-    pElem->base.pNext = pToken;
-    pElem->base.pPrev = pToken->base.pPrev;
-    if( pToken->base.pPrev ){
-      pToken->base.pPrev->pNext = pElem;
-      TestPoint(0);
-    }else{
-      htmlPtr->pFirst = pElem;
-      TestPoint(0);
-    }
-    pToken->base.pPrev = pElem;
-    htmlPtr->nToken++;
-  }else{
-    AppendElement(htmlPtr,pElem);
-    TestPoint(0);
-  }
-  return 0;
-}
-
-/*
-** Look up a markup name, without regard to case, and return its type
-** code. Html_Unknown is returned for a name that is not in the markup
-** table. The markup hash table is built on first use.
-*/
-int HtmlNameToType(const char *zType){
-  HtmlTokenMap *pEntry;   /* Walks the collision chain for zType */
-
-  if( isInit ){
-    TestPoint(0);
-  }else{
-    HtmlHashInit();
-    isInit = 1;
-    TestPoint(0);
-  }
-  pEntry = apMap[HtmlHash(zType)];
-  while( pEntry ){
-    if( stricmp(pEntry->zName,zType)==0 ){
-      TestPoint(0);
-      return pEntry->type;
-    }
-    TestPoint(0);
-    pEntry = pEntry->pCollide;
-  }
-  return Html_Unknown;
-}
-
-/*
-** Convert a type into a symbolic name. "???" is returned for a type
-** that is not a markup type.
-**
-** The name is taken from HtmlMarkupMap[], indexed by the offset of the
-** type from the type of the first table entry, which is the way
-** HtmlTokenName() and HtmlTclizeList() find names. apMap[] is indexed
-** by hash value and must not be indexed by type: the slot found that
-** way holds an unrelated markup or no markup at all.
-*/
-const char *HtmlTypeToName(int type){
-  if( type>=Html_A && type<=Html_EndXMP ){
-    int idx = type - HtmlMarkupMap[0].type;
-    if( idx>=0 && idx<HTML_MARKUP_COUNT ){
-      TestPoint(0);
-      return HtmlMarkupMap[idx].zName;
-    }
-  }
-  TestPoint(0);
-  return "???";
-}
-
-/*
-** For debugging purposes, return a printable description of a token.
-**
-** The description is built in a static buffer, so it is only valid
-** until the next call. Text that would not fit is cut short: every
-** write into the buffer is bounded, and long text, block strings and
-** argument lists are truncated instead of running past the end of it.
-**
-** When DEBUG is not defined this routine returns 0.
-*/
-char *HtmlTokenName(HtmlElement *p){
-#ifdef DEBUG
-  static char zBuf[200];
-  int j;
-  int nShow;       /* Number of characters of text to show */
-  size_t used;     /* Number of characters of zBuf[] in use */
-  char *zName;
-
-  if( p==0 ) return "NULL";
-  switch( p->base.type ){
-    case Html_Text:
-      nShow = p->base.count;
-      if( nShow>150 ) nShow = 150;
-      snprintf(zBuf,sizeof(zBuf),"\"%.*s\"",nShow,p->text.zText);
-      break;
-    case Html_Space:
-      if( p->base.flags & HTML_NewLine ){
-        snprintf(zBuf,sizeof(zBuf),"\"\\n\"");
-      }else{
-        snprintf(zBuf,sizeof(zBuf),"\" \"");
-      }
-      break;
-    case Html_Block:
-      if( p->block.n>0 ){
-        nShow = p->block.n;
-        if( nShow>150 ) nShow = 150;
-        snprintf(zBuf,sizeof(zBuf),"<Block z=\"%.*s\">", nShow, p->block.z);
-      }else{
-        snprintf(zBuf,sizeof(zBuf),"<Block>");
-      }
-      break;
-    default:
-      if( p->base.type >= HtmlMarkupMap[0].type
-       && p->base.type <= HtmlMarkupMap[HTML_MARKUP_COUNT-1].type ){
-        zName = HtmlMarkupMap[p->base.type - HtmlMarkupMap[0].type].zName;
-      }else{
-        zName = "Unknown";
-      }
-      /* One byte is held back in every write below so that the closing
-      ** '>' always fits. */
-      snprintf(zBuf,sizeof(zBuf)-1,"<%s",zName);
-      used = strlen(zBuf);
-      for(j=1; j<p->base.count && used<sizeof(zBuf)-2; j += 2){
-        snprintf(&zBuf[used],sizeof(zBuf)-1-used," %s=%s",
-          p->markup.argv[j-1],p->markup.argv[j]);
-        used = strlen(zBuf);
-      }
-      strcat(zBuf,">");
-      break;
-  }
-  return zBuf;
-#else
-  return 0;
-#endif
-}
-
-/*
-** Describe the tokens from p up to, but not including, pEnd as a Tcl
-** list and leave that list in the result of interp.
-**
-** Text becomes a {Text string} sublist, white space a "Space N NL"
-** element, a token of unknown type the element "Unknown", and markup a
-** {Markup name arg ...} sublist. Block tokens produce nothing.
-*/
-void HtmlTclizeList(Tcl_Interp *interp, HtmlElement *p, HtmlElement *pEnd){
-  Tcl_DString str;
-  int i;
-  char *zName;
-  char zLine[100];
-
-  Tcl_DStringInit(&str);
-  for(; p && p!=pEnd; p = p->pNext){
-    if( p->base.type==Html_Block ){
-      continue;
-    }
-    if( p->base.type==Html_Text ){
-      Tcl_DStringStartSublist(&str);
-      Tcl_DStringAppendElement(&str,"Text");
-      Tcl_DStringAppendElement(&str, p->text.zText);
-      Tcl_DStringEndSublist(&str);
-      continue;
-    }
-    if( p->base.type==Html_Space ){
-      sprintf(zLine,"Space %d %d",
-        p->base.count, (p->base.flags & HTML_NewLine)!=0);
-      Tcl_DStringAppendElement(&str,zLine);
-      continue;
-    }
-    if( p->base.type==Html_Unknown ){
-      Tcl_DStringAppendElement(&str,"Unknown");
-      continue;
-    }
-
-    /* Everything else is markup: its name, then each of its arguments. */
-    zName = "Unknown";
-    if( p->base.type >= HtmlMarkupMap[0].type
-     && p->base.type <= HtmlMarkupMap[HTML_MARKUP_COUNT-1].type ){
-      zName = HtmlMarkupMap[p->base.type - HtmlMarkupMap[0].type].zName;
-    }
-    Tcl_DStringStartSublist(&str);
-    Tcl_DStringAppendElement(&str,"Markup");
-    Tcl_DStringAppendElement(&str, zName);
-    for(i=0; i<p->base.count; i++){
-      Tcl_DStringAppendElement(&str, p->markup.argv[i]);
-    }
-    Tcl_DStringEndSublist(&str);
-  }
-  Tcl_DStringResult(interp, &str);
-}
-
-/*
-** Print the tokens from p up to, but not including, pEnd on standard
-** output, one line per token. Only compiled when DEBUG is defined.
-**
-** Token addresses are printed with %p. Converting a pointer to int for
-** a %x conversion loses the upper half of the address wherever pointers
-** are wider than int.
-*/
-#ifdef DEBUG
-void HtmlPrintList(HtmlElement *p, HtmlElement *pEnd){
-  while( p && p!=pEnd ){
-    if( p->base.type==Html_Block ){
-      char *z = p->block.z;
-      int n = p->block.n;
-      if( n==0 || z==0 ){
-        /* Nothing to show: print an empty string. */
-        n = 1;
-        z = "";
-      }
-      printf("Block %p flags=%02x cnt=%d x=%d..%d y=%d..%d z=\"%.*s\"\n",
-        (void*)p, p->base.flags, p->base.count, p->block.left, p->block.right,
-        p->block.top, p->block.bottom, n, z);
-    }else{
-      printf("Token %p font=%2d color=%2d align=%d flags=0x%04x name=%s\n",
-        (void*)p, p->base.style.font, p->base.style.color,
-        p->base.style.align, p->base.style.flags, HtmlTokenName(p));
-    }
-    p = p->pNext;
-  }
-}
-#endif