/*
** A tokenizer that converts raw HTML into a linked list of HTML elements.
**
** Copyright (C) 1997-2000 D. Richard Hipp
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Library General Public
** License as published by the Free Software Foundation; either
** version 2 of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Library General Public License for more details.
**
** You should have received a copy of the GNU Library General Public
** License along with this library; if not, write to the
** Free Software Foundation, Inc., 59 Temple Place - Suite 330,
** Boston, MA 02111-1307, USA.
**
** Author contact information:
** drh@acm.org
** http://www.hwaci.com/drh/
*/
#include
#include
#include
#include
#include
#include "htmlparse.h"
/****************** Begin Escape Sequence Translator *************/
/*
** The next section of code implements routines used to translate
** the '&' escape sequences of SGML to individual characters.
** Examples:
**
**         &amp;          &
**         &lt;           <
**         &gt;           >
**         &nbsp;         nonbreakable space
*/
/* Each escape sequence is recorded as an instance of the following
** structure.  Instances are stored in the esc_sequences[] table and
** are linked through pNext into per-bucket collision chains.
*/
struct sgEsc {
char *zName; /* The name of this escape sequence. ex: "amp" */
char value[8]; /* The replacement text for this sequence. ex: "&" */
struct sgEsc *pNext; /* Next sequence with the same hash on zName */
};
/* The following is a table of all escape sequences. Add new sequences
** by adding entries to this table.
**
** Each entry is { name, replacement text, 0 }.  The third field is the
** pNext collision link, which starts out as 0 in every entry.  From
** "iexcl" onward each replacement is a single byte written as an octal
** escape (\241 through \377); these appear to be ISO-8859-1 codes.
** "nbsp" is replaced by an ordinary space character.
*/
static struct sgEsc esc_sequences[] = {
{ "quot", "\"", 0 },
{ "amp", "&", 0 },
{ "lt", "<", 0 },
{ "gt", ">", 0 },
{ "nbsp", " ", 0 },
{ "iexcl", "\241", 0 },
{ "cent", "\242", 0 },
{ "pound", "\243", 0 },
{ "curren", "\244", 0 },
{ "yen", "\245", 0 },
{ "brvbar", "\246", 0 },
{ "sect", "\247", 0 },
{ "uml", "\250", 0 },
{ "copy", "\251", 0 },
{ "ordf", "\252", 0 },
{ "laquo", "\253", 0 },
{ "not", "\254", 0 },
{ "shy", "\255", 0 },
{ "reg", "\256", 0 },
{ "macr", "\257", 0 },
{ "deg", "\260", 0 },
{ "plusmn", "\261", 0 },
{ "sup2", "\262", 0 },
{ "sup3", "\263", 0 },
{ "acute", "\264", 0 },
{ "micro", "\265", 0 },
{ "para", "\266", 0 },
{ "middot", "\267", 0 },
{ "cedil", "\270", 0 },
{ "sup1", "\271", 0 },
{ "ordm", "\272", 0 },
{ "raquo", "\273", 0 },
{ "frac14", "\274", 0 },
{ "frac12", "\275", 0 },
{ "frac34", "\276", 0 },
{ "iquest", "\277", 0 },
{ "Agrave", "\300", 0 },
{ "Aacute", "\301", 0 },
{ "Acirc", "\302", 0 },
{ "Atilde", "\303", 0 },
{ "Auml", "\304", 0 },
{ "Aring", "\305", 0 },
{ "AElig", "\306", 0 },
{ "Ccedil", "\307", 0 },
{ "Egrave", "\310", 0 },
{ "Eacute", "\311", 0 },
{ "Ecirc", "\312", 0 },
{ "Euml", "\313", 0 },
{ "Igrave", "\314", 0 },
{ "Iacute", "\315", 0 },
{ "Icirc", "\316", 0 },
{ "Iuml", "\317", 0 },
{ "ETH", "\320", 0 },
{ "Ntilde", "\321", 0 },
{ "Ograve", "\322", 0 },
{ "Oacute", "\323", 0 },
{ "Ocirc", "\324", 0 },
{ "Otilde", "\325", 0 },
{ "Ouml", "\326", 0 },
{ "times", "\327", 0 },
{ "Oslash", "\330", 0 },
{ "Ugrave", "\331", 0 },
{ "Uacute", "\332", 0 },
{ "Ucirc", "\333", 0 },
{ "Uuml", "\334", 0 },
{ "Yacute", "\335", 0 },
{ "THORN", "\336", 0 },
{ "szlig", "\337", 0 },
{ "agrave", "\340", 0 },
{ "aacute", "\341", 0 },
{ "acirc", "\342", 0 },
{ "atilde", "\343", 0 },
{ "auml", "\344", 0 },
{ "aring", "\345", 0 },
{ "aelig", "\346", 0 },
{ "ccedil", "\347", 0 },
{ "egrave", "\350", 0 },
{ "eacute", "\351", 0 },
{ "ecirc", "\352", 0 },
{ "euml", "\353", 0 },
{ "igrave", "\354", 0 },
{ "iacute", "\355", 0 },
{ "icirc", "\356", 0 },
{ "iuml", "\357", 0 },
{ "eth", "\360", 0 },
{ "ntilde", "\361", 0 },
{ "ograve", "\362", 0 },
{ "oacute", "\363", 0 },
{ "ocirc", "\364", 0 },
{ "otilde", "\365", 0 },
{ "ouml", "\366", 0 },
{ "divide", "\367", 0 },
{ "oslash", "\370", 0 },
{ "ugrave", "\371", 0 },
{ "uacute", "\372", 0 },
{ "ucirc", "\373", 0 },
{ "uuml", "\374", 0 },
{ "yacute", "\375", 0 },
{ "thorn", "\376", 0 },
{ "yuml", "\377", 0 },
};
/* The size of the handler hash table. For best results this should
** be a prime number which is about the same size as the number of
** escape sequences known to the system.
**
** NOTE(review): the value below is simply the number of entries in
** esc_sequences[] plus 7; nothing here guarantees that it is prime.
** Because it is built from sizeof, the expression has an unsigned type.
*/
#define ESC_HASH_SIZE (sizeof(esc_sequences)/sizeof(esc_sequences[0])+7)
/* The hash table
**
** If the name of an escape sequence hashes to the value H, then
** apEscHash[H] will point to a linked list of sgEsc structures, one of
** which will be the sgEsc structure for that escape sequence.
*/
static struct sgEsc *apEscHash[ESC_HASH_SIZE];
/* Hash an escape sequence name.  The value returned is an integer
** between 0 and ESC_HASH_SIZE-1, inclusive.
**
** The hash is accumulated in an unsigned integer.  The previous
** version used a signed int, so the repeated "h<<5" could overflow
** (undefined behavior for signed types) and the follow-up "h = -h"
** was itself undefined when h was INT_MIN.  Unsigned arithmetic wraps
** around, so no sign fix-up is needed before the modulus.
**
** The particular bucket a name lands in is not significant; what
** matters is that the same name always yields the same bucket.
*/
static int EscHash(const char *zName){
  unsigned int h = 0;    /* The hash value being accumulated */
  unsigned char c;       /* The next character in the name being hashed */

  while( (c = (unsigned char)*zName)!=0 ){
    h = (h<<5) ^ h ^ c;
    zName++;
    TestPoint(0);
  }
  TestPoint(0);
  return (int)(h % ESC_HASH_SIZE);
}
#ifdef TEST
/*
** Compute the longest and average collision chain length for the
** escape sequence hash table
*/
static void EscHashStats(void){
int i;
int sum = 0;
int max = 0;
int cnt;
int notempty = 0;
struct sgEsc *p;
for(i=0; ipNext;
}
sum += cnt;
if( cnt>max ) max = cnt;
}
printf("Longest chain=%d avg=%g slots=%d empty=%d (%g%%)\n",
max,(double)sum/(double)notempty, i, i-notempty,
100.0*(i-notempty)/(double)i);
}
#endif
/* Initialize the escape sequence hash table
**
** NOTE(review): this region is damaged.  The "for" statement below is
** cut off right after "i" and runs straight into the tail of what looks
** like a separate character table (the acMsChar[] array that
** HtmlTranslateEscapes() indexes with a value masked by 0x1f).  The rest
** of the loop, the end of this function, the start of that table and its
** entries below 0x9c are all missing.  They must be restored from a
** clean copy of the file; they cannot be recovered from this text.
*/
static void EscInit(void){
int i; /* For looping thru the list of escape sequences */
int h; /* The hash on a sequence */
for(i=0; i',
/* 0x9c */ 'o',
/* 0x9d */ ' ',
/* 0x9e */ 'z',
/* 0x9f */ 'Y',
};
#endif
/* Translate escape sequences in the string "z". "z" is overwritten
** with the translated sequence.
**
** Unrecognized escape sequences are unaltered.
**
** Example:
**
** input = "AT&amp;T &gt; MCI"
** output = "AT&T > MCI"
**
** The first call runs EscInit() once, guarded by a static flag.
**
** NOTE(review): the middle of this function is damaged.  The text is
** cut off inside the "for(j=0; j..." loop of the TCL_UTF_MAX branch and
** resumes inside the collision-chain search for named escapes.  The end
** of the numeric "&#NNN;" branch and the start of the named-escape
** branch (where "i", "c", "h" and "p" get their values) are missing.
*/
LOCAL void HtmlTranslateEscapes(char *z){
int from; /* Read characters from this position in z[] */
int to; /* Write characters into this position in z[] */
int h; /* A hash on the escape sequence */
struct sgEsc *p; /* For looping down the escape sequence collision chain */
static int isInit = 0; /* True after initialization */
from = to = 0;
if( !isInit ){
EscInit();
isInit = 1;
}
while( z[from] ){
if( z[from]=='&' ){
if( z[from+1]=='#' ){
/* Numeric escape: accumulate the decimal digits that follow "&#"
** into v and step over an optional terminating ';'. */
int i = from + 2;
int v = 0;
while( isdigit(z[i]) ){
v = v*10 + z[i] - '0';
i++;
}
if( z[i]==';' ){ i++; }
/* On Unix systems, translate the non-standard microsoft
** characters in the range of 0x80 to 0x9f into something
** we can see.
*/
#ifndef __WIN32__
if( v>=0x80 && v<0xa0 ){
v = acMsChar[v&0x1f];
}
#endif
/* Put the character in the output stream in place of
** the "&#000;". How we do this depends on whether or
** not we are using UTF-8.
*/
#ifdef TCL_UTF_MAX
{
int j, n;
char value[8];
n = Tcl_UniCharToUtf(v,value);
/* NOTE(review): source text is truncated on the next line. */
for(j=0; jzName,&z[from+1])!=0 ){
p = p->pNext;
}
z[i] = c;
if( p ){
/* Known escape: copy its replacement text to the output and skip
** the name, plus the ';' if that was the terminating character. */
int j;
for(j=0; p->value[j]; j++){
z[to++] = p->value[j];
}
from = i;
if( c==';' ){
from++;
}
}else{
/* Unknown escape: emit the '&' unchanged. */
z[to++] = z[from++];
}
}
/* On UNIX systems, look for the non-standard microsoft characters
** between 0x80 and 0x9f and translate them into printable ASCII
** codes. Separate algorithms are required to do this for plain
** ascii and for utf-8.
*/
#ifndef __WIN32__
#ifdef TCL_UTF_MAX
}else if( (z[from]&0x80)!=0 ){
Tcl_UniChar c;
int n;
n = Tcl_UtfToUniChar(&z[from], &c);
if( c>=0x80 && c<0xa0 ){
z[to++] = acMsChar[c & 0x1f];
from += n;
}else{
while( n-- ) z[to++] = z[from++];
}
#else /* if !defined(TCL_UTF_MAX) */
}else if( ((unsigned char)z[from])>=0x80 && ((unsigned char)z[from])<0xa0 ){
z[to++] = acMsChar[z[from++]&0x1f];
#endif /* TCL_UTF_MAX */
#endif /* __WIN32__ */
}else{
z[to++] = z[from++];
TestPoint(0);
}
}
z[to] = 0;
}
/******************* End Escape Sequence Translator ***************/
/******************* Begin HTML tokenizer code *******************/
/*
** The following variable becomes TRUE when the markup hash table
** (built from the entries stored in HtmlMarkupMap[]) is initialized.
** HtmlInsertToken() and HtmlNameToType() test it and call
** HtmlHashInit() the first time through.
*/
static int isInit = 0;
/* The hash table for HTML markup names.
**
** If an HTML markup name hashes to H, then apMap[H] will point to
** a linked list of HtmlTokenMap structures (chained through pCollide),
** one of which will describe the particular markup (if it exists.)
*/
static HtmlTokenMap *apMap[HTML_MARKUP_HASH_SIZE];
/* Hash a markup name
**
** HTML markup is case insensitive, so this function will give the
** same hash regardless of the case of the markup name.
**
** The value returned is an integer between 0 and HTML_MARKUP_HASH_SIZE-1,
** inclusive.
**
** Two defects of the earlier version are avoided here:
**
**   *  The hash was accumulated in a signed int, so "h<<5" could
**      overflow and "h = -h" could be applied to INT_MIN; both are
**      undefined behavior.  An unsigned accumulator wraps instead.
**
**   *  isupper()/tolower() were handed a plain char, which is undefined
**      for negative values.  Each byte is now converted to unsigned
**      char first.
*/
static int HtmlHash(const char *zName){
  unsigned int h = 0;    /* The hash value being accumulated */
  unsigned char c;       /* The next character of the name, folded to lower case */

  while( (c = (unsigned char)*zName)!=0 ){
    c = (unsigned char)tolower(c);
    h = (h<<5) ^ h ^ c;
    zName++;
  }
  return (int)(h % HTML_MARKUP_HASH_SIZE);
}
#ifdef TEST
/*
** Compute the longest and average collision chain length for the
** markup hash table
*/
static void HtmlHashStats(void){
int i;
int sum = 0;
int max = 0;
int cnt;
int notempty = 0;
struct sgMap *p;
for(i=0; ipCollide;
}
sum += cnt;
if( cnt>max ) max = cnt;
}
printf("longest chain=%d avg=%g slots=%d empty=%d (%g%%)\n",
max, (double)sum/(double)notempty, i, i-notempty,
100.0*(i-notempty)/(double)i);
}
#endif
/* Initialize the markup name hash table
**
** NOTE(review): this region is damaged.  The "for" statement below is
** cut off right after "i" and runs straight into the tail of a different
** routine: one that links a new element "pElem" onto the end of the
** widget's doubly linked token list (p->pFirst / p->pLast) and bumps
** p->nToken.  That is presumably the AppendElement() helper called
** elsewhere in this file.  The body of HtmlHashInit() and the header of
** the list-append routine must be restored from a clean copy of the file.
*/
static void HtmlHashInit(void){
int i; /* For looping thru the list of markup names */
int h; /* The hash on a markup name */
for(i=0; ibase.pNext = 0;
pElem->base.pPrev = p->pLast;
if( p->pFirst==0 ){
p->pFirst = pElem;
}else{
p->pLast->base.pNext = pElem;
}
p->pLast = pElem;
p->nToken++;
}
/*
** Return the column index that follows column iCol after the
** character c has been emitted there:
**
**    newline   ->  back to column 0
**    tab       ->  advance to the next multiple of 8
**    other     ->  advance by one
*/
static int NextColumn(int iCol, char c){
  if( c=='\n' ){
    return 0;
  }
  if( c=='\t' ){
    return (iCol | 7) + 1;
  }
  return iCol + 1;
}
/*
** Convert a string to all lower-case letters.  The string is modified
** in place.
**
** Each byte is converted to unsigned char before being handed to
** tolower().  Passing a plain char that happens to be negative (any
** byte above 0x7f where char is signed) to a <ctype.h> function is
** undefined behavior.
*/
static void ToLower(char *z){
  for(; *z; z++){
    *z = (char)tolower((unsigned char)*z);
  }
}
/* Process as much of the input HTML as possible. Construct new
** HtmlElement structures and append them to the list. Return
** the number of characters actually processed.
**
** This routine may invoke a callback procedure which could delete
** the HTML widget.
**
** This routine is not reentrant for the same HTML widget. To
** prevent reentrancy (during a callback), the p->iCol field is
** set to a negative number. This is a flag to future invocations
** not to reenter this routine. The p->iCol field is restored
** before exiting through the "incomplete" label; the early
** "return 0" taken when HtmlUnlock() reports non-zero does not
** touch it.
**
** NOTE(review): the text of this function is damaged in several
** places.  Everything that originally sat between a '<' and the
** next '>' appears to have been stripped.  Affected spots include:
**
**   *  the end-tag string literals (zEnd and the strnicmp() patterns
**      for the plaintext modes), which are now empty strings even
**      though their lengths (9, 8, 10, 6, 11) are still in the code;
**   *  several "for(j=0; j..." loop headers, which are cut off and
**      fused with later statements;
**   *  the start of the markup branch (the "doMarkup" label targeted
**      by the gotos is no longer present) and comment handling.
**
** The code below is reproduced as found.  It must be repaired from a
** clean copy of the file before it can be compiled.
*/
static int Tokenize(
HtmlWidget *p /* The HTML widget doing the parsing */
){
char *z; /* The input HTML text */
int c; /* The next character of input */
int n; /* Number of characters processed so far */
int iCol; /* Column of input */
int i, j; /* Loop counters */
int h; /* Result from HtmlHash() */
int nByte; /* Space allocated for a single HtmlElement */
HtmlElement *pElem; /* A new HTML element */
int selfClose; /* True when the markup ends with "/>" (set where the
** character before the closing '>' is found to be '/') */
int argc; /* The number of arguments on a markup */
HtmlTokenMap *pMap; /* For searching the markup name hash table */
char *zBuf; /* For handing out buffer space */
# define mxARG 200 /* Maximum number of parameters in a single markup */
char *argv[mxARG]; /* Pointers to each markup argument. */
int arglen[mxARG]; /* Length of each markup argument */
iCol = p->iCol;
n = p->nComplete;
z = p->zText;
if( iCol<0 ){ TestPoint(0); return n; } /* Prevents recursion */
p->iCol = -1;
while( (c=z[n])!=0 ){
if( p->pScript ){
/* We are in the middle of <SCRIPT>...</SCRIPT>. Just look for
** the </SCRIPT> markup. (later:) Treat <STYLE>...</STYLE> the
** same way. */
HtmlScript *pScript = p->pScript;
char *zEnd;
int nEnd;
/* NOTE(review): both zEnd literals below have lost their text; the
** lengths 9 and 8 match "</script>" and "</style>" respectively. */
if( pScript->markup.base.type==Html_SCRIPT ){
zEnd = "";
nEnd = 9;
}else{
zEnd = "";
nEnd = 8;
}
if( pScript->zScript==0 ){
pScript->zScript = &z[n];
pScript->nScript = 0;
}
/* Resume the search where the previous call left off
** (pScript->nScript bytes have already been scanned). */
for(i=n+pScript->nScript; z[i]; i++){
if( z[i]=='<' && z[i+1]=='/' && strnicmp(&z[i],zEnd,nEnd)==0 ){
pScript->nScript = i - n;
p->pScript = 0;
n = i+nEnd;
break;
}
}
if( p->pScript ){
pScript->nScript = i - n;
}
continue;
}else if( isspace(c) ){
/* White space */
for(i=0; (c=z[n+i])!=0 && isspace(c) && c!='\n' && c!='\r'; i++){}
if( c=='\r' && z[n+i+1]=='\n' ){ i++; }
pElem = HtmlAlloc( sizeof(HtmlSpaceElement) );
if( pElem==0 ){ goto incomplete; }
pElem->base.type = Html_Space;
if( c=='\n' || c=='\r' ){
pElem->base.flags = HTML_NewLine;
pElem->base.count = 1;
i++;
iCol = 0;
TestPoint(0);
}else{
int iColStart = iCol;
pElem->base.flags = 0;
/* NOTE(review): source text is truncated on the next line. */
for(j=0; jbase.count = iCol - iColStart;
}
AppendElement(p,pElem);
n += i;
}else if( c!='<' || p->iPlaintext!=0 ||
(!isalpha(z[n+1]) && z[n+1]!='/' && z[n+1]!='!' && z[n+1]!='?') ){
/* Ordinary text */
for(i=1; (c=z[n+i])!=0 && !isspace(c) && c!='<'; i++){}
if( c==0 ){ TestPoint(0); goto incomplete; }
if( p->iPlaintext!=0 && z[n]=='<' ){
/* In a plaintext mode only the matching end tag is treated as
** markup.  NOTE(review): the end-tag literals below are empty. */
switch( p->iPlaintext ){
case Html_LISTING:
if( i>=10 && strnicmp(&z[n],"",10)==0 ){
p->iPlaintext = 0;
goto doMarkup;
}
break;
case Html_XMP:
if( i>=6 && strnicmp(&z[n],"",6)==0 ){
p->iPlaintext = 0;
goto doMarkup;
}
break;
case Html_TEXTAREA:
if( i>=11 && strnicmp(&z[n],"",11)==0 ){
p->iPlaintext = 0;
goto doMarkup;
}
break;
default:
break;
}
}
nByte = sizeof(HtmlTextElement) + i;
pElem = HtmlAlloc( nByte );
if( pElem==0 ){ goto incomplete; }
memset(pElem,0,nByte);
pElem->base.type = Html_Text;
sprintf(pElem->text.zText,"%.*s",i,&z[n]);
AppendElement(p,pElem);
if( p->iPlaintext==0 || p->iPlaintext==Html_TEXTAREA ){
HtmlTranslateEscapes(pElem->text.zText);
}
pElem->base.count = strlen(pElem->text.zText);
n += i;
iCol += i;
/* NOTE(review): source text is truncated on the following lines. */
}else if( strncmp(&z[n],"",3)==0 ){ break; }
}
if( z[n+i]==0 ){ TestPoint(0); goto incomplete; }
for(j=0; j' && (i<2 || c!='/'); i++){}
arglen[0] = i - 1;
if( c==0 ){ goto incomplete; }
/*
** Now parse up the arguments
*/
while( isspace(z[n+i]) ){ i++; }
while( (c=z[n+i])!=0 && c!='>' && (c!='/' || z[n+i+1]!='>') ){
/* Once the table is nearly full, further arguments overwrite the
** last name/value slots instead of running past the arrays. */
if( argc>mxARG-3 ){
argc = mxARG-3;
}
argv[argc] = &z[n+i];
j = 0;
while( (c=z[n+i+j])!=0 && !isspace(c) && c!='>'
&& c!='=' && (c!='/' || z[n+i+j+1]!='>') ){
j++;
}
arglen[argc] = j;
if( c==0 ){ goto incomplete; }
i += j;
while( isspace(c) ){
i++;
c = z[n+i];
}
if( c==0 ){ goto incomplete; }
argc++;
if( c!='=' ){
/* Argument name with no "=value": record an empty value. */
argv[argc] = "";
arglen[argc] = 0;
argc++;
continue;
}
i++;
c = z[n+i];
while( isspace(c) ){
i++;
c = z[n+i];
}
if( c==0 ){ goto incomplete; }
if( c=='\'' || c=='"' ){
/* Quoted value: runs up to the matching quote character. */
int cQuote = c;
i++;
argv[argc] = &z[n+i];
for(j=0; (c=z[n+i+j])!=0 && c!=cQuote; j++){}
if( c==0 ){ goto incomplete; }
arglen[argc] = j;
i += j+1;
TestPoint(0);
}else{
/* Unquoted value: runs up to white space or '>'. */
argv[argc] = &z[n+i];
for(j=0; (c=z[n+i+j])!=0 && !isspace(c) && c!='>'; j++){}
if( c==0 ){ goto incomplete; }
arglen[argc] = j;
i += j;
}
argc++;
while( isspace(z[n+i]) ){ i++; }
}
if( c=='/' ){
i++;
c = z[n+i];
selfClose = 1;
}else{
selfClose = 0;
}
if( c==0 ){ goto incomplete; }
/* NOTE(review): source text is truncated on the next line. */
for(j=0; jpCollide){
if( stricmp(pMap->zName,argv[0])==0 ){ break; }
TestPoint(0);
}
argv[0][arglen[0]] = c;
if( pMap==0 ){ continue; } /* Ignore unknown markup */
makeMarkupEntry:
/* Construct a HtmlMarkup entry for this markup.
*/
if( pMap->extra ){
nByte = pMap->extra;
}else if( argc==1 ){
nByte = sizeof(HtmlBaseElement);
}else{
nByte = sizeof(HtmlMarkupElement);
}
if( argc>1 ){
nByte += sizeof(char*) * argc;
/* NOTE(review): source text is truncated on the next line. */
for(j=1; jbase.type = pMap->type;
pElem->base.count = argc - 1;
if( argc>1 ){
/* The argv[] pointer array is placed directly after the fixed part
** of the element, and the argument strings after the array. */
if( pMap->extra ){
pElem->markup.argv = (char**)&((char*)pElem)[pMap->extra];
}else{
pElem->markup.argv = (char**)&((HtmlMarkupElement*)pElem)[1];
}
zBuf = (char*)&pElem->markup.argv[argc];
/* NOTE(review): source text is truncated on the next line. */
for(j=1; jmarkup.argv[j-1] = zBuf;
zBuf += arglen[j] + 1;
sprintf(pElem->markup.argv[j-1],"%.*s",arglen[j],argv[j]);
HtmlTranslateEscapes(pElem->markup.argv[j-1]);
/* Odd j values are argument names; fold them to lower case. */
if( (j&1)==1 ){
ToLower(pElem->markup.argv[j-1]);
}
}
pElem->markup.argv[argc-1] = 0;
}
/* The new markup has now been constructed in pElem. But before
** appending to the list, check to see if there is a special
** handler for this markup type.
*/
if( p->zHandler[pMap->type] ){
Tcl_DString str;
Tcl_DStringInit(&str);
Tcl_DStringAppend(&str, p->zHandler[pMap->type], -1);
Tcl_DStringAppendElement(&str, pMap->zName);
Tcl_DStringStartSublist(&str);
/* NOTE(review): source text is truncated on the next line. */
for(j=0; jmarkup.argv[j]);
}
Tcl_DStringEndSublist(&str);
HtmlFree(pElem);
HtmlLock(p);
Tcl_GlobalEval(p->interp, Tcl_DStringValue(&str));
Tcl_DStringFree(&str);
if( HtmlUnlock(p) ){ return 0; }
/* Tricky, tricky. The callback might have caused the p->zText
** pointer to change, so renew our copy of that pointer. The
** callback might also have cleared or destroyed the widget.
** If so, abort this routine.
*/
z = p->zText;
if( z==0 || p->tkwin==0 ){
n = 0;
iCol = 0;
goto incomplete;
}
continue;
}
/* No special handler for this markup. Just append it to the
** list of all tokens.
*/
AppendElement(p,pElem);
switch( pMap->type ){
case Html_PLAINTEXT:
case Html_LISTING:
case Html_XMP:
case Html_TEXTAREA:
p->iPlaintext = pMap->type;
break;
case Html_STYLE:
case Html_SCRIPT:
p->pScript = (HtmlScript*)pElem;
break;
default:
break;
}
/* If this is self-closing markup (written with a trailing "/>")
** and the next entry of the markup table is the matching end tag
** (its name is "/" followed by this name), then synthesize a
** closing token.
*/
if( selfClose && argv[0][0]!='/'
&& strcmp(&pMap[1].zName[1],pMap->zName)==0 ){
selfClose = 0;
pMap++;
argc = 1;
goto makeMarkupEntry;
}
}
}
incomplete:
p->iCol = iCol;
return n;
}
/************************** End HTML Tokenizer Code ***************************/
/*
** Append text to the tokenizer engine.
**
** zText is copied onto the end of htmlPtr->zText (growing that buffer
** when needed) and Tokenize() is then run over the accumulated text.
** A NULL zText is ignored.
**
** If memory cannot be obtained the widget is left with zText==0 and
** nText==0, as before.  The difference is that a failed HtmlRealloc()
** no longer leaks the old buffer: the result is taken in a temporary
** and the old buffer is released explicitly on failure.
**
** This routine (actually the Tokenize() subroutine that is called
** by this routine) may invoke a callback procedure which could delete
** the HTML widget.
*/
void HtmlTokenizerAppend(HtmlWidget *htmlPtr, const char *zText){
  int len;               /* Number of bytes in zText, not counting the NUL */

  if( zText==0 ){
    return;
  }
  len = strlen(zText);
  if( htmlPtr->nText==0 ){
    htmlPtr->nAlloc = len + 100;
    htmlPtr->zText = HtmlAlloc( htmlPtr->nAlloc );
    TestPoint(0);
  }else if( htmlPtr->nText + len >= htmlPtr->nAlloc ){
    char *zNew;          /* The enlarged buffer, or 0 on failure */
    htmlPtr->nAlloc += len + 100;
    zNew = HtmlRealloc( htmlPtr->zText, htmlPtr->nAlloc );
    if( zNew==0 ){
      /* The old block is still allocated after a failed realloc. */
      HtmlFree( htmlPtr->zText );
    }
    htmlPtr->zText = zNew;
    TestPoint(0);
  }
  if( htmlPtr->zText==0 ){
    htmlPtr->nText = 0;
    UNTESTED;
    return;
  }
  /* The length is already known, so copy it together with the NUL. */
  memcpy(&htmlPtr->zText[htmlPtr->nText], zText, len + 1);
  htmlPtr->nText += len;
  htmlPtr->nComplete = Tokenize(htmlPtr);
}
/*
** This routine takes a text representation of a token, converts
** it into an HtmlElement structure and inserts it immediately
** prior to pToken. If pToken==0, then the newly created HtmlElement
** is appended.
**
** This routine does nothing to resize, restyle, relayout or redisplay
** the HTML. That is the calling routine's responsibility.
**
** Return 0 if successful. Return non-zero if zType is not a known
** markup name, if zArgs is not a well-formed Tcl list, or if memory
** for the new element cannot be obtained.
**
** Layout of an element that carries arguments:
**
**     [ fixed part ][ argv[0] .. argv[argc] ][ argument strings ]
**
** where argv[argc] is a NULL terminator and base.count==argc.
**
** The loop that copies the arguments had been truncated in the source
** text.  What survived ("i=1" and "argv[i-1]") skipped the first list
** element while base.count was still set to argc.  That left a NULL
** pointer inside the range readers walk with "i<base.count", and wrote
** to argv[-1] when the list was empty.  The loop now copies all argc
** elements and places the terminator in the extra pointer slot that
** the allocation has always reserved.
*/
int HtmlInsertToken(
  HtmlWidget *htmlPtr,     /* The widget into which the token is inserted */
  HtmlElement *pToken,     /* Insert before this.  Append if pToken==0 */
  char *zType,             /* Type of markup.  Ex: "/a" or "table" */
  char *zArgs              /* List of arguments */
){
  HtmlTokenMap *pMap;      /* For searching the markup name hash table */
  int h;                   /* The hash on zType */
  HtmlElement *pElem;      /* The new element */
  int nByte;               /* How many bytes to allocate */
  int i;                   /* Loop counter */

  if( !isInit ){
    HtmlHashInit();
    isInit = 1;
    TestPoint(0);
  }else{
    TestPoint(0);
  }
  h = HtmlHash(zType);
  for(pMap = apMap[h]; pMap; pMap=pMap->pCollide){
    if( stricmp(pMap->zName,zType)==0 ){ TestPoint(0); break; }
    TestPoint(0);
  }
  if( pMap==0 ){ TestPoint(0); return 1; }
  if( zArgs==0 || *zArgs==0 ){
    /* Special case of no arguments.  This is a lot easier... */
    nByte = pMap->extra ? pMap->extra : sizeof(HtmlBaseElement);
    nByte += strlen(zType);
    pElem = HtmlAlloc( nByte );
    if( pElem==0 ){ TestPoint(0); return 1; }
    memset(pElem,0,nByte);
    pElem->base.type = pMap->type;
    TestPoint(0);
  }else{
    /* The general case.  There are arguments that need to be parsed
    ** up.  This is slower, but we gotta do it.
    */
    int argc;              /* Number of elements in the zArgs list */
    const char **argv;     /* The elements of zArgs, from Tcl_SplitList() */
    char *zBuf;            /* Where the next argument string is stored */

    if( Tcl_SplitList(htmlPtr->interp, zArgs, &argc, (const char***)&argv)!=TCL_OK ){
      TestPoint(0);
      return 1;
    }
    if( pMap->extra ){
      nByte = pMap->extra;
      TestPoint(0);
    }else{
      nByte = sizeof(HtmlMarkupElement);
      TestPoint(0);
    }
    /* argc+1 pointers (the arguments plus a NULL terminator), followed
    ** by the strings.  The split elements together with one NUL each
    ** need at most strlen(zArgs)+argc bytes. */
    nByte += sizeof(char*)*(argc+1) + strlen(zArgs) + argc + 2;
    pElem = HtmlAlloc( nByte );
    if( pElem==0 ){
      HtmlFree(argv);
      TestPoint(0);
      return 1;
    }
    memset(pElem,0,nByte);
    pElem->base.type = pMap->type;
    pElem->base.count = argc;
    if( pMap->extra ){
      pElem->markup.argv = (char**)&((char*)pElem)[pMap->extra];
      TestPoint(0);
    }else{
      pElem->markup.argv = (char**)&((HtmlMarkupElement*)pElem)[1];
      TestPoint(0);
    }
    /* Strings start after the pointer array, including its terminator. */
    zBuf = (char*)&pElem->markup.argv[argc+1];
    for(i=0; i<argc; i++){
      pElem->markup.argv[i] = zBuf;
      zBuf += strlen(argv[i]) + 1;
      strcpy(pElem->markup.argv[i],argv[i]);
      TestPoint(0);
    }
    pElem->markup.argv[argc] = 0;
    HtmlFree(argv);
    TestPoint(0);
  }
  if( pToken ){
    /* Splice the new element into the list just ahead of pToken. */
    pElem->base.pNext = pToken;
    pElem->base.pPrev = pToken->base.pPrev;
    if( pToken->base.pPrev ){
      pToken->base.pPrev->pNext = pElem;
      TestPoint(0);
    }else{
      htmlPtr->pFirst = pElem;
      TestPoint(0);
    }
    pToken->base.pPrev = pElem;
    htmlPtr->nToken++;
  }else{
    AppendElement(htmlPtr,pElem);
    TestPoint(0);
  }
  return 0;
}
/*
** Convert a markup name into a type integer
*/
int HtmlNameToType(const char *zType){
HtmlTokenMap *pMap; /* For searching the markup name hash table */
int h; /* The hash on zType */
if( !isInit ){
HtmlHashInit();
isInit = 1;
TestPoint(0);
}else{
TestPoint(0);
}
h = HtmlHash(zType);
for(pMap = apMap[h]; pMap; pMap=pMap->pCollide){
if( stricmp(pMap->zName,zType)==0 ){ TestPoint(0); break; }
TestPoint(0);
}
return pMap ? pMap->type : Html_Unknown;
}
/*
** Convert a type into a symbolic name
*/
const char *HtmlTypeToName(int type){
if( type>=Html_A && type<=Html_EndXMP ){
HtmlTokenMap *pMap = apMap[type - Html_A];
TestPoint(0);
return pMap->zName;
}else{
TestPoint(0);
return "???";
}
}
/*
** For debugging purposes, return a printable description of a token.
**
** When DEBUG is defined the description is built in a static buffer,
** so the result is overwritten by the next call.  Without DEBUG the
** function returns 0.
**
** Repairs made here:
**
**   *  The argument loop header had been truncated in the source text
**      and is restored ("j<p->base.count").
**   *  The two Html_Block format strings had lost their text, leaving
**      sprintf() with arguments but no conversions.  They are restored
**      to a "<Block ...>" form.
**   *  Nothing limited how much was written into the 200-byte buffer.
**      Text is now clipped to 150 bytes (the limit the Block case
**      already used), the markup name is clipped too, and arguments
**      are appended only while they still fit.
*/
char *HtmlTokenName(HtmlElement *p){
#ifdef DEBUG
  static char zBuf[200];
  int j;
  char *zName;

  if( p==0 ) return "NULL";
  switch( p->base.type ){
    case Html_Text: {
      int n = p->base.count;
      if( n>150 ) n = 150;
      if( n<0 ) n = 0;      /* a negative precision would mean "no limit" */
      sprintf(zBuf,"\"%.*s\"",n,p->text.zText);
      break;
    }
    case Html_Space:
      if( p->base.flags & HTML_NewLine ){
        sprintf(zBuf,"\"\\n\"");
      }else{
        sprintf(zBuf,"\" \"");
      }
      break;
    case Html_Block:
      if( p->block.n>0 ){
        int n = p->block.n;
        if( n>150 ) n = 150;
        sprintf(zBuf,"<Block z=\"%.*s\">", n, p->block.z);
      }else{
        sprintf(zBuf,"<Block>");
      }
      break;
    default:
      if( p->base.type >= HtmlMarkupMap[0].type
       && p->base.type <= HtmlMarkupMap[HTML_MARKUP_COUNT-1].type ){
        zName = HtmlMarkupMap[p->base.type - HtmlMarkupMap[0].type].zName;
      }else{
        zName = "Unknown";
      }
      sprintf(zBuf,"<%.100s",zName);
      for(j=1; j<p->base.count; j += 2){
        int used = strlen(zBuf);
        /* Space for " name=value", plus the closing ">" and the NUL. */
        int need = strlen(p->markup.argv[j-1]) + strlen(p->markup.argv[j]) + 4;
        if( used + need > (int)sizeof(zBuf) ) break;
        sprintf(&zBuf[used]," %s=%s",
                p->markup.argv[j-1],p->markup.argv[j]);
      }
      strcat(zBuf,">");
      break;
  }
  return zBuf;
#else
  return 0;
#endif
}
/*
** Return all tokens between the two elements as a Tcl list.
**
** The list starts at p and stops just before pEnd (or at the end of
** the token list when pEnd is never reached).  The result is left in
** the interpreter result of "interp".  Each token contributes one
** list element:
**
**     Html_Block     nothing
**     Html_Text      {Text TEXT}
**     Html_Space     {Space COUNT IS-NEWLINE}
**     Html_Unknown   Unknown
**     anything else  {Markup NAME ARG ...}
**
** The header of the loop that appends the markup arguments had been
** truncated in the source text; it is restored here
** ("i<p->base.count").
*/
void HtmlTclizeList(Tcl_Interp *interp, HtmlElement *p, HtmlElement *pEnd){
  Tcl_DString str;         /* The list under construction */
  int i;                   /* Loop counter over markup arguments */
  char *zName;             /* Name of the current markup */
  char zLine[100];         /* Space for formatting a "Space" element */

  Tcl_DStringInit(&str);
  while( p && p!=pEnd ){
    switch( p->base.type ){
      case Html_Block:
        break;
      case Html_Text:
        Tcl_DStringStartSublist(&str);
        Tcl_DStringAppendElement(&str,"Text");
        Tcl_DStringAppendElement(&str, p->text.zText);
        Tcl_DStringEndSublist(&str);
        break;
      case Html_Space:
        sprintf(zLine,"Space %d %d",
           p->base.count, (p->base.flags & HTML_NewLine)!=0);
        Tcl_DStringAppendElement(&str,zLine);
        break;
      case Html_Unknown:
        Tcl_DStringAppendElement(&str,"Unknown");
        break;
      default:
        Tcl_DStringStartSublist(&str);
        Tcl_DStringAppendElement(&str,"Markup");
        if( p->base.type >= HtmlMarkupMap[0].type
         && p->base.type <= HtmlMarkupMap[HTML_MARKUP_COUNT-1].type ){
          zName = HtmlMarkupMap[p->base.type - HtmlMarkupMap[0].type].zName;
        }else{
          zName = "Unknown";
        }
        Tcl_DStringAppendElement(&str, zName);
        for(i=0; i<p->base.count; i++){
          Tcl_DStringAppendElement(&str, p->markup.argv[i]);
        }
        Tcl_DStringEndSublist(&str);
        break;
    }
    p = p->pNext;
  }
  Tcl_DStringResult(interp, &str);
}
/*
** Print a list of tokens
*/
#ifdef DEBUG
/* Print one line on standard output for each token from p up to, but
** not including, pEnd.  Block elements show their flags, count,
** bounding box and text; all other tokens show their style fields and
** the description produced by HtmlTokenName().
**
** Element addresses are printed with %p.  The earlier code cast the
** pointer to int and used %x, which discards the upper half of the
** address wherever pointers are wider than int.
*/
void HtmlPrintList(HtmlElement *p, HtmlElement *pEnd){
  while( p && p!=pEnd ){
    if( p->base.type==Html_Block ){
      char *z = p->block.z;
      int n = p->block.n;
      if( n==0 || z==0 ){
        /* Nothing to show: print an empty string. */
        n = 1;
        z = "";
      }
      printf("Block %p flags=%02x cnt=%d x=%d..%d y=%d..%d z=\"%.*s\"\n",
        (void*)p, p->base.flags, p->base.count, p->block.left, p->block.right,
        p->block.top, p->block.bottom, n, z);
    }else{
      printf("Token %p font=%2d color=%2d align=%d flags=0x%04x name=%s\n",
        (void*)p, p->base.style.font, p->base.style.color,
        p->base.style.align, p->base.style.flags, HtmlTokenName(p));
    }
    p = p->pNext;
  }
}
#endif