diff options
Diffstat (limited to 'tkhtml1/src/htmlparse.c')
-rw-r--r-- | tkhtml1/src/htmlparse.c | 1181 |
1 files changed, 0 insertions, 1181 deletions
diff --git a/tkhtml1/src/htmlparse.c b/tkhtml1/src/htmlparse.c deleted file mode 100644 index 4511005..0000000 --- a/tkhtml1/src/htmlparse.c +++ /dev/null @@ -1,1181 +0,0 @@ -/* -** A tokenizer that converts raw HTML into a linked list of HTML elements. -** -** Copyright (C) 1997-2000 D. Richard Hipp -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Library General Public -** License as published by the Free Software Foundation; either -** version 2 of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Library General Public License for more details. -** -** You should have received a copy of the GNU Library General Public -** License along with this library; if not, write to the -** Free Software Foundation, Inc., 59 Temple Place - Suite 330, -** Boston, MA 02111-1307, USA. -** -** Author contact information: -** drh@acm.org -** http://www.hwaci.com/drh/ -*/ -#include <string.h> -#include <stdlib.h> -#include <stdio.h> -#include <ctype.h> -#include <tk.h> -#include "htmlparse.h" - -/****************** Begin Escape Sequence Translator *************/ -/* -** The next section of code implements routines used to translate -** the '&' escape sequences of SGML to individual characters. -** Examples: -** -** & & -** < < -** > > -** nonbreakable space -*/ - -/* Each escape sequence is recorded as an instance of the following -** structure -*/ -struct sgEsc { - char *zName; /* The name of this escape sequence. ex: "amp" */ - char value[8]; /* The value for this sequence. ex: "&" */ - struct sgEsc *pNext; /* Next sequence with the same hash on zName */ -}; - -/* The following is a table of all escape sequences. Add new sequences -** by adding entries to this table. -*/ -static struct sgEsc esc_sequences[] = { - { "quot", "\"", 0 }, - { "amp", "&", 0 }, - { "lt", "<", 0 }, - { "gt", ">", 0 }, - { "nbsp", " ", 0 }, - { "iexcl", "\241", 0 }, - { "cent", "\242", 0 }, - { "pound", "\243", 0 }, - { "curren", "\244", 0 }, - { "yen", "\245", 0 }, - { "brvbar", "\246", 0 }, - { "sect", "\247", 0 }, - { "uml", "\250", 0 }, - { "copy", "\251", 0 }, - { "ordf", "\252", 0 }, - { "laquo", "\253", 0 }, - { "not", "\254", 0 }, - { "shy", "\255", 0 }, - { "reg", "\256", 0 }, - { "macr", "\257", 0 }, - { "deg", "\260", 0 }, - { "plusmn", "\261", 0 }, - { "sup2", "\262", 0 }, - { "sup3", "\263", 0 }, - { "acute", "\264", 0 }, - { "micro", "\265", 0 }, - { "para", "\266", 0 }, - { "middot", "\267", 0 }, - { "cedil", "\270", 0 }, - { "sup1", "\271", 0 }, - { "ordm", "\272", 0 }, - { "raquo", "\273", 0 }, - { "frac14", "\274", 0 }, - { "frac12", "\275", 0 }, - { "frac34", "\276", 0 }, - { "iquest", "\277", 0 }, - { "Agrave", "\300", 0 }, - { "Aacute", "\301", 0 }, - { "Acirc", "\302", 0 }, - { "Atilde", "\303", 0 }, - { "Auml", "\304", 0 }, - { "Aring", "\305", 0 }, - { "AElig", "\306", 0 }, - { "Ccedil", "\307", 0 }, - { "Egrave", "\310", 0 }, - { "Eacute", "\311", 0 }, - { "Ecirc", "\312", 0 }, - { "Euml", "\313", 0 }, - { "Igrave", "\314", 0 }, - { "Iacute", "\315", 0 }, - { "Icirc", "\316", 0 }, - { "Iuml", "\317", 0 }, - { "ETH", "\320", 0 }, - { "Ntilde", "\321", 0 }, - { "Ograve", "\322", 0 }, - { "Oacute", "\323", 0 }, - { "Ocirc", "\324", 0 }, - { "Otilde", "\325", 0 }, - { "Ouml", "\326", 0 }, - { "times", "\327", 0 }, - { "Oslash", "\330", 0 }, - { "Ugrave", "\331", 0 }, - { "Uacute", "\332", 0 }, - { "Ucirc", "\333", 0 }, - { "Uuml", "\334", 0 }, - { "Yacute", "\335", 0 }, - { "THORN", "\336", 0 }, - { "szlig", "\337", 0 }, - { "agrave", "\340", 0 }, - { "aacute", "\341", 0 }, - { "acirc", "\342", 0 }, - { "atilde", "\343", 0 }, - { "auml", "\344", 0 }, - { "aring", "\345", 0 }, - { "aelig", "\346", 0 }, - { "ccedil", "\347", 0 }, - { "egrave", "\350", 0 }, - { "eacute", "\351", 0 }, - { "ecirc", "\352", 0 }, - { "euml", "\353", 0 }, - { "igrave", "\354", 0 }, - { "iacute", "\355", 0 }, - { "icirc", "\356", 0 }, - { "iuml", "\357", 0 }, - { "eth", "\360", 0 }, - { "ntilde", "\361", 0 }, - { "ograve", "\362", 0 }, - { "oacute", "\363", 0 }, - { "ocirc", "\364", 0 }, - { "otilde", "\365", 0 }, - { "ouml", "\366", 0 }, - { "divide", "\367", 0 }, - { "oslash", "\370", 0 }, - { "ugrave", "\371", 0 }, - { "uacute", "\372", 0 }, - { "ucirc", "\373", 0 }, - { "uuml", "\374", 0 }, - { "yacute", "\375", 0 }, - { "thorn", "\376", 0 }, - { "yuml", "\377", 0 }, -}; - -/* The size of the handler hash table. For best results this should -** be a prime number which is about the same size as the number of -** escape sequences known to the system. */ -#define ESC_HASH_SIZE (sizeof(esc_sequences)/sizeof(esc_sequences[0])+7) - -/* The hash table -** -** If the name of an escape sequences hashes to the value H, then -** apEscHash[H] will point to a linked list of Esc structures, one of -** which will be the Esc structure for that escape sequence. -*/ -static struct sgEsc *apEscHash[ESC_HASH_SIZE]; - -/* Hash a escape sequence name. The value returned is an integer -** between 0 and ESC_HASH_SIZE-1, inclusive. -*/ -static int EscHash(const char *zName){ - int h = 0; /* The hash value to be returned */ - char c; /* The next character in the name being hashed */ - - while( (c=*zName)!=0 ){ - h = h<<5 ^ h ^ c; - zName++; - TestPoint(0); - } - if( h<0 ){ - h = -h; - TestPoint(0); - }else{ - TestPoint(0); - } - return h % ESC_HASH_SIZE; -} - -#ifdef TEST -/* -** Compute the longest and average collision chain length for the -** escape sequence hash table -*/ -static void EscHashStats(void){ - int i; - int sum = 0; - int max = 0; - int cnt; - int notempty = 0; - struct sgEsc *p; - - for(i=0; i<sizeof(esc_sequences)/sizeof(esc_sequences[0]); i++){ - cnt = 0; - p = apEscHash[i]; - if( p ) notempty++; - while( p ){ - cnt++; - p = p->pNext; - } - sum += cnt; - if( cnt>max ) max = cnt; - } - printf("Longest chain=%d avg=%g slots=%d empty=%d (%g%%)\n", - max,(double)sum/(double)notempty, i, i-notempty, - 100.0*(i-notempty)/(double)i); -} -#endif - -/* Initialize the escape sequence hash table -*/ -static void EscInit(void){ - int i; /* For looping thru the list of escape sequences */ - int h; /* The hash on a sequence */ - - for(i=0; i<sizeof(esc_sequences)/sizeof(esc_sequences[i]); i++){ -/* #ifdef TCL_UTF_MAX */ -#if 0 - { - int c = esc_sequences[i].value[0]; - Tcl_UniCharToUtf(c, esc_sequences[i].value); - } -#endif - h = EscHash(esc_sequences[i].zName); - esc_sequences[i].pNext = apEscHash[h]; - apEscHash[h] = &esc_sequences[i]; - TestPoint(0); - } -#ifdef TEST - EscHashStats(); -#endif -} - -/* -** This table translates the non-standard microsoft characters between -** 0x80 and 0x9f into plain ASCII so that the characters will be visible -** on Unix systems. Care is taken to translate the characters -** into values less than 0x80, to avoid UTF-8 problems. -*/ -#ifndef __WIN32__ -static char acMsChar[] = { - /* 0x80 */ 'C', - /* 0x81 */ ' ', - /* 0x82 */ ',', - /* 0x83 */ 'f', - /* 0x84 */ '"', - /* 0x85 */ '.', - /* 0x86 */ '*', - /* 0x87 */ '*', - /* 0x88 */ '^', - /* 0x89 */ '%', - /* 0x8a */ 'S', - /* 0x8b */ '<', - /* 0x8c */ 'O', - /* 0x8d */ ' ', - /* 0x8e */ 'Z', - /* 0x8f */ ' ', - /* 0x90 */ ' ', - /* 0x91 */ '\'', - /* 0x92 */ '\'', - /* 0x93 */ '"', - /* 0x94 */ '"', - /* 0x95 */ '*', - /* 0x96 */ '-', - /* 0x97 */ '-', - /* 0x98 */ '~', - /* 0x99 */ '@', - /* 0x9a */ 's', - /* 0x9b */ '>', - /* 0x9c */ 'o', - /* 0x9d */ ' ', - /* 0x9e */ 'z', - /* 0x9f */ 'Y', -}; -#endif - -/* Translate escape sequences in the string "z". "z" is overwritten -** with the translated sequence. -** -** Unrecognized escape sequences are unaltered. -** -** Example: -** -** input = "AT&T > MCI" -** output = "AT&T > MCI" -*/ -LOCAL void HtmlTranslateEscapes(char *z){ - int from; /* Read characters from this position in z[] */ - int to; /* Write characters into this position in z[] */ - int h; /* A hash on the escape sequence */ - struct sgEsc *p; /* For looping down the escape sequence collision chain */ - static int isInit = 0; /* True after initialization */ - - from = to = 0; - if( !isInit ){ - EscInit(); - isInit = 1; - } - while( z[from] ){ - if( z[from]=='&' ){ - if( z[from+1]=='#' ){ - int i = from + 2; - int v = 0; - while( isdigit(z[i]) ){ - v = v*10 + z[i] - '0'; - i++; - } - if( z[i]==';' ){ i++; } - - /* On Unix systems, translate the non-standard microsoft - ** characters in the range of 0x80 to 0x9f into something - ** we can see. - */ -#ifndef __WIN32__ - if( v>=0x80 && v<0xa0 ){ - v = acMsChar[v&0x1f]; - } -#endif - /* Put the character in the output stream in place of - ** the "�". How we do this depends on whether or - ** not we are using UTF-8. - */ -#ifdef TCL_UTF_MAX - { - int j, n; - char value[8]; - n = Tcl_UniCharToUtf(v,value); - for(j=0; j<n; j++){ - z[to++] = value[j]; - } - } -#else - z[to++] = v; -#endif - from = i; - }else{ - int i = from+1; - int c; - while( z[i] && isalnum(z[i]) ){ TestPoint(0); i++; } - c = z[i]; - z[i] = 0; - h = EscHash(&z[from+1]); - p = apEscHash[h]; - while( p && strcmp(p->zName,&z[from+1])!=0 ){ - p = p->pNext; - } - z[i] = c; - if( p ){ - int j; - for(j=0; p->value[j]; j++){ - z[to++] = p->value[j]; - } - from = i; - if( c==';' ){ - from++; - } - }else{ - z[to++] = z[from++]; - } - } - - /* On UNIX systems, look for the non-standard microsoft characters - ** between 0x80 and 0x9f and translate them into printable ASCII - ** codes. Separate algorithms are required to do this for plain - ** ascii and for utf-8. - */ -#ifndef __WIN32__ -#ifdef TCL_UTF_MAX - }else if( (z[from]&0x80)!=0 ){ - Tcl_UniChar c; - int n; - n = Tcl_UtfToUniChar(&z[from], &c); - if( c>=0x80 && c<0xa0 ){ - z[to++] = acMsChar[c & 0x1f]; - from += n; - }else{ - while( n-- ) z[to++] = z[from++]; - } -#else /* if !defined(TCL_UTF_MAX) */ - }else if( ((unsigned char)z[from])>=0x80 && ((unsigned char)z[from])<0xa0 ){ - z[to++] = acMsChar[z[from++]&0x1f]; -#endif /* TCL_UTF_MAX */ -#endif /* __WIN32__ */ - }else{ - z[to++] = z[from++]; - TestPoint(0); - } - } - z[to] = 0; -} -/******************* End Escape Sequence Translator ***************/ - -/******************* Begin HTML tokenizer code *******************/ -/* -** The following variable becomes TRUE when the markup hash table -** (stored in HtmlMarkupMap[]) is initialized. -*/ -static int isInit = 0; - -/* The hash table for HTML markup names. -** -** If an HTML markup name hashes to H, then apMap[H] will point to -** a linked list of sgMap structure, one of which will describe the -** the particular markup (if it exists.) -*/ -static HtmlTokenMap *apMap[HTML_MARKUP_HASH_SIZE]; - -/* Hash a markup name -** -** HTML markup is case insensitive, so this function will give the -** same hash regardless of the case of the markup name. -** -** The value returned is an integer between 0 and HTML_MARKUP_HASH_SIZE-1, -** inclusive. -*/ -static int HtmlHash(const char *zName){ - int h = 0; - char c; - while( (c=*zName)!=0 ){ - if( isupper(c) ){ - c = tolower(c); - } - h = h<<5 ^ h ^ c; - zName++; - } - if( h<0 ){ - h = -h; - } - return h % HTML_MARKUP_HASH_SIZE; -} - -#ifdef TEST -/* -** Compute the longest and average collision chain length for the -** markup hash table -*/ -static void HtmlHashStats(void){ - int i; - int sum = 0; - int max = 0; - int cnt; - int notempty = 0; - struct sgMap *p; - - for(i=0; i<HTML_MARKUP_COUNT; i++){ - cnt = 0; - p = apMap[i]; - if( p ) notempty++; - while( p ){ - cnt++; - p = p->pCollide; - } - sum += cnt; - if( cnt>max ) max = cnt; - - } - printf("longest chain=%d avg=%g slots=%d empty=%d (%g%%)\n", - max, (double)sum/(double)notempty, i, i-notempty, - 100.0*(i-notempty)/(double)i); -} -#endif - -/* Initialize the escape sequence hash table -*/ -static void HtmlHashInit(void){ - int i; /* For looping thru the list of markup names */ - int h; /* The hash on a markup name */ - - for(i=0; i<HTML_MARKUP_COUNT; i++){ - h = HtmlHash(HtmlMarkupMap[i].zName); - HtmlMarkupMap[i].pCollide = apMap[h]; - apMap[h] = &HtmlMarkupMap[i]; - } -#ifdef TEST - HtmlHashStats(); -#endif -} - -/* -** Append the given HtmlElement to the tokenizers list of elements -*/ -static void AppendElement(HtmlWidget *p, HtmlElement *pElem){ - pElem->base.pNext = 0; - pElem->base.pPrev = p->pLast; - if( p->pFirst==0 ){ - p->pFirst = pElem; - }else{ - p->pLast->base.pNext = pElem; - } - p->pLast = pElem; - p->nToken++; -} - -/* -** Compute the new column index following the given character. -*/ -static int NextColumn(int iCol, char c){ - switch( c ){ - case '\n': return 0; - case '\t': return (iCol | 7) + 1; - default: return iCol+1; - } - /* NOT REACHED */ -} - -/* -** Convert a string to all lower-case letters. -*/ -static void ToLower(char *z){ - while( *z ){ - if( isupper(*z) ) *z = tolower(*z); - z++; - } -} - -/* Process as much of the input HTML as possible. Construct new -** HtmlElement structures and appended them to the list. Return -** the number of characters actually processed. -** -** This routine may invoke a callback procedure which could delete -** the HTML widget. -** -** This routine is not reentrant for the same HTML widget. To -** prevent reentrancy (during a callback), the p->iCol field is -** set to a negative number. This is a flag to future invocations -** not to reentry this routine. The p->iCol field is restored -** before exiting, of course. -*/ -static int Tokenize( - HtmlWidget *p /* The HTML widget doing the parsing */ -){ - char *z; /* The input HTML text */ - int c; /* The next character of input */ - int n; /* Number of characters processed so far */ - int iCol; /* Column of input */ - int i, j; /* Loop counters */ - int h; /* Result from HtmlHash() */ - int nByte; /* Space allocated for a single HtmlElement */ - HtmlElement *pElem; /* A new HTML element */ - int selfClose; /* True for content free elements. Ex: <br/> */ - int argc; /* The number of arguments on a markup */ - HtmlTokenMap *pMap; /* For searching the markup name hash table */ - char *zBuf; /* For handing out buffer space */ -# define mxARG 200 /* Maximum number of parameters in a single markup */ - char *argv[mxARG]; /* Pointers to each markup argument. */ - int arglen[mxARG]; /* Length of each markup argument */ - - iCol = p->iCol; - n = p->nComplete; - z = p->zText; - if( iCol<0 ){ TestPoint(0); return n; } /* Prevents recursion */ - p->iCol = -1; - while( (c=z[n])!=0 ){ - if( p->pScript ){ - /* We are in the middle of <SCRIPT>...</SCRIPT>. Just look for - ** the </SCRIPT> markup. (later:) Treat <STYLE>...</STYLE> the - ** same way. */ - HtmlScript *pScript = p->pScript; - char *zEnd; - int nEnd; - if( pScript->markup.base.type==Html_SCRIPT ){ - zEnd = "</script>"; - nEnd = 9; - }else{ - zEnd = "</style>"; - nEnd = 8; - } - if( pScript->zScript==0 ){ - pScript->zScript = &z[n]; - pScript->nScript = 0; - } - for(i=n+pScript->nScript; z[i]; i++){ - if( z[i]=='<' && z[i+1]=='/' && strnicmp(&z[i],zEnd,nEnd)==0 ){ - pScript->nScript = i - n; - p->pScript = 0; - n = i+nEnd; - break; - } - } - if( p->pScript ){ - pScript->nScript = i - n; - } - continue; - }else if( isspace(c) ){ - /* White space */ - for(i=0; (c=z[n+i])!=0 && isspace(c) && c!='\n' && c!='\r'; i++){} - if( c=='\r' && z[n+i+1]=='\n' ){ i++; } - pElem = HtmlAlloc( sizeof(HtmlSpaceElement) ); - if( pElem==0 ){ goto incomplete; } - pElem->base.type = Html_Space; - if( c=='\n' || c=='\r' ){ - pElem->base.flags = HTML_NewLine; - pElem->base.count = 1; - i++; - iCol = 0; - TestPoint(0); - }else{ - int iColStart = iCol; - pElem->base.flags = 0; - for(j=0; j<i; j++){ - iCol = NextColumn(iCol, z[n+j]); - TestPoint(0); - } - pElem->base.count = iCol - iColStart; - } - AppendElement(p,pElem); - n += i; - }else if( c!='<' || p->iPlaintext!=0 || - (!isalpha(z[n+1]) && z[n+1]!='/' && z[n+1]!='!' && z[n+1]!='?') ){ - /* Ordinary text */ - for(i=1; (c=z[n+i])!=0 && !isspace(c) && c!='<'; i++){} - if( c==0 ){ TestPoint(0); goto incomplete; } - if( p->iPlaintext!=0 && z[n]=='<' ){ - switch( p->iPlaintext ){ - case Html_LISTING: - if( i>=10 && strnicmp(&z[n],"</listing>",10)==0 ){ - p->iPlaintext = 0; - goto doMarkup; - } - break; - case Html_XMP: - if( i>=6 && strnicmp(&z[n],"</xmp>",6)==0 ){ - p->iPlaintext = 0; - goto doMarkup; - } - break; - case Html_TEXTAREA: - if( i>=11 && strnicmp(&z[n],"</textarea>",11)==0 ){ - p->iPlaintext = 0; - goto doMarkup; - } - break; - default: - break; - } - } - nByte = sizeof(HtmlTextElement) + i; - pElem = HtmlAlloc( nByte ); - if( pElem==0 ){ goto incomplete; } - memset(pElem,0,nByte); - pElem->base.type = Html_Text; - sprintf(pElem->text.zText,"%.*s",i,&z[n]); - AppendElement(p,pElem); - if( p->iPlaintext==0 || p->iPlaintext==Html_TEXTAREA ){ - HtmlTranslateEscapes(pElem->text.zText); - } - pElem->base.count = strlen(pElem->text.zText); - n += i; - iCol += i; - }else if( strncmp(&z[n],"<!--",4)==0 ){ - /* An HTML comment. Just skip it. */ - for(i=4; z[n+i]; i++){ - if( z[n+i]=='-' && strncmp(&z[n+i],"-->",3)==0 ){ break; } - } - if( z[n+i]==0 ){ TestPoint(0); goto incomplete; } - for(j=0; j<i+3; j++){ - iCol = NextColumn(iCol, z[n+j]); - } - n += i + 3; - }else{ - /* Markup. - ** - ** First get the name of the markup - */ -doMarkup: - argc = 1; - argv[0] = &z[n+1]; - for(i=1; (c=z[n+i])!=0 && !isspace(c) && c!='>' && (i<2 || c!='/'); i++){} - arglen[0] = i - 1; - if( c==0 ){ goto incomplete; } - - /* - ** Now parse up the arguments - */ - while( isspace(z[n+i]) ){ i++; } - while( (c=z[n+i])!=0 && c!='>' && (c!='/' || z[n+i+1]!='>') ){ - if( argc>mxARG-3 ){ - argc = mxARG-3; - } - argv[argc] = &z[n+i]; - j = 0; - while( (c=z[n+i+j])!=0 && !isspace(c) && c!='>' - && c!='=' && (c!='/' || z[n+i+j+1]!='>') ){ - j++; - } - arglen[argc] = j; - if( c==0 ){ goto incomplete; } - i += j; - while( isspace(c) ){ - i++; - c = z[n+i]; - } - if( c==0 ){ goto incomplete; } - argc++; - if( c!='=' ){ - argv[argc] = ""; - arglen[argc] = 0; - argc++; - continue; - } - i++; - c = z[n+i]; - while( isspace(c) ){ - i++; - c = z[n+i]; - } - if( c==0 ){ goto incomplete; } - if( c=='\'' || c=='"' ){ - int cQuote = c; - i++; - argv[argc] = &z[n+i]; - for(j=0; (c=z[n+i+j])!=0 && c!=cQuote; j++){} - if( c==0 ){ goto incomplete; } - arglen[argc] = j; - i += j+1; - TestPoint(0); - }else{ - argv[argc] = &z[n+i]; - for(j=0; (c=z[n+i+j])!=0 && !isspace(c) && c!='>'; j++){} - if( c==0 ){ goto incomplete; } - arglen[argc] = j; - i += j; - } - argc++; - while( isspace(z[n+i]) ){ i++; } - } - if( c=='/' ){ - i++; - c = z[n+i]; - selfClose = 1; - }else{ - selfClose = 0; - } - if( c==0 ){ goto incomplete; } - for(j=0; j<i+1; j++){ - iCol = NextColumn(iCol, z[n+j]); - } - n += i + 1; - - /* Lookup the markup name in the hash table - */ - if( !isInit ){ - HtmlHashInit(); - isInit = 1; - } - c = argv[0][arglen[0]]; - argv[0][arglen[0]] = 0; - h = HtmlHash(argv[0]); - for(pMap = apMap[h]; pMap; pMap=pMap->pCollide){ - if( stricmp(pMap->zName,argv[0])==0 ){ break; } - TestPoint(0); - } - argv[0][arglen[0]] = c; - if( pMap==0 ){ continue; } /* Ignore unknown markup */ - -makeMarkupEntry: - /* Construct a HtmlMarkup entry for this markup. - */ - if( pMap->extra ){ - nByte = pMap->extra; - }else if( argc==1 ){ - nByte = sizeof(HtmlBaseElement); - }else{ - nByte = sizeof(HtmlMarkupElement); - } - if( argc>1 ){ - nByte += sizeof(char*) * argc; - for(j=1; j<argc; j++){ - nByte += arglen[j] + 1; - } - } - pElem = HtmlAlloc( nByte ); - if( pElem==0 ){ goto incomplete; } - memset(pElem,0,nByte); - pElem->base.type = pMap->type; - pElem->base.count = argc - 1; - if( argc>1 ){ - if( pMap->extra ){ - pElem->markup.argv = (char**)&((char*)pElem)[pMap->extra]; - }else{ - pElem->markup.argv = (char**)&((HtmlMarkupElement*)pElem)[1]; - } - zBuf = (char*)&pElem->markup.argv[argc]; - for(j=1; j<argc; j++){ - pElem->markup.argv[j-1] = zBuf; - zBuf += arglen[j] + 1; - sprintf(pElem->markup.argv[j-1],"%.*s",arglen[j],argv[j]); - HtmlTranslateEscapes(pElem->markup.argv[j-1]); - if( (j&1)==1 ){ - ToLower(pElem->markup.argv[j-1]); - } - } - pElem->markup.argv[argc-1] = 0; - } - - /* The new markup has now be constructed in pElem. But before - ** appending to the list, check to see if there is a special - ** handler for this markup type. - */ - if( p->zHandler[pMap->type] ){ - Tcl_DString str; - Tcl_DStringInit(&str); - Tcl_DStringAppend(&str, p->zHandler[pMap->type], -1); - Tcl_DStringAppendElement(&str, pMap->zName); - Tcl_DStringStartSublist(&str); - for(j=0; j<argc-1; j++){ - Tcl_DStringAppendElement(&str, pElem->markup.argv[j]); - } - Tcl_DStringEndSublist(&str); - HtmlFree(pElem); - HtmlLock(p); - Tcl_GlobalEval(p->interp, Tcl_DStringValue(&str)); - Tcl_DStringFree(&str); - if( HtmlUnlock(p) ){ return 0; } - - /* Tricky, tricky. The callback might have caused the p->zText - ** pointer to change, so renew our copy of that pointer. The - ** callback might also have cleared or destroyed the widget. - ** If so, abort this routine. - */ - z = p->zText; - if( z==0 || p->tkwin==0 ){ - n = 0; - iCol = 0; - goto incomplete; - } - continue; - } - - /* No special handler for this markup. Just append it to the - ** list of all tokens. - */ - AppendElement(p,pElem); - switch( pMap->type ){ - case Html_PLAINTEXT: - case Html_LISTING: - case Html_XMP: - case Html_TEXTAREA: - p->iPlaintext = pMap->type; - break; - case Html_STYLE: - case Html_SCRIPT: - p->pScript = (HtmlScript*)pElem; - break; - default: - break; - } - - /* If this is self-closing markup (ex: <br/> or <img/>) then - ** synthesize a closing token. - */ - if( selfClose && argv[0][0]!='/' - && strcmp(&pMap[1].zName[1],pMap->zName)==0 ){ - selfClose = 0; - pMap++; - argc = 1; - goto makeMarkupEntry; - } - } - } -incomplete: - p->iCol = iCol; - return n; -} -/************************** End HTML Tokenizer Code ***************************/ - -/* -** Append text to the tokenizer engine. -** -** This routine (actually the Tokenize() subroutine that is called -** by this routine) may invoke a callback procedure which could delete -** the HTML widget. -*/ -void HtmlTokenizerAppend(HtmlWidget *htmlPtr, const char *zText){ - int len = strlen(zText); - if( htmlPtr->nText==0 ){ - htmlPtr->nAlloc = len + 100; - htmlPtr->zText = HtmlAlloc( htmlPtr->nAlloc ); - TestPoint(0); - }else if( htmlPtr->nText + len >= htmlPtr->nAlloc ){ - htmlPtr->nAlloc += len + 100; - htmlPtr->zText = HtmlRealloc( htmlPtr->zText, htmlPtr->nAlloc ); - TestPoint(0); - } - if( htmlPtr->zText==0 ){ - htmlPtr->nText = 0; - UNTESTED; - return; - } - strcpy(&htmlPtr->zText[htmlPtr->nText], zText); - htmlPtr->nText += len; - htmlPtr->nComplete = Tokenize(htmlPtr); -} - -/* -** This routine takes a text representation of a token, converts -** it into an HtmlElement structure and inserts it immediately -** prior to pToken. If pToken==0, then the newly created HtmlElement -** is appended. -** -** This routine does nothing to resize, restyle, relayout or redisplay -** the HTML. That is the calling routines responsibility. -** -** Return 0 if successful. Return non-zero if zType is not a known -** markup name. -*/ -int HtmlInsertToken( - HtmlWidget *htmlPtr, /* The widget into which the token is inserted */ - HtmlElement *pToken, /* Insert before this. Append if pToken==0 */ - char *zType, /* Type of markup. Ex: "/a" or "table" */ - char *zArgs /* List of arguments */ -){ - HtmlTokenMap *pMap; /* For searching the markup name hash table */ - int h; /* The hash on zType */ - HtmlElement *pElem; /* The new element */ - int nByte; /* How many bytes to allocate */ - int i; /* Loop counter */ - - if( !isInit ){ - HtmlHashInit(); - isInit = 1; - TestPoint(0); - }else{ - TestPoint(0); - } - h = HtmlHash(zType); - for(pMap = apMap[h]; pMap; pMap=pMap->pCollide){ - if( stricmp(pMap->zName,zType)==0 ){ TestPoint(0); break; } - TestPoint(0); - } - if( pMap==0 ){ TestPoint(0); return 1; } - - if( zArgs==0 || *zArgs==0 ){ - /* Special case of no arguments. This is a lot easier... */ - nByte = pMap->extra ? pMap->extra : sizeof(HtmlBaseElement); - nByte += strlen(zType); - pElem = HtmlAlloc( nByte ); - if( pElem==0 ){ TestPoint(0); return 1; } - memset(pElem,0,nByte); - pElem->base.type = pMap->type; - TestPoint(0); - }else{ - /* The general case. There are arguments that need to be parsed - ** up. This is slower, but we gotta do it. - */ - int argc; - const char **argv; - char *zBuf; - - if( Tcl_SplitList(htmlPtr->interp, zArgs, &argc, (const char***)&argv)!=TCL_OK ){ - TestPoint(0); - return 1; - } - if( pMap->extra ){ - nByte = pMap->extra; - TestPoint(0); - }else{ - nByte = sizeof(HtmlMarkupElement); - TestPoint(0); - } - nByte += sizeof(char*)*(argc+1) + strlen(zArgs) + argc + 2; - pElem = HtmlAlloc( nByte ); - if( pElem==0 ){ - HtmlFree(argv); - TestPoint(0); - return 1; - } - memset(pElem,0,nByte); - pElem->base.type = pMap->type; - pElem->base.count = argc; - if( pMap->extra ){ - pElem->markup.argv = (char**)&((char*)pElem)[pMap->extra]; - TestPoint(0); - }else{ - pElem->markup.argv = (char**)&((HtmlMarkupElement*)pElem)[1]; - TestPoint(0); - } - zBuf = (char*)&pElem->markup.argv[argc]; - for(i=1; i<argc; i++){ - pElem->markup.argv[i-1] = zBuf; - zBuf += strlen(argv[i]) + 1; - strcpy(pElem->markup.argv[i-1],argv[i]); - TestPoint(0); - } - pElem->markup.argv[argc-1] = 0; - HtmlFree(argv); - TestPoint(0); - } - if( pToken ){ - pElem->base.pNext = pToken; - pElem->base.pPrev = pToken->base.pPrev; - if( pToken->base.pPrev ){ - pToken->base.pPrev->pNext = pElem; - TestPoint(0); - }else{ - htmlPtr->pFirst = pElem; - TestPoint(0); - } - pToken->base.pPrev = pElem; - htmlPtr->nToken++; - }else{ - AppendElement(htmlPtr,pElem); - TestPoint(0); - } - return 0; -} - -/* -** Convert a markup name into a type integer -*/ -int HtmlNameToType(const char *zType){ - HtmlTokenMap *pMap; /* For searching the markup name hash table */ - int h; /* The hash on zType */ - - if( !isInit ){ - HtmlHashInit(); - isInit = 1; - TestPoint(0); - }else{ - TestPoint(0); - } - h = HtmlHash(zType); - for(pMap = apMap[h]; pMap; pMap=pMap->pCollide){ - if( stricmp(pMap->zName,zType)==0 ){ TestPoint(0); break; } - TestPoint(0); - } - return pMap ? pMap->type : Html_Unknown; -} - -/* -** Convert a type into a symbolic name -*/ -const char *HtmlTypeToName(int type){ - if( type>=Html_A && type<=Html_EndXMP ){ - HtmlTokenMap *pMap = apMap[type - Html_A]; - TestPoint(0); - return pMap->zName; - }else{ - TestPoint(0); - return "???"; - } -} - -/* -** For debugging purposes, print information about a token -*/ -char *HtmlTokenName(HtmlElement *p){ -#ifdef DEBUG - static char zBuf[200]; - int j; - char *zName; - - if( p==0 ) return "NULL"; - switch( p->base.type ){ - case Html_Text: - sprintf(zBuf,"\"%.*s\"",p->base.count,p->text.zText); - break; - case Html_Space: - if( p->base.flags & HTML_NewLine ){ - sprintf(zBuf,"\"\\n\""); - }else{ - sprintf(zBuf,"\" \""); - } - break; - case Html_Block: - if( p->block.n>0 ){ - int n = p->block.n; - if( n>150 ) n = 150; - sprintf(zBuf,"<Block z=\"%.*s\">", n, p->block.z); - }else{ - sprintf(zBuf,"<Block>"); - } - break; - default: - if( p->base.type >= HtmlMarkupMap[0].type - && p->base.type <= HtmlMarkupMap[HTML_MARKUP_COUNT-1].type ){ - zName = HtmlMarkupMap[p->base.type - HtmlMarkupMap[0].type].zName; - }else{ - zName = "Unknown"; - } - sprintf(zBuf,"<%s",zName); - for(j=1; j<p->base.count; j += 2){ - sprintf(&zBuf[strlen(zBuf)]," %s=%s", - p->markup.argv[j-1],p->markup.argv[j]); - } - strcat(zBuf,">"); - break; - } - return zBuf; -#else - return 0; -#endif -} - -/* -** Return all tokens between the two elements as a Tcl list. -*/ -void HtmlTclizeList(Tcl_Interp *interp, HtmlElement *p, HtmlElement *pEnd){ - Tcl_DString str; - int i; - char *zName; - char zLine[100]; - - Tcl_DStringInit(&str); - while( p && p!=pEnd ){ - switch( p->base.type ){ - case Html_Block: - break; - case Html_Text: - Tcl_DStringStartSublist(&str); - Tcl_DStringAppendElement(&str,"Text"); - Tcl_DStringAppendElement(&str, p->text.zText); - Tcl_DStringEndSublist(&str); - break; - case Html_Space: - sprintf(zLine,"Space %d %d", - p->base.count, (p->base.flags & HTML_NewLine)!=0); - Tcl_DStringAppendElement(&str,zLine); - break; - case Html_Unknown: - Tcl_DStringAppendElement(&str,"Unknown"); - break; - default: - Tcl_DStringStartSublist(&str); - Tcl_DStringAppendElement(&str,"Markup"); - if( p->base.type >= HtmlMarkupMap[0].type - && p->base.type <= HtmlMarkupMap[HTML_MARKUP_COUNT-1].type ){ - zName = HtmlMarkupMap[p->base.type - HtmlMarkupMap[0].type].zName; - }else{ - zName = "Unknown"; - } - Tcl_DStringAppendElement(&str, zName); - for(i=0; i<p->base.count; i++){ - Tcl_DStringAppendElement(&str, p->markup.argv[i]); - } - Tcl_DStringEndSublist(&str); - break; - } - p = p->pNext; - } - Tcl_DStringResult(interp, &str); -} - -/* -** Print a list of tokens -*/ -#ifdef DEBUG -void HtmlPrintList(HtmlElement *p, HtmlElement *pEnd){ - while( p && p!=pEnd ){ - if( p->base.type==Html_Block ){ - char *z = p->block.z; - int n = p->block.n; - if( n==0 || z==0 ){ - n = 1; - z = ""; - } - printf("Block 0x%08x flags=%02x cnt=%d x=%d..%d y=%d..%d z=\"%.*s\"\n", - (int)p, p->base.flags, p->base.count, p->block.left, p->block.right, - p->block.top, p->block.bottom, n, z); - }else{ - printf("Token 0x%08x font=%2d color=%2d align=%d flags=0x%04x name=%s\n", - (int)p, p->base.style.font, p->base.style.color, - p->base.style.align, p->base.style.flags, HtmlTokenName(p)); - } - p = p->pNext; - } -} -#endif |