Merge commit '333069975324629e46636ca439dc7edb838449a3' as 'tkhtml1'

author: William Joye <wjoye@cfa.harvard.edu> 2016-10-17 15:25:29 (GMT)
committer: William Joye <wjoye@cfa.harvard.edu> 2016-10-17 15:25:29 (GMT)
commit: a62a72569befa24bccfc2ef198be80f3cb1e94e9 (patch)
tree: 5b80188ca040ec1942d235792ffc4905cd2ecf27 /tkhtml1/src/htmlparse.c
parent: 9967eb08e8dd098ffec7e70fa72549e5f7dc1e42 (diff)
parent: 333069975324629e46636ca439dc7edb838449a3 (diff)
download: blt-a62a72569befa24bccfc2ef198be80f3cb1e94e9.zip
blt-a62a72569befa24bccfc2ef198be80f3cb1e94e9.tar.gz
blt-a62a72569befa24bccfc2ef198be80f3cb1e94e9.tar.bz2
1 files changed, 1181 insertions, 0 deletions
diff --git a/tkhtml1/src/htmlparse.c b/tkhtml1/src/htmlparse.c
new file mode 100644
index 0000000..4511005
--- /dev/null
+++ b/tkhtml1/src/htmlparse.c
@@ -0,0 +1,1181 @@
+/*
+** A tokenizer that converts raw HTML into a linked list of HTML elements.
+**
+** Copyright (C) 1997-2000 D. Richard Hipp
+**
+** This library is free software; you can redistribute it and/or
+** modify it under the terms of the GNU Library General Public
+** License as published by the Free Software Foundation; either
+** version 2 of the License, or (at your option) any later version.
+**
+** This library is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+** Library General Public License for more details.
+** 
+** You should have received a copy of the GNU Library General Public
+** License along with this library; if not, write to the
+** Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+** Boston, MA  02111-1307, USA.
+**
+** Author contact information:
+**   drh@acm.org
+**   http://www.hwaci.com/drh/
+*/
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <tk.h>
+#include "htmlparse.h"
+
+/****************** Begin Escape Sequence Translator *************/
+/*
+** The next section of code implements routines used to translate
+** the '&' escape sequences of SGML to individual characters.
+** Examples:
+**
+**         &amp;          &
+**         &lt;           <
+**         &gt;           >
+**         &nbsp;         nonbreakable space
+*/
+
+/* One named escape sequence.  Instances are declared in the static
+** esc_sequences[] table; EscInit() chains them into apEscHash[] via pNext.
+*/
+struct sgEsc {
+  char *zName;            /* Name without the leading '&' or trailing ';'.  ex:  "amp" */
+  char  value[8];         /* NUL-terminated replacement bytes.  ex:  "&" */
+  struct sgEsc *pNext;    /* Next sequence whose zName hashes to the same bucket */
+};
+
+/* The table of all named escape sequences.  Add a new sequence by adding
+** an entry here; pNext starts as 0 and is filled in at run time by EscInit().
+*/
+static struct sgEsc esc_sequences[] = {
+  { "quot",      "\"",    0 },
+  { "amp",       "&",     0 },
+  { "lt",        "<",     0 },
+  { "gt",        ">",     0 },
+  { "nbsp",      " ",     0 },
+  { "iexcl",     "\241",  0 },
+  { "cent",      "\242",  0 },
+  { "pound",     "\243",  0 },
+  { "curren",    "\244",  0 },
+  { "yen",       "\245",  0 },
+  { "brvbar",    "\246",  0 },
+  { "sect",      "\247",  0 },
+  { "uml",       "\250",  0 },
+  { "copy",      "\251",  0 },
+  { "ordf",      "\252",  0 },
+  { "laquo",     "\253",  0 },
+  { "not",       "\254",  0 },
+  { "shy",       "\255",  0 },
+  { "reg",       "\256",  0 },
+  { "macr",      "\257",  0 },
+  { "deg",       "\260",  0 },
+  { "plusmn",    "\261",  0 },
+  { "sup2",      "\262",  0 },
+  { "sup3",      "\263",  0 },
+  { "acute",     "\264",  0 },
+  { "micro",     "\265",  0 },
+  { "para",      "\266",  0 },
+  { "middot",    "\267",  0 },
+  { "cedil",     "\270",  0 },
+  { "sup1",      "\271",  0 },
+  { "ordm",      "\272",  0 },
+  { "raquo",     "\273",  0 },
+  { "frac14",    "\274",  0 },
+  { "frac12",    "\275",  0 },
+  { "frac34",    "\276",  0 },
+  { "iquest",    "\277",  0 },
+  { "Agrave",    "\300",  0 },
+  { "Aacute",    "\301",  0 },
+  { "Acirc",     "\302",  0 },
+  { "Atilde",    "\303",  0 },
+  { "Auml",      "\304",  0 },
+  { "Aring",     "\305",  0 },
+  { "AElig",     "\306",  0 },
+  { "Ccedil",    "\307",  0 },
+  { "Egrave",    "\310",  0 },
+  { "Eacute",    "\311",  0 },
+  { "Ecirc",     "\312",  0 },
+  { "Euml",      "\313",  0 },
+  { "Igrave",    "\314",  0 },
+  { "Iacute",    "\315",  0 },
+  { "Icirc",     "\316",  0 },
+  { "Iuml",      "\317",  0 },
+  { "ETH",       "\320",  0 },
+  { "Ntilde",    "\321",  0 },
+  { "Ograve",    "\322",  0 },
+  { "Oacute",    "\323",  0 },
+  { "Ocirc",     "\324",  0 },
+  { "Otilde",    "\325",  0 },
+  { "Ouml",      "\326",  0 },
+  { "times",     "\327",  0 },
+  { "Oslash",    "\330",  0 },
+  { "Ugrave",    "\331",  0 },
+  { "Uacute",    "\332",  0 },
+  { "Ucirc",     "\333",  0 },
+  { "Uuml",      "\334",  0 },
+  { "Yacute",    "\335",  0 },
+  { "THORN",     "\336",  0 },
+  { "szlig",     "\337",  0 },
+  { "agrave",    "\340",  0 },
+  { "aacute",    "\341",  0 },
+  { "acirc",     "\342",  0 },
+  { "atilde",    "\343",  0 },
+  { "auml",      "\344",  0 },
+  { "aring",     "\345",  0 },
+  { "aelig",     "\346",  0 },
+  { "ccedil",    "\347",  0 },
+  { "egrave",    "\350",  0 },
+  { "eacute",    "\351",  0 },
+  { "ecirc",     "\352",  0 },
+  { "euml",      "\353",  0 },
+  { "igrave",    "\354",  0 },
+  { "iacute",    "\355",  0 },
+  { "icirc",     "\356",  0 },
+  { "iuml",      "\357",  0 },
+  { "eth",       "\360",  0 },
+  { "ntilde",    "\361",  0 },
+  { "ograve",    "\362",  0 },
+  { "oacute",    "\363",  0 },
+  { "ocirc",     "\364",  0 },
+  { "otilde",    "\365",  0 },
+  { "ouml",      "\366",  0 },
+  { "divide",    "\367",  0 },
+  { "oslash",    "\370",  0 },
+  { "ugrave",    "\371",  0 },
+  { "uacute",    "\372",  0 },
+  { "ucirc",     "\373",  0 },
+  { "uuml",      "\374",  0 },
+  { "yacute",    "\375",  0 },
+  { "thorn",     "\376",  0 },
+  { "yuml",      "\377",  0 },
+};
+
+/* The number of buckets in the escape sequence hash table apEscHash[]:
+** the number of entries in esc_sequences[] plus 7.  A prime near the entry
+** count would work best; this expression does not guarantee a prime. */
+#define ESC_HASH_SIZE (sizeof(esc_sequences)/sizeof(esc_sequences[0])+7)
+
+/* The escape sequence hash table.
+**
+** If the name of an escape sequence hashes to the value H, then
+** apEscHash[H] points to a linked list of sgEsc structures (chained
+** through pNext), one of which is the entry for that escape sequence.
+*/
+static struct sgEsc *apEscHash[ESC_HASH_SIZE];
+
+/* Hash an escape sequence name.  The value returned is an integer
+** between 0 and ESC_HASH_SIZE-1, inclusive.  Unsigned math: shifts wrap.
+*/
+static int EscHash(const char *zName){
+  unsigned int h = 0;   /* The hash accumulator; unsigned so "<<" cannot overflow */
+  char c;               /* The next character in the name being hashed */
+
+  while( (c=*zName)!=0 ){
+    h = h<<5 ^ h ^ c;
+    zName++;
+    TestPoint(0);
+  }
+  if( h>(~0u>>1) ){     /* Same test as "h<0" on the old signed accumulator */
+    h = 0u - h;         /* Same bits as the old "h = -h", without overflow */
+    TestPoint(0);
+  }else{
+    TestPoint(0);
+  }
+  return h % ESC_HASH_SIZE;
+}
+
+#ifdef TEST
+/*
+** Print the longest and average collision chain length, and the number of
+** empty slots, for the escape sequence hash table (all ESC_HASH_SIZE slots).
+*/
+static void EscHashStats(void){
+  int i;                /* Index of the bucket being examined */
+  int sum = 0;          /* Total number of entries seen in all chains */
+  int max = 0;          /* Length of the longest chain seen */
+  int cnt;              /* Length of the current chain */
+  int notempty = 0;     /* Number of buckets holding at least one entry */
+  struct sgEsc *p;
+
+  for(i=0; i<(int)ESC_HASH_SIZE; i++){   /* Walk buckets, not table entries */
+    cnt = 0;
+    p = apEscHash[i];
+    if( p ) notempty++;
+    while( p ){
+      cnt++;
+      p = p->pNext;
+    }
+    sum += cnt;
+    if( cnt>max ) max = cnt;
+  }
+  printf("Longest chain=%d  avg=%g  slots=%d  empty=%d (%g%%)\n",
+     max,(double)sum/(double)notempty, i, i-notempty,
+     100.0*(i-notempty)/(double)i);
+}
+#endif
+
+/* Initialize the escape sequence hash table: push each esc_sequences[] entry
+** onto the front of the apEscHash[] chain selected by EscHash(zName). */
+static void EscInit(void){
+  int i;  /* For looping thru the list of escape sequences */
+  int h;  /* The hash on a sequence */
+
+  for(i=0; i<sizeof(esc_sequences)/sizeof(esc_sequences[i]); i++){
+/* Disabled (was: #ifdef TCL_UTF_MAX): in-place UTF-8 conversion of value[] */
+#if 0
+    {
+      int c = esc_sequences[i].value[0];
+      Tcl_UniCharToUtf(c, esc_sequences[i].value);
+    }
+#endif
+    h = EscHash(esc_sequences[i].zName);
+    esc_sequences[i].pNext = apEscHash[h];   /* Old chain head follows us */
+    apEscHash[h] = &esc_sequences[i];        /* We become the new chain head */
+    TestPoint(0);
+  }
+#ifdef TEST
+  EscHashStats();
+#endif
+}
+
+/*
+** This table translates the non-standard Microsoft characters between
+** 0x80 and 0x9f into plain ASCII so that the characters will be visible
+** on Unix systems.  It is indexed by (code & 0x1f), and every replacement
+** is a value less than 0x80, to avoid UTF-8 problems.
+*/
+#ifndef __WIN32__
+static char acMsChar[] = {
+  /* 0x80 */ 'C',
+  /* 0x81 */ ' ',
+  /* 0x82 */ ',',
+  /* 0x83 */ 'f',
+  /* 0x84 */ '"',
+  /* 0x85 */ '.',
+  /* 0x86 */ '*',
+  /* 0x87 */ '*',
+  /* 0x88 */ '^',
+  /* 0x89 */ '%',
+  /* 0x8a */ 'S',
+  /* 0x8b */ '<',
+  /* 0x8c */ 'O',
+  /* 0x8d */ ' ',
+  /* 0x8e */ 'Z',
+  /* 0x8f */ ' ',
+  /* 0x90 */ ' ',
+  /* 0x91 */ '\'',
+  /* 0x92 */ '\'',
+  /* 0x93 */ '"',
+  /* 0x94 */ '"',
+  /* 0x95 */ '*',
+  /* 0x96 */ '-',
+  /* 0x97 */ '-',
+  /* 0x98 */ '~',
+  /* 0x99 */ '@',
+  /* 0x9a */ 's',
+  /* 0x9b */ '>',
+  /* 0x9c */ 'o',
+  /* 0x9d */ ' ',
+  /* 0x9e */ 'z',
+  /* 0x9f */ 'Y',
+};
+#endif
+
+/* Translate escape sequences in the string "z".  "z" is overwritten
+** in place with the translated sequence (named "&name;" and numeric
+** "&#NNN;" forms; the trailing ';' is optional).
+** Unrecognized escape sequences are unaltered.
+**
+** Example:
+**
+**      input =    "AT&amp;T &gt MCI"
+**      output =   "AT&T > MCI"
+*/
+LOCAL void HtmlTranslateEscapes(char *z){
+  int from;   /* Read characters from this position in z[] */
+  int to;     /* Write characters into this position in z[] */
+  int h;      /* A hash on the escape sequence */
+  struct sgEsc *p;   /* For looping down the escape sequence collision chain */
+  static int isInit = 0;   /* True after initialization */
+
+  from = to = 0;
+  if( !isInit ){
+    EscInit();
+    isInit = 1;
+  }
+  while( z[from] ){
+    if( z[from]=='&' ){
+      if( z[from+1]=='#' ){
+        int i = from + 2;
+        int v = 0;
+        while( isdigit((unsigned char)z[i]) ){   /* ctype needs unsigned char values */
+          if( v<0x110000 ) v = v*10 + z[i] - '0';  /* Stop growing: no int overflow */
+          i++;
+        }
+        if( z[i]==';' ){ i++; }
+
+        /* On Unix systems, translate the non-standard microsoft
+        ** characters in the range of 0x80 to 0x9f into something
+        ** we can see.
+        */
+#ifndef __WIN32__
+        if( v>=0x80 && v<0xa0 ){
+          v = acMsChar[v&0x1f];
+        }
+#endif
+        /* Put the character in the output stream in place of
+        ** the "&#000;".  How we do this depends on whether or
+        ** not we are using UTF-8.
+        */
+#ifdef TCL_UTF_MAX
+        {
+          int j, n;
+          char value[8];
+          n = Tcl_UniCharToUtf(v,value);
+          for(j=0; j<n; j++){
+            z[to++] = value[j];
+          }
+        }
+#else
+        z[to++] = v;
+#endif
+        from = i;
+      }else{
+        int i = from+1;
+        int c;
+        while( z[i] && isalnum((unsigned char)z[i]) ){ TestPoint(0); i++; }
+        c = z[i];          /* Save the byte after the name ... */
+        z[i] = 0;          /* ... and terminate the name temporarily */
+        h = EscHash(&z[from+1]);
+        p = apEscHash[h];
+        while( p && strcmp(p->zName,&z[from+1])!=0 ){ 
+          p = p->pNext; 
+        }
+        z[i] = c;          /* Restore the saved byte */
+        if( p ){
+          int j;
+          for(j=0; p->value[j]; j++){
+            z[to++] = p->value[j];
+          }
+          from = i;
+          if( c==';' ){
+            from++;
+          }
+        }else{
+          z[to++] = z[from++];   /* Unknown name: copy the '&' through unchanged */
+        }
+      }
+
+    /* On UNIX systems, look for the non-standard microsoft characters 
+    ** between 0x80 and 0x9f and translate them into printable ASCII
+    ** codes.  Separate algorithms are required to do this for plain
+    ** ascii and for utf-8.
+    */
+#ifndef __WIN32__
+#ifdef TCL_UTF_MAX
+    }else if( (z[from]&0x80)!=0 ){
+      Tcl_UniChar c;
+      int n;
+      n = Tcl_UtfToUniChar(&z[from], &c);
+      if( c>=0x80 && c<0xa0 ){
+        z[to++] = acMsChar[c & 0x1f];
+        from += n;
+      }else{
+        while( n-- ) z[to++] = z[from++];
+      }
+#else /* if !defined(TCL_UTF_MAX) */
+    }else if( ((unsigned char)z[from])>=0x80 && ((unsigned char)z[from])<0xa0 ){
+      z[to++] = acMsChar[z[from++]&0x1f];
+#endif /* TCL_UTF_MAX */
+#endif /* __WIN32__ */
+    }else{
+      z[to++] = z[from++];
+      TestPoint(0);
+    }
+  }
+  z[to] = 0;
+}
+/******************* End Escape Sequence Translator ***************/
+
+/******************* Begin HTML tokenizer code *******************/
+/*
+** The following variable becomes TRUE once HtmlHashInit() has chained the
+** HtmlMarkupMap[] entries into the markup hash table apMap[].
+*/
+static int isInit = 0;
+
+/* The hash table for HTML markup names.
+**
+** If an HTML markup name hashes to H, then apMap[H] will point to
+** a linked list of HtmlTokenMap structures (chained through pCollide),
+** one of which will describe the particular markup (if it exists.)
+*/
+static HtmlTokenMap *apMap[HTML_MARKUP_HASH_SIZE];
+
+/* Hash a markup name
+**
+** HTML markup is case insensitive, so this function will give the
+** same hash regardless of the case of the markup name.
+**
+** The value returned is an integer between 0 and HTML_MARKUP_HASH_SIZE-1,
+** inclusive, for any input bytes (the accumulator is unsigned).
+*/
+static int HtmlHash(const char *zName){
+  unsigned int h = 0;   /* Unsigned so that "<<" wraps instead of overflowing */
+  char c;
+  while( (c=*zName)!=0 ){
+    if( isupper((unsigned char)c) ){     /* ctype needs unsigned char values */
+      c = tolower((unsigned char)c);
+    }
+    h = h<<5 ^ h ^ c;
+    zName++;
+  }
+  if( h>(~0u>>1) ){     /* Same test as "h<0" on the old signed accumulator */
+    h = 0u - h;
+  }
+  return (int)(h % (unsigned int)HTML_MARKUP_HASH_SIZE);  /* Never negative */
+}
+
+#ifdef TEST
+/*
+** Print the longest and average collision chain length, and the number of
+** empty slots, for the markup hash table (all HTML_MARKUP_HASH_SIZE slots).
+*/
+static void HtmlHashStats(void){
+  int i;                /* Index of the bucket being examined */
+  int sum = 0;          /* Total number of entries seen in all chains */
+  int max = 0;          /* Length of the longest chain seen */
+  int cnt;              /* Length of the current chain */
+  int notempty = 0;     /* Number of buckets holding at least one entry */
+  HtmlTokenMap *p;      /* Same element type as apMap[] */
+
+  for(i=0; i<HTML_MARKUP_HASH_SIZE; i++){   /* Walk buckets, not table entries */
+    cnt = 0;
+    p = apMap[i];
+    if( p ) notempty++;
+    while( p ){
+      cnt++;
+      p = p->pCollide;
+    }
+    sum += cnt;
+    if( cnt>max ) max = cnt;
+
+  }
+  printf("longest chain=%d  avg=%g  slots=%d  empty=%d (%g%%)\n",
+    max, (double)sum/(double)notempty, i, i-notempty,
+    100.0*(i-notempty)/(double)i);
+}
+#endif
+
+/* Initialize the markup name hash table: push each HtmlMarkupMap[] entry
+** onto the front of the apMap[] chain selected by HtmlHash(zName). */
+static void HtmlHashInit(void){
+  int i;  /* For looping thru the list of markup names */
+  int h;  /* The hash on a markup name */
+
+  for(i=0; i<HTML_MARKUP_COUNT; i++){
+    h = HtmlHash(HtmlMarkupMap[i].zName);
+    HtmlMarkupMap[i].pCollide = apMap[h];   /* Old chain head follows us */
+    apMap[h] = &HtmlMarkupMap[i];           /* We become the new chain head */
+  }
+#ifdef TEST
+  HtmlHashStats();
+#endif
+}
+
+/*
+** Append the given HtmlElement to the tokenizer's list of elements
+*/
+static void AppendElement(HtmlWidget *p, HtmlElement *pElem){
+  pElem->base.pNext = 0;               /* New element becomes the tail */
+  pElem->base.pPrev = p->pLast;
+  if( p->pFirst==0 ){
+    p->pFirst = pElem;                 /* List was empty */
+  }else{
+    p->pLast->base.pNext = pElem;
+  }
+  p->pLast = pElem;
+  p->nToken++;
+}
+
+/*
+** Compute the new column index following the given character.
+*/
+static int NextColumn(int iCol, char c){
+  switch( c ){
+    case '\n': return 0;                /* Newline restarts at column 0 */
+    case '\t': return (iCol | 7) + 1;   /* Advance to the next multiple of 8 */
+    default:   return iCol+1;
+  }
+  /* NOT REACHED */
+}
+
+/*
+** Convert a string to all lower-case letters, in place.
+*/
+static void ToLower(char *z){
+  while( *z ){
+    if( isupper((unsigned char)*z) ) *z = tolower((unsigned char)*z);  /* no negative ctype args */
+    z++;
+  }
+}
+
+/* Process as much of the input HTML as possible.  Construct new
+** HtmlElement structures and append them to the list.  Return the
+** offset into p->zText up to which input has been processed.
+**
+** This routine may invoke a callback procedure which could delete
+** the HTML widget; if HtmlUnlock() then returns nonzero, 0 is returned.
+**
+** This routine is not reentrant for the same HTML widget.  To
+** prevent reentrancy (during a callback), the p->iCol field is
+** set to a negative number.  This is a flag to future invocations
+** not to reenter this routine.  The p->iCol field is restored
+** before exiting through the "incomplete" label.
+*/
+static int Tokenize(
+  HtmlWidget *p        /* The HTML widget doing the parsing */
+){
+  char *z;             /* The input HTML text */
+  int c;               /* The next character of input */
+  int n;               /* Number of characters processed so far */
+  int iCol;            /* Column of input */
+  int i, j;            /* Loop counters */
+  int h;               /* Result from HtmlHash() */
+  int nByte;           /* Space allocated for a single HtmlElement */
+  HtmlElement *pElem;  /* A new HTML element */
+  int selfClose;       /* True for content free elements.  Ex:  <br/> */
+  int argc;            /* The number of arguments on a markup */
+  HtmlTokenMap *pMap;  /* For searching the markup name hash table */
+  char *zBuf;          /* For handing out buffer space */
+# define mxARG 200     /* Maximum number of parameters in a single markup */
+  char *argv[mxARG];   /* Pointers to each markup argument. */
+  int arglen[mxARG];   /* Length of each markup argument */
+
+  iCol = p->iCol;
+  n = p->nComplete;
+  z = p->zText;
+  if( iCol<0 ){ TestPoint(0); return n; }   /* Prevents recursion */
+  p->iCol = -1;
+  while( (c=z[n])!=0 ){
+    if( p->pScript ){
+      /* We are in the middle of <SCRIPT>...</SCRIPT>.  Just look for
+      ** the </SCRIPT> markup.  (later:)  Treat <STYLE>...</STYLE> the
+      ** same way. */
+      HtmlScript *pScript = p->pScript;
+      char *zEnd;
+      int nEnd;
+      if( pScript->markup.base.type==Html_SCRIPT ){
+        zEnd = "</script>";
+        nEnd = 9;
+      }else{
+        zEnd = "</style>";
+        nEnd = 8;
+      }
+      if( pScript->zScript==0 ){
+        pScript->nScript = 0;
+      }
+      pScript->zScript = &z[n];   /* Re-point every time: z may have been reallocated */
+      for(i=n+pScript->nScript; z[i]; i++){   /* Resume where the last scan stopped */
+        if( z[i]=='<' && z[i+1]=='/' && strnicmp(&z[i],zEnd,nEnd)==0 ){
+          pScript->nScript = i - n;
+          p->pScript = 0;
+          n = i+nEnd;
+          break;
+        }
+      }
+      if( p->pScript ){
+        pScript->nScript = i - n;
+        goto incomplete;   /* End tag not buffered yet; looping again would never end */
+      }
+    }else if( isspace((unsigned char)c) ){
+      /* White space */
+      for(i=0; (c=z[n+i])!=0 && isspace((unsigned char)c) && c!='\n' && c!='\r'; i++){}
+      if( c=='\r' && z[n+i+1]=='\n' ){ i++; }
+      pElem = HtmlAlloc( sizeof(HtmlSpaceElement) );
+      if( pElem==0 ){ goto incomplete; }
+      pElem->base.type = Html_Space;
+      if( c=='\n' || c=='\r' ){
+        pElem->base.flags = HTML_NewLine;
+        pElem->base.count = 1;
+        i++;
+        iCol = 0;
+        TestPoint(0);
+      }else{
+        int iColStart = iCol;
+        pElem->base.flags = 0;
+        for(j=0; j<i; j++){
+          iCol = NextColumn(iCol, z[n+j]);
+          TestPoint(0);
+        }
+        pElem->base.count = iCol - iColStart;
+      }
+      AppendElement(p,pElem);
+      n += i;
+    }else if( c!='<' || p->iPlaintext!=0 || 
+      (!isalpha((unsigned char)z[n+1]) && z[n+1]!='/' && z[n+1]!='!' && z[n+1]!='?') ){
+      /* Ordinary text */
+      for(i=1; (c=z[n+i])!=0 && !isspace((unsigned char)c) && c!='<'; i++){}
+      if( c==0 ){ TestPoint(0); goto incomplete; }
+      if( p->iPlaintext!=0 && z[n]=='<' ){
+        switch( p->iPlaintext ){
+          case Html_LISTING:
+            if( i>=10 && strnicmp(&z[n],"</listing>",10)==0 ){
+              p->iPlaintext = 0;
+              goto doMarkup;
+            }
+            break;
+          case Html_XMP:
+            if( i>=6 && strnicmp(&z[n],"</xmp>",6)==0 ){
+              p->iPlaintext = 0;
+              goto doMarkup;
+            }
+            break;
+          case Html_TEXTAREA:
+            if( i>=11 && strnicmp(&z[n],"</textarea>",11)==0 ){
+              p->iPlaintext = 0;
+              goto doMarkup;
+            }
+            break;
+          default:
+            break;
+        }
+      }
+      nByte = sizeof(HtmlTextElement) + i;
+      pElem = HtmlAlloc( nByte );
+      if( pElem==0 ){ goto incomplete; }
+      memset(pElem,0,nByte);
+      pElem->base.type = Html_Text;
+      sprintf(pElem->text.zText,"%.*s",i,&z[n]);
+      AppendElement(p,pElem);
+      if( p->iPlaintext==0 || p->iPlaintext==Html_TEXTAREA ){
+        HtmlTranslateEscapes(pElem->text.zText);
+      }
+      pElem->base.count = strlen(pElem->text.zText);
+      n += i;
+      iCol += i;
+    }else if( strncmp(&z[n],"<!--",4)==0 ){
+      /* An HTML comment.  Just skip it. */
+      for(i=4; z[n+i]; i++){
+        if( z[n+i]=='-' && strncmp(&z[n+i],"-->",3)==0 ){ break; }
+      }
+      if( z[n+i]==0 ){ TestPoint(0); goto incomplete; }
+      for(j=0; j<i+3; j++){
+        iCol = NextColumn(iCol, z[n+j]);
+      }
+      n += i + 3;
+    }else{
+      /* Markup.
+      **
+      ** First get the name of the markup
+      */
+doMarkup:
+      argc = 1;
+      argv[0] = &z[n+1];
+      for(i=1; (c=z[n+i])!=0 && !isspace((unsigned char)c) && c!='>' && (i<2 || c!='/'); i++){}
+      arglen[0] = i - 1;
+      if( c==0 ){ goto incomplete; }
+
+      /*
+      ** Now parse up the arguments
+      */
+      while( isspace((unsigned char)z[n+i]) ){ i++; }
+      while( (c=z[n+i])!=0 && c!='>' && (c!='/' || z[n+i+1]!='>') ){
+        if( argc>mxARG-3 ){
+          argc = mxARG-3;   /* Table full: later arguments overwrite the last pair */
+        }
+        argv[argc] = &z[n+i];
+        j = 0;
+        while( (c=z[n+i+j])!=0 && !isspace((unsigned char)c) && c!='>' 
+               && c!='=' && (c!='/' || z[n+i+j+1]!='>') ){
+          j++;
+        }
+        arglen[argc] = j;
+        if( c==0 ){  goto incomplete; }
+        i += j;
+        while( isspace((unsigned char)c) ){
+          i++;
+          c = z[n+i];
+        }
+        if( c==0 ){ goto incomplete; }
+        argc++;
+        if( c!='=' ){
+          argv[argc] = "";   /* Attribute without "=value": record an empty value */
+          arglen[argc] = 0;
+          argc++;
+          continue;
+        }
+        i++;
+        c = z[n+i];
+        while( isspace((unsigned char)c) ){
+          i++;
+          c = z[n+i];
+        }
+        if( c==0 ){ goto incomplete; }
+        if( c=='\'' || c=='"' ){
+          int cQuote = c;
+          i++;
+          argv[argc] = &z[n+i];
+          for(j=0; (c=z[n+i+j])!=0 && c!=cQuote; j++){}
+          if( c==0 ){ goto incomplete; }
+          arglen[argc] = j;
+          i += j+1;
+          TestPoint(0);
+        }else{
+          argv[argc] = &z[n+i];
+          for(j=0; (c=z[n+i+j])!=0 && !isspace((unsigned char)c) && c!='>'; j++){}
+          if( c==0 ){ goto incomplete; }
+          arglen[argc] = j;
+          i += j;
+        }
+        argc++;
+        while( isspace((unsigned char)z[n+i]) ){ i++; }
+      }
+      if( c=='/' ){
+        i++;
+        c = z[n+i];
+        selfClose = 1;
+      }else{
+        selfClose = 0;
+      }
+      if( c==0 ){ goto incomplete; }
+      for(j=0; j<i+1; j++){
+        iCol = NextColumn(iCol, z[n+j]);
+      }
+      n += i + 1;
+
+      /* Lookup the markup name in the hash table 
+      */
+      if( !isInit ){
+        HtmlHashInit();
+        isInit = 1;
+      }
+      c = argv[0][arglen[0]];      /* Save the byte after the name ... */
+      argv[0][arglen[0]] = 0;      /* ... and terminate the name temporarily */
+      h = HtmlHash(argv[0]);
+      for(pMap = apMap[h]; pMap; pMap=pMap->pCollide){
+        if( stricmp(pMap->zName,argv[0])==0 ){ break; }
+        TestPoint(0);
+      }
+      argv[0][arglen[0]] = c;      /* Restore the saved byte */
+      if( pMap==0 ){ continue; }  /* Ignore unknown markup */
+
+makeMarkupEntry:
+      /* Construct a HtmlMarkup entry for this markup.
+      */ 
+      if( pMap->extra ){
+        nByte = pMap->extra;
+      }else if( argc==1 ){
+        nByte = sizeof(HtmlBaseElement);
+      }else{
+        nByte = sizeof(HtmlMarkupElement);
+      }
+      if( argc>1 ){
+        nByte += sizeof(char*) * argc;
+        for(j=1; j<argc; j++){
+          nByte += arglen[j] + 1;
+        }
+      }
+      pElem = HtmlAlloc( nByte );
+      if( pElem==0 ){ goto incomplete; }
+      memset(pElem,0,nByte);
+      pElem->base.type = pMap->type;
+      pElem->base.count = argc - 1;
+      if( argc>1 ){
+        if( pMap->extra ){
+          pElem->markup.argv = (char**)&((char*)pElem)[pMap->extra];
+        }else{
+          pElem->markup.argv = (char**)&((HtmlMarkupElement*)pElem)[1];
+        }
+        zBuf = (char*)&pElem->markup.argv[argc];
+        for(j=1; j<argc; j++){
+          pElem->markup.argv[j-1] = zBuf;
+          zBuf += arglen[j] + 1;
+          sprintf(pElem->markup.argv[j-1],"%.*s",arglen[j],argv[j]);
+          HtmlTranslateEscapes(pElem->markup.argv[j-1]);
+          if( (j&1)==1 ){
+            ToLower(pElem->markup.argv[j-1]);   /* Odd j: an attribute name */
+          }
+        }
+        pElem->markup.argv[argc-1] = 0;
+      }
+
+      /* The new markup has now be constructed in pElem.  But before
+      ** appending to the list, check to see if there is a special
+      ** handler for this markup type.
+      */
+      if( p->zHandler[pMap->type] ){
+        Tcl_DString str;
+        Tcl_DStringInit(&str);
+        Tcl_DStringAppend(&str, p->zHandler[pMap->type], -1);
+        Tcl_DStringAppendElement(&str, pMap->zName);
+        Tcl_DStringStartSublist(&str);
+        for(j=0; j<argc-1; j++){
+          Tcl_DStringAppendElement(&str, pElem->markup.argv[j]);
+        }
+        Tcl_DStringEndSublist(&str);
+        HtmlFree(pElem);
+        HtmlLock(p);
+        Tcl_GlobalEval(p->interp, Tcl_DStringValue(&str));
+        Tcl_DStringFree(&str);
+        if( HtmlUnlock(p) ){ return 0; }
+
+        /* Tricky, tricky.  The callback might have caused the p->zText
+        ** pointer to change, so renew our copy of that pointer.  The
+        ** callback might also have cleared or destroyed the widget.
+        ** If so, abort this routine.
+        */
+        z = p->zText;
+        if( z==0 || p->tkwin==0 ){
+          n = 0;
+          iCol = 0;
+          goto incomplete;
+        }
+        continue;
+      }
+
+      /* No special handler for this markup.  Just append it to the
+      ** list of all tokens.
+      */
+      AppendElement(p,pElem);
+      switch( pMap->type ){
+        case Html_PLAINTEXT:
+        case Html_LISTING:
+        case Html_XMP:
+        case Html_TEXTAREA:
+          p->iPlaintext = pMap->type;
+          break;
+        case Html_STYLE:
+        case Html_SCRIPT:
+          p->pScript = (HtmlScript*)pElem;
+          break;
+        default:
+          break;
+      }
+
+      /* If this is self-closing markup (ex: <br/> or <img/>) then
+      ** synthesize a closing token.
+      */
+      if( selfClose && argv[0][0]!='/' 
+      && strcmp(&pMap[1].zName[1],pMap->zName)==0 ){
+        selfClose = 0;
+        pMap++;
+        argc = 1;
+        goto makeMarkupEntry;
+      }
+    }
+  }
+incomplete:
+  p->iCol = iCol;
+  return n;
+}
+/************************** End HTML Tokenizer Code ***************************/
+
+/*
+** Append text to the tokenizer engine, growing htmlPtr->zText as needed,
+** then tokenize as much of the buffered text as possible.
+**
+** This routine (actually the Tokenize() subroutine that is called
+** by this routine) may invoke a callback procedure which could delete
+** the HTML widget. 
+*/
+void HtmlTokenizerAppend(HtmlWidget *htmlPtr, const char *zText){
+  int len = strlen(zText);
+  if( htmlPtr->nText==0 ){
+    htmlPtr->nAlloc = len + 100;          /* First text: fresh buffer with slack */
+    htmlPtr->zText = HtmlAlloc( htmlPtr->nAlloc );
+    TestPoint(0);
+  }else if( htmlPtr->nText + len >= htmlPtr->nAlloc ){
+    htmlPtr->nAlloc += len + 100;         /* Not enough room for text plus NUL */
+    htmlPtr->zText = HtmlRealloc( htmlPtr->zText, htmlPtr->nAlloc );
+    TestPoint(0);
+  }
+  if( htmlPtr->zText==0 ){
+    htmlPtr->nText = 0;                   /* Allocation failed: drop all text */
+    UNTESTED;
+    return;
+  }
+  strcpy(&htmlPtr->zText[htmlPtr->nText], zText);
+  htmlPtr->nText += len;
+  htmlPtr->nComplete = Tokenize(htmlPtr);
+}
+
+/*
+** This routine takes a text representation of a token, converts
+** it into an HtmlElement structure and inserts it immediately 
+** prior to pToken.  If pToken==0, then the newly created HtmlElement
+** is appended.
+**
+** This routine does nothing to resize, restyle, relayout or redisplay
+** the HTML.  That is the calling routines responsibility.
+**
+** Return 0 if successful.  Return non-zero if zType is not a known
+** markup name, if zArgs is not a valid Tcl list, or if allocation fails.
+*/
+int HtmlInsertToken(
+  HtmlWidget *htmlPtr,     /* The widget into which the token is inserted */
+  HtmlElement *pToken,     /* Insert before this.  Append if pToken==0 */
+  char *zType,             /* Type of markup.  Ex: "/a" or "table" */
+  char *zArgs              /* List of arguments */
+){
+  HtmlTokenMap *pMap;     /* For searching the markup name hash table */
+  int h;                   /* The hash on zType */
+  HtmlElement *pElem;      /* The new element */
+  int nByte;               /* How many bytes to allocate */
+  int i;                   /* Loop counter */
+
+  if( !isInit ){
+    HtmlHashInit();
+    isInit = 1;
+    TestPoint(0);
+  }else{
+    TestPoint(0);
+  }
+  h = HtmlHash(zType);
+  for(pMap = apMap[h]; pMap; pMap=pMap->pCollide){
+    if( stricmp(pMap->zName,zType)==0 ){ TestPoint(0); break; }
+    TestPoint(0);
+  }
+  if( pMap==0 ){ TestPoint(0); return 1; }
+
+  if( zArgs==0 || *zArgs==0 ){
+    /* Special case of no arguments.  This is a lot easier... */
+    nByte = pMap->extra ? pMap->extra : sizeof(HtmlBaseElement);
+    nByte += strlen(zType);
+    pElem = HtmlAlloc( nByte );
+    if( pElem==0 ){ TestPoint(0); return 1; }
+    memset(pElem,0,nByte);
+    pElem->base.type = pMap->type;
+    TestPoint(0);
+  }else{
+    /* The general case.  There are arguments that need to be parsed
+    ** up.  This is slower, but we gotta do it.
+    */
+    int argc;
+    const char **argv;
+    char *zBuf;
+
+    if( Tcl_SplitList(htmlPtr->interp, zArgs, &argc, (const char***)&argv)!=TCL_OK ){
+      TestPoint(0);
+      return 1;
+    }
+    if( pMap->extra ){
+      nByte = pMap->extra;
+      TestPoint(0);
+    }else{
+      nByte = sizeof(HtmlMarkupElement);
+      TestPoint(0);
+    }
+    nByte += sizeof(char*)*(argc+1) + strlen(zArgs) + argc + 2;
+    pElem = HtmlAlloc( nByte );
+    if( pElem==0 ){
+      HtmlFree(argv);
+      TestPoint(0);
+      return 1;
+    }
+    memset(pElem,0,nByte);
+    pElem->base.type = pMap->type;
+    pElem->base.count = argc;
+    if( pMap->extra ){
+      pElem->markup.argv = (char**)&((char*)pElem)[pMap->extra];
+      TestPoint(0);
+    }else{
+      pElem->markup.argv = (char**)&((HtmlMarkupElement*)pElem)[1];
+      TestPoint(0);
+    }
+    zBuf = (char*)&pElem->markup.argv[argc+1];  /* Strings follow argc+1 pointers */
+    for(i=0; i<argc; i++){     /* Copy every list element, including the first */
+      pElem->markup.argv[i] = zBuf;
+      zBuf += strlen(argv[i]) + 1;
+      strcpy(pElem->markup.argv[i],argv[i]);
+      TestPoint(0);
+    }
+    pElem->markup.argv[argc] = 0;   /* Terminator sits after the base.count entries */
+    HtmlFree(argv);
+    TestPoint(0);
+  }
+  if( pToken ){
+    pElem->base.pNext = pToken;
+    pElem->base.pPrev = pToken->base.pPrev;
+    if( pToken->base.pPrev ){
+      pToken->base.pPrev->pNext = pElem;
+      TestPoint(0);
+    }else{
+      htmlPtr->pFirst = pElem;
+      TestPoint(0);
+    }
+    pToken->base.pPrev = pElem;
+    htmlPtr->nToken++;
+  }else{
+    AppendElement(htmlPtr,pElem);
+    TestPoint(0);
+  }
+  return 0;
+}
+
+/*
+** Convert a markup name into a type integer (Html_Unknown if not found)
+*/
+int HtmlNameToType(const char *zType){
+  HtmlTokenMap *pMap;     /* For searching the markup name hash table */
+  int h;                   /* The hash on zType */
+
+  if( !isInit ){           /* Build the hash table on first use */
+    HtmlHashInit();
+    isInit = 1;
+    TestPoint(0);
+  }else{
+    TestPoint(0);
+  }
+  h = HtmlHash(zType);
+  for(pMap = apMap[h]; pMap; pMap=pMap->pCollide){   /* Walk the collision chain */
+    if( stricmp(pMap->zName,zType)==0 ){ TestPoint(0); break; }
+    TestPoint(0);
+  }
+  return pMap ? pMap->type : Html_Unknown;
+}
+
+/*
+** Convert a type into a symbolic name ("???" if type is out of range).
+*/
+const char *HtmlTypeToName(int type){
+  if( type>=Html_A && type<=Html_EndXMP ){
+    HtmlTokenMap *pMap = &HtmlMarkupMap[type - Html_A];  /* type-ordered table, not hash apMap[]; assumes entry 0 is Html_A -- TODO confirm */
+    TestPoint(0);
+    return pMap->zName;
+  }else{
+    TestPoint(0);
+    return "???";
+  }
+}
+
+/*
+** For debugging: format a token into a static buffer (returns 0 unless DEBUG).
+*/
+char *HtmlTokenName(HtmlElement *p){
+#ifdef DEBUG
+  static char zBuf[200];   /* Result buffer, reused by every call */
+  int j;
+  char *zName;
+
+  if( p==0 ) return "NULL";
+  switch( p->base.type ){
+  case Html_Text:
+    sprintf(zBuf,"\"%.*s\"",(int)(p->base.count>150 ? 150 : p->base.count),p->text.zText);
+    break;
+  case Html_Space:
+    if( p->base.flags & HTML_NewLine ){
+      sprintf(zBuf,"\"\\n\"");
+    }else{
+      sprintf(zBuf,"\" \"");
+    }
+    break;
+  case Html_Block:
+    if( p->block.n>0 ){
+      int n = p->block.n;
+      if( n>150 ) n = 150;
+      sprintf(zBuf,"<Block z=\"%.*s\">", n, p->block.z);
+    }else{
+      sprintf(zBuf,"<Block>");
+    }
+    break;
+  default:
+    if( p->base.type >= HtmlMarkupMap[0].type 
+    && p->base.type <= HtmlMarkupMap[HTML_MARKUP_COUNT-1].type ){
+      zName = HtmlMarkupMap[p->base.type - HtmlMarkupMap[0].type].zName;
+    }else{
+      zName = "Unknown";
+    }
+    sprintf(zBuf,"<%s",zName);
+    for(j=1; j<p->base.count && strlen(zBuf)<100; j += 2){  /* stop before zBuf fills */
+      sprintf(&zBuf[strlen(zBuf)]," %.40s=%.40s",
+              p->markup.argv[j-1],p->markup.argv[j]);
+    }
+    strcat(zBuf,">");
+    break;
+  }
+  return zBuf;
+#else
+  return 0;
+#endif
+}
+
+/*
+** Set the interp result to a Tcl list of the tokens from p up to (not including) pEnd.
+*/
+void HtmlTclizeList(Tcl_Interp *interp, HtmlElement *p, HtmlElement *pEnd){
+  Tcl_DString str;      /* The list being built */
+  int i;
+  char *zName;
+  char zLine[100];      /* Scratch space for formatting a Space token */
+
+  Tcl_DStringInit(&str);
+  while( p && p!=pEnd ){
+    switch( p->base.type ){
+      case Html_Block:
+        break;                          /* Block tokens are left out of the list */
+      case Html_Text:
+        Tcl_DStringStartSublist(&str);
+        Tcl_DStringAppendElement(&str,"Text");
+        Tcl_DStringAppendElement(&str, p->text.zText);
+        Tcl_DStringEndSublist(&str);
+        break;
+      case Html_Space:
+        sprintf(zLine,"Space %d %d",
+          p->base.count, (p->base.flags & HTML_NewLine)!=0);
+        Tcl_DStringAppendElement(&str,zLine);
+        break;
+      case Html_Unknown:
+        Tcl_DStringAppendElement(&str,"Unknown");
+        break;
+      default:
+        Tcl_DStringStartSublist(&str);
+        Tcl_DStringAppendElement(&str,"Markup");
+        if( p->base.type >= HtmlMarkupMap[0].type 
+         && p->base.type <= HtmlMarkupMap[HTML_MARKUP_COUNT-1].type ){
+          zName = HtmlMarkupMap[p->base.type - HtmlMarkupMap[0].type].zName;
+        }else{
+          zName = "Unknown";
+        }
+        Tcl_DStringAppendElement(&str, zName);
+        for(i=0; i<p->base.count; i++){
+          Tcl_DStringAppendElement(&str, p->markup.argv[i]);
+        }
+        Tcl_DStringEndSublist(&str);
+        break;
+    }
+    p = p->pNext;
+  }
+  Tcl_DStringResult(interp, &str);
+}
+
+/*
+** Print a list of tokens, from p up to (not including) pEnd, on stdout
+*/
+#ifdef DEBUG
+void HtmlPrintList(HtmlElement *p, HtmlElement *pEnd){
+  while( p && p!=pEnd ){
+    if( p->base.type==Html_Block ){
+      char *z = p->block.z;
+      int n = p->block.n;
+      if( n==0 || z==0 ){
+        n = 1;
+        z = "";
+      }
+      printf("Block %p flags=%02x cnt=%d x=%d..%d y=%d..%d z=\"%.*s\"\n",
+        (void*)p, p->base.flags, p->base.count, p->block.left, p->block.right,
+        p->block.top, p->block.bottom, n, z);   /* %p: an int cast would truncate */
+    }else{
+      printf("Token %p font=%2d color=%2d align=%d flags=0x%04x name=%s\n",
+        (void*)p, p->base.style.font, p->base.style.color,
+        p->base.style.align, p->base.style.flags, HtmlTokenName(p));
+    }
+    p = p->pNext;
+  }
+}
+#endif
author	William Joye <wjoye@cfa.harvard.edu>	2016-10-17 15:25:29 (GMT)
committer	William Joye <wjoye@cfa.harvard.edu>	2016-10-17 15:25:29 (GMT)
commit	a62a72569befa24bccfc2ef198be80f3cb1e94e9 (patch)
tree	5b80188ca040ec1942d235792ffc4905cd2ecf27 /tkhtml1/src/htmlparse.c
parent	9967eb08e8dd098ffec7e70fa72549e5f7dc1e42 (diff)
parent	333069975324629e46636ca439dc7edb838449a3 (diff)
download	blt-a62a72569befa24bccfc2ef198be80f3cb1e94e9.zip blt-a62a72569befa24bccfc2ef198be80f3cb1e94e9.tar.gz blt-a62a72569befa24bccfc2ef198be80f3cb1e94e9.tar.bz2