diff options
author | William Joye <wjoye@cfa.harvard.edu> | 2019-01-02 20:21:27 (GMT) |
---|---|---|
committer | William Joye <wjoye@cfa.harvard.edu> | 2019-01-02 20:21:27 (GMT) |
commit | 62d64d9d13ee541e11854265c2817e540f50b59a (patch) | |
tree | 0b07fb95ebdc8de3e700226b1210a34c98759748 /tclxml/tclxml-tcl | |
parent | 8f5c6286538d2f0c762edeacf21f85eeeed1fe0b (diff) | |
parent | 73444843b18bba4a07922722b11ac3a7fe32a11b (diff) | |
download | blt-62d64d9d13ee541e11854265c2817e540f50b59a.zip blt-62d64d9d13ee541e11854265c2817e540f50b59a.tar.gz blt-62d64d9d13ee541e11854265c2817e540f50b59a.tar.bz2 |
Merge commit '73444843b18bba4a07922722b11ac3a7fe32a11b' as 'tclxml'
Diffstat (limited to 'tclxml/tclxml-tcl')
-rwxr-xr-x | tclxml/tclxml-tcl/sgml-8.0.tcl | 143 | ||||
-rwxr-xr-x | tclxml/tclxml-tcl/sgml-8.1.tcl | 143 | ||||
-rwxr-xr-x | tclxml/tclxml-tcl/sgmlparser.tcl | 2814 | ||||
-rwxr-xr-x | tclxml/tclxml-tcl/tclparser-8.0.tcl | 359 | ||||
-rwxr-xr-x | tclxml/tclxml-tcl/tclparser-8.1.tcl | 614 | ||||
-rwxr-xr-x | tclxml/tclxml-tcl/xml-8.0.tcl | 92 | ||||
-rwxr-xr-x | tclxml/tclxml-tcl/xml-8.1.tcl | 135 | ||||
-rw-r--r-- | tclxml/tclxml-tcl/xml__tcl.tcl | 272 | ||||
-rw-r--r-- | tclxml/tclxml-tcl/xmldep.tcl | 179 | ||||
-rw-r--r-- | tclxml/tclxml-tcl/xpath.tcl | 362 |
10 files changed, 5113 insertions, 0 deletions
diff --git a/tclxml/tclxml-tcl/sgml-8.0.tcl b/tclxml/tclxml-tcl/sgml-8.0.tcl new file mode 100755 index 0000000..f1179cf --- /dev/null +++ b/tclxml/tclxml-tcl/sgml-8.0.tcl @@ -0,0 +1,143 @@ +# sgml-8.0.tcl -- +# +# This file provides generic parsing services for SGML-based +# languages, namely HTML and XML. +# This file supports Tcl 8.0 characters and regular expressions. +# +# NB. It is a misnomer. There is no support for parsing +# arbitrary SGML as such. +# +# Copyright (c) 1998,1999 Zveno Pty Ltd +# http://www.zveno.com/ +# +# See the file "LICENSE" in this distribution for information on usage and +# redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES. +# +# $Id: sgml-8.0.tcl,v 1.1.1.1 2009/01/16 22:11:49 joye Exp $ + +package require -exact Tcl 8.0 + +package provide sgml 1.9 + +namespace eval sgml { + + # Convenience routine + proc cl x { + return "\[$x\]" + } + + # Define various regular expressions + + # Character classes + variable Char \t\n\r\ -\xFF + variable BaseChar A-Za-z + variable Letter $BaseChar + variable Digit 0-9 + variable CombiningChar {} + variable Extender {} + variable Ideographic {} + + # white space + variable Wsp " \t\r\n" + variable noWsp [cl ^$Wsp] + + # Various XML names + variable NameChar \[-$Letter$Digit._:$CombiningChar$Extender\] + variable Name \[_:$BaseChar$Ideographic\]$NameChar* + variable Names ${Name}(?:$Wsp$Name)* + variable Nmtoken $NameChar+ + variable Nmtokens ${Nmtoken}(?:$Wsp$Nmtoken)* + + # table of predefined entities for XML + + variable EntityPredef + array set EntityPredef { + lt < gt > amp & quot \" apos ' + } + +} + +# These regular expressions are defined here once for better performance + +namespace eval sgml { + variable Wsp + + # Watch out for case-sensitivity + + set attlist_exp [cl $Wsp]*([cl ^$Wsp]+)[cl $Wsp]*([cl ^$Wsp]+)[cl $Wsp]*(#REQUIRED|#IMPLIED) + set attlist_enum_exp [cl $Wsp]*([cl ^$Wsp]+)[cl $Wsp]*\\(([cl ^)]*)\\)[cl $Wsp]*("([cl ^")])")? ;# " + set attlist_fixed_exp [cl $Wsp]*([cl ^$Wsp]+)[cl $Wsp]*([cl ^$Wsp]+)[cl $Wsp]*(#FIXED)[cl $Wsp]*([cl ^$Wsp]+) + + set param_entity_exp [cl $Wsp]*([cl ^$Wsp]+)[cl $Wsp]*([cl ^"$Wsp]*)[cl $Wsp]*"([cl ^"]*)" + + set notation_exp [cl $Wsp]*([cl ^$Wsp]+)[cl $Wsp]*(.*) + +} + +### Utility procedures + +# sgml::noop -- +# +# A do-nothing proc +# +# Arguments: +# args arguments +# +# Results: +# Nothing. + +proc sgml::noop args { + return 0 +} + +# sgml::identity -- +# +# Identity function. +# +# Arguments: +# a arbitrary argument +# +# Results: +# $a + +proc sgml::identity a { + return $a +} + +# sgml::Error -- +# +# Throw an error +# +# Arguments: +# args arguments +# +# Results: +# Error return condition. + +proc sgml::Error args { + uplevel return -code error [list $args] +} + +### Following procedures are based on html_library + +# sgml::zapWhite -- +# +# Convert multiple white space into a single space. +# +# Arguments: +# data plain text +# +# Results: +# As above + +proc sgml::zapWhite data { + regsub -all "\[ \t\r\n\]+" $data { } data + return $data +} + +proc sgml::Boolean value { + regsub {1|true|yes|on} $value 1 value + regsub {0|false|no|off} $value 0 value + return $value +} + diff --git a/tclxml/tclxml-tcl/sgml-8.1.tcl b/tclxml/tclxml-tcl/sgml-8.1.tcl new file mode 100755 index 0000000..60748bb --- /dev/null +++ b/tclxml/tclxml-tcl/sgml-8.1.tcl @@ -0,0 +1,143 @@ +# sgml-8.1.tcl -- +# +# This file provides generic parsing services for SGML-based +# languages, namely HTML and XML. +# This file supports Tcl 8.1 characters and regular expressions. +# +# NB. It is a misnomer. There is no support for parsing +# arbitrary SGML as such. +# +# Copyright (c) 1998-2003 Zveno Pty Ltd +# http://www.zveno.com/ +# +# See the file "LICENSE" in this distribution for information on usage and +# redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES. +# +# $Id: sgml-8.1.tcl,v 1.1.1.1 2009/01/16 22:11:49 joye Exp $ + +package require Tcl 8.1 + +package provide sgml 1.9 + +namespace eval sgml { + + # Convenience routine + proc cl x { + return "\[$x\]" + } + + # Define various regular expressions + + # Character classes + variable Char \t\n\r\ -\uD7FF\uE000-\uFFFD\u10000-\u10FFFF + variable BaseChar \u0041-\u005A\u0061-\u007A\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\u0100-\u0131\u0134-\u013E\u0141-\u0148\u014A-\u017E\u0180-\u01C3\u01CD-\u01F0\u01F4-\u01F5\u01FA-\u0217\u0250-\u02A8\u02BB-\u02C1\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D6\u03DA\u03DC\u03DE\u03E0\u03E2-\u03F3\u0401-\u040C\u040E-\u044F\u0451-\u045C\u045E-\u0481\u0490-\u04C4\u04C7-\u04C8\u04CB-\u04CC\u04D0-\u04EB\u04EE-\u04F5\u04F8-\u04F9\u0531-\u0556\u0559\u0561-\u0586\u05D0-\u05EA\u05F0-\u05F2\u0621-\u063A\u0641-\u064A\u0671-\u06B7\u06BA-\u06BE\u06C0-\u06CE\u06D0-\u06D3\u06D5\u06E5-\u06E6\u0905-\u0939\u093D\u0958-\u0961\u0985-\u098C\u098F-\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09DC-\u09DD\u09DF-\u09E1\u09F0-\u09F1\u0A05-\u0A0A\u0A0F-\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32-\u0A33\u0A35-\u0A36\u0A38-\u0A39\u0A59-\u0A5C\u0A5E\u0A72-\u0A74\u0A85-\u0A8B\u0A8D\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2-\u0AB3\u0AB5-\u0AB9\u0ABD\u0AE0\u0B05-\u0B0C\u0B0F-\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32-\u0B33\u0B36-\u0B39\u0B3D\u0B5C-\u0B5D\u0B5F-\u0B61\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99-\u0B9A\u0B9C\u0B9E-\u0B9F\u0BA3-\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB5\u0BB7-\u0BB9\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C33\u0C35-\u0C39\u0C60-\u0C61\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CDE\u0CE0-\u0CE1\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D28\u0D2A-\u0D39\u0D60-\u0D61\u0E01-\u0E2E\u0E30\u0E32-\u0E33\u0E40-\u0E45\u0E81-\u0E82\u0E84\u0E87-\u0E88\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F\u0EA1-\u0EA3\u0EA5\u0EA7\u0EAA-\u0EAB\u0EAD-\u0EAE\u0EB0\u0EB2-\u0EB3\u0EBD\u0EC0-\u0EC4\u0F40-\u0F47\u0F49-\u0F69\u10A0-\u10C5\u10D0-\u10F6\u1100\u1102-\u1103\u1105-\u1107\u1109\u110B-\u110C\u110E-\u1112\u113C\u113E\u1140\u114C\u114E\u1150\u1154-\u1155\u1159\u115F-\u1161\u1163\u1165\u1167\u1169\u116D-\u116E\u1172-\u1173\u1175\u119E\u11A8\u11AB\u11AE-\u11AF\u11B7-\u11B8\u11BA\u11BC-\u11C2\u11EB\u11F0\u11F9\u1E00-\u1E9B\u1EA0-\u1EF9\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126\u212A-\u212B\u212E\u2180-\u2182\u3041-\u3094\u30A1-\u30FA\u3105-\u312C\uAC00-\uD7A3 + variable Ideographic \u4E00-\u9FA5\u3007\u3021-\u3029 + variable CombiningChar \u0300-\u0345\u0360-\u0361\u0483-\u0486\u0591-\u05A1\u05A3-\u05B9\u05BB-\u05BD\u05BF\u05C1-\u05C2\u05C4\u064B-\u0652\u0670\u06D6-\u06DC\u06DD-\u06DF\u06E0-\u06E4\u06E7-\u06E8\u06EA-\u06ED\u0901-\u0903\u093C\u093E-\u094C\u094D\u0951-\u0954\u0962-\u0963\u0981-\u0983\u09BC\u09BE\u09BF\u09C0-\u09C4\u09C7-\u09C8\u09CB-\u09CD\u09D7\u09E2-\u09E3\u0A02\u0A3C\u0A3E\u0A3F\u0A40-\u0A42\u0A47-\u0A48\u0A4B-\u0A4D\u0A70-\u0A71\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD\u0B01-\u0B03\u0B3C\u0B3E-\u0B43\u0B47-\u0B48\u0B4B-\u0B4D\u0B56-\u0B57\u0B82-\u0B83\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7\u0C01-\u0C03\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55-\u0C56\u0C82-\u0C83\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5-\u0CD6\u0D02-\u0D03\u0D3E-\u0D43\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0E31\u0E34-\u0E3A\u0E47-\u0E4E\u0EB1\u0EB4-\u0EB9\u0EBB-\u0EBC\u0EC8-\u0ECD\u0F18-\u0F19\u0F35\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86-\u0F8B\u0F90-\u0F95\u0F97\u0F99-\u0FAD\u0FB1-\u0FB7\u0FB9\u20D0-\u20DC\u20E1\u302A-\u302F\u3099\u309A + variable Digit \u0030-\u0039\u0660-\u0669\u06F0-\u06F9\u0966-\u096F\u09E6-\u09EF\u0A66-\u0A6F\u0AE6-\u0AEF\u0B66-\u0B6F\u0BE7-\u0BEF\u0C66-\u0C6F\u0CE6-\u0CEF\u0D66-\u0D6F\u0E50-\u0E59\u0ED0-\u0ED9\u0F20-\u0F29 + variable Extender \u00B7\u02D0\u02D1\u0387\u0640\u0E46\u0EC6\u3005\u3031-\u3035\u309D-\u309E\u30FC-\u30FE + variable Letter $BaseChar|$Ideographic + + # white space + variable Wsp " \t\r\n" + variable noWsp [cl ^$Wsp] + + # Various XML names + variable NameChar \[-$Letter$Digit._:$CombiningChar$Extender\] + variable Name \[_:$BaseChar$Ideographic\]$NameChar* + variable Names ${Name}(?:$Wsp$Name)* + variable Nmtoken $NameChar+ + variable Nmtokens ${Nmtoken}(?:$Wsp$Nmtoken)* + + # table of predefined entities for XML + + variable EntityPredef + array set EntityPredef { + lt < gt > amp & quot \" apos ' + } + +} + +# These regular expressions are defined here once for better performance + +namespace eval sgml { + variable Wsp + + # Watch out for case-sensitivity + + set attlist_exp [cl $Wsp]*([cl ^$Wsp]+)[cl $Wsp]*([cl ^$Wsp]+)[cl $Wsp]*(#REQUIRED|#IMPLIED) + set attlist_enum_exp [cl $Wsp]*([cl ^$Wsp]+)[cl $Wsp]*\\(([cl ^)]*)\\)[cl $Wsp]*("([cl ^")]*)")? ;# " + set attlist_fixed_exp [cl $Wsp]*([cl ^$Wsp]+)[cl $Wsp]*([cl ^$Wsp]+)[cl $Wsp]*(#FIXED)[cl $Wsp]*([cl ^$Wsp]+) + + set param_entity_exp [cl $Wsp]*([cl ^$Wsp]+)[cl $Wsp]*([cl ^"$Wsp]*)[cl $Wsp]*"([cl ^"]*)" + + set notation_exp [cl $Wsp]*([cl ^$Wsp]+)[cl $Wsp]*(.*) + +} + +### Utility procedures + +# sgml::noop -- +# +# A do-nothing proc +# +# Arguments: +# args arguments +# +# Results: +# Nothing. + +proc sgml::noop args { + return 0 +} + +# sgml::identity -- +# +# Identity function. +# +# Arguments: +# a arbitrary argument +# +# Results: +# $a + +proc sgml::identity a { + return $a +} + +# sgml::Error -- +# +# Throw an error +# +# Arguments: +# args arguments +# +# Results: +# Error return condition. + +proc sgml::Error args { + uplevel return -code error [list $args] +} + +### Following procedures are based on html_library + +# sgml::zapWhite -- +# +# Convert multiple white space into a single space. +# +# Arguments: +# data plain text +# +# Results: +# As above + +proc sgml::zapWhite data { + regsub -all "\[ \t\r\n\]+" $data { } data + return $data +} + +proc sgml::Boolean value { + regsub {1|true|yes|on} $value 1 value + regsub {0|false|no|off} $value 0 value + return $value +} + diff --git a/tclxml/tclxml-tcl/sgmlparser.tcl b/tclxml/tclxml-tcl/sgmlparser.tcl new file mode 100755 index 0000000..2677a44 --- /dev/null +++ b/tclxml/tclxml-tcl/sgmlparser.tcl @@ -0,0 +1,2814 @@ +# sgmlparser.tcl -- +# +# This file provides the generic part of a parser for SGML-based +# languages, namely HTML and XML. +# +# NB. It is a misnomer. There is no support for parsing +# arbitrary SGML as such. +# +# See sgml.tcl for variable definitions. +# +# Copyright (c) 2008 Explain +# http://www.explain.com.au/ +# Copyright (c) 1998-2003 Zveno Pty Ltd +# http://www.zveno.com/ +# +# See the file "LICENSE" in this distribution for information on usage and +# redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES. +# +# $Id: sgmlparser.tcl,v 1.1.1.1 2009/01/16 22:11:49 joye Exp $ + +package require sgml 1.9 + +package require uri 1.1 + +package provide sgmlparser 1.1 + +namespace eval sgml { + namespace export tokenise parseEvent + + namespace export parseDTD + + # NB. Most namespace variables are defined in sgml-8.[01].tcl + # to account for differences between versions of Tcl. + # This especially includes the regular expressions used. + + variable ParseEventNum + if {![info exists ParseEventNum]} { + set ParseEventNum 0 + } + variable ParseDTDnum + if {![info exists ParseDTDNum]} { + set ParseDTDNum 0 + } + + variable declExpr [cl $::sgml::Wsp]*([cl ^$::sgml::Wsp]+)[cl $::sgml::Wsp]*([cl ^>]*) + variable EntityExpr [cl $::sgml::Wsp]*(%[cl $::sgml::Wsp])?[cl $::sgml::Wsp]*($::sgml::Name)[cl $::sgml::Wsp]+(.*) + + #variable MarkupDeclExpr <([cl ^$::sgml::Wsp>]+)[cl $::sgml::Wsp]*([cl ^$::sgml::Wsp]+)[cl $::sgml::Wsp]*([cl ^>]*)> + #variable MarkupDeclSub "} {\\1} {\\2} {\\3} {" + variable MarkupDeclExpr <[cl $::sgml::Wsp]*([cl ^$::sgml::Wsp>]+)[cl $::sgml::Wsp]*([cl ^>]*)> + variable MarkupDeclSub "\} {\\1} {\\2} \{" + + variable ExternalEntityExpr ^(PUBLIC|SYSTEM)[cl $::sgml::Wsp]+("|')(.*?)\\2([cl $::sgml::Wsp]+("|')(.*?)\\2)?([cl $::sgml::Wsp]+NDATA[cl $::sgml::Wsp]+($::xml::Name))?\$ + + variable StdOptions + array set StdOptions [list \ + -elementstartcommand [namespace current]::noop \ + -elementendcommand [namespace current]::noop \ + -characterdatacommand [namespace current]::noop \ + -processinginstructioncommand [namespace current]::noop \ + -externalentitycommand {} \ + -xmldeclcommand [namespace current]::noop \ + -doctypecommand [namespace current]::noop \ + -commentcommand [namespace current]::noop \ + -entitydeclcommand [namespace current]::noop \ + -unparsedentitydeclcommand [namespace current]::noop \ + -parameterentitydeclcommand [namespace current]::noop \ + -notationdeclcommand [namespace current]::noop \ + -elementdeclcommand [namespace current]::noop \ + -attlistdeclcommand [namespace current]::noop \ + -paramentityparsing 1 \ + -defaultexpandinternalentities 1 \ + -startdoctypedeclcommand [namespace current]::noop \ + -enddoctypedeclcommand [namespace current]::noop \ + -entityreferencecommand {} \ + -warningcommand [namespace current]::noop \ + -errorcommand [namespace current]::Error \ + -final 1 \ + -validate 0 \ + -baseuri {} \ + -name {} \ + -cmd {} \ + -emptyelement [namespace current]::EmptyElement \ + -parseattributelistcommand [namespace current]::noop \ + -parseentitydeclcommand [namespace current]::noop \ + -normalize 1 \ + -internaldtd {} \ + -reportempty 0 \ + -ignorewhitespace 0 \ + ] +} + +# sgml::tokenise -- +# +# Transform the given HTML/XML text into a Tcl list. +# +# Arguments: +# sgml text to tokenize +# elemExpr RE to recognise tags +# elemSub transform for matched tags +# args options +# +# Valid Options: +# -internaldtdvariable +# -final boolean True if no more data is to be supplied +# -statevariable varName Name of a variable used to store info +# +# Results: +# Returns a Tcl list representing the document. + +proc sgml::tokenise {sgml elemExpr elemSub args} { + array set options {-final 1} + array set options $args + set options(-final) [Boolean $options(-final)] + + # If the data is not final then there must be a variable to store + # unused data. + if {!$options(-final) && ![info exists options(-statevariable)]} { + return -code error {option "-statevariable" required if not final} + } + + # Pre-process stage + # + # Extract the internal DTD subset, if any + + catch {upvar #0 $options(-internaldtdvariable) dtd} + if {[regexp {<!DOCTYPE[^[<]+\[([^]]+)\]} $sgml discard dtd]} { + regsub {(<!DOCTYPE[^[<]+)(\[[^]]+\])} $sgml {\1\&xml:intdtd;} sgml + } + + # Protect Tcl special characters + regsub -all {([{}\\])} $sgml {\\\1} sgml + + # Do the translation + + if {[info exists options(-statevariable)]} { + # Mats: Several rewrites here to handle -final 0 option. + # If any cached unparsed xml (state(leftover)), prepend it. + upvar #0 $options(-statevariable) state + if {[string length $state(leftover)]} { + regsub -all $elemExpr $state(leftover)$sgml $elemSub sgml + set state(leftover) {} + } else { + regsub -all $elemExpr $sgml $elemSub sgml + } + set sgml "{} {} {} \{$sgml\}" + + # Performance note (Tcl 8.0): + # Use of lindex, lreplace will cause parsing to list object + + # This RE only fixes chopped inside tags, not chopped text. + if {[regexp {^([^<]*)(<[^>]*$)} [lindex $sgml end] x text rest]} { + set sgml [lreplace $sgml end end $text] + # Mats: unmatched stuff means that it is chopped off. Cache it for next round. + set state(leftover) $rest + } + + # Patch from bug report #596959, Marshall Rose + if {[string compare [lindex $sgml 4] ""]} { + set sgml [linsert $sgml 0 {} {} {} {} {}] + } + + } else { + + # Performance note (Tcl 8.0): + # In this case, no conversion to list object is performed + + # Mats: This fails if not -final and $sgml is chopped off right in a tag. + regsub -all $elemExpr $sgml $elemSub sgml + set sgml "{} {} {} \{$sgml\}" + } + + return $sgml + +} + +# sgml::parseEvent -- +# +# Produces an event stream for a XML/HTML document, +# given the Tcl list format returned by tokenise. +# +# This procedure checks that the document is well-formed, +# and throws an error if the document is found to be not +# well formed. Warnings are passed via the -warningcommand script. +# +# The procedure only check for well-formedness, +# no DTD is required. However, facilities are provided for entity expansion. +# +# Arguments: +# sgml Instance data, as a Tcl list. +# args option/value pairs +# +# Valid Options: +# -final Indicates end of document data +# -validate Boolean to enable validation +# -baseuri URL for resolving relative URLs +# -elementstartcommand Called when an element starts +# -elementendcommand Called when an element ends +# -characterdatacommand Called when character data occurs +# -entityreferencecommand Called when an entity reference occurs +# -processinginstructioncommand Called when a PI occurs +# -externalentitycommand Called for an external entity reference +# +# -xmldeclcommand Called when the XML declaration occurs +# -doctypecommand Called when the document type declaration occurs +# -commentcommand Called when a comment occurs +# -entitydeclcommand Called when a parsed entity is declared +# -unparsedentitydeclcommand Called when an unparsed external entity is declared +# -parameterentitydeclcommand Called when a parameter entity is declared +# -notationdeclcommand Called when a notation is declared +# -elementdeclcommand Called when an element is declared +# -attlistdeclcommand Called when an attribute list is declared +# -paramentityparsing Boolean to enable/disable parameter entity substitution +# -defaultexpandinternalentities Boolean to enable/disable expansion of entities declared in internal DTD subset +# +# -startdoctypedeclcommand Called when the Doc Type declaration starts (see also -doctypecommand) +# -enddoctypedeclcommand Called when the Doc Type declaration ends (see also -doctypecommand) +# +# -errorcommand Script to evaluate for a fatal error +# -warningcommand Script to evaluate for a reportable warning +# -statevariable global state variable +# -normalize whether to normalize names +# -reportempty whether to include an indication of empty elements +# -ignorewhitespace whether to automatically strip whitespace +# +# Results: +# The various callback scripts are invoked. +# Returns empty string. +# +# BUGS: +# If command options are set to empty string then they should not be invoked. + +proc sgml::parseEvent {sgml args} { + variable Wsp + variable noWsp + variable Nmtoken + variable Name + variable ParseEventNum + variable StdOptions + + array set options [array get StdOptions] + catch {array set options $args} + + # Mats: + # If the data is not final then there must be a variable to persistently store the parse state. + if {!$options(-final) && ![info exists options(-statevariable)]} { + return -code error {option "-statevariable" required if not final} + } + + foreach {opt value} [array get options *command] { + if {[string compare $opt "-externalentitycommand"] && ![string length $value]} { + set options($opt) [namespace current]::noop + } + } + + if {![info exists options(-statevariable)]} { + set options(-statevariable) [namespace current]::ParseEvent[incr ParseEventNum] + } + if {![info exists options(entities)]} { + set options(entities) [namespace current]::Entities$ParseEventNum + array set $options(entities) [array get [namespace current]::EntityPredef] + } + if {![info exists options(extentities)]} { + set options(extentities) [namespace current]::ExtEntities$ParseEventNum + } + if {![info exists options(parameterentities)]} { + set options(parameterentities) [namespace current]::ParamEntities$ParseEventNum + } + if {![info exists options(externalparameterentities)]} { + set options(externalparameterentities) [namespace current]::ExtParamEntities$ParseEventNum + } + if {![info exists options(elementdecls)]} { + set options(elementdecls) [namespace current]::ElementDecls$ParseEventNum + } + if {![info exists options(attlistdecls)]} { + set options(attlistdecls) [namespace current]::AttListDecls$ParseEventNum + } + if {![info exists options(notationdecls)]} { + set options(notationdecls) [namespace current]::NotationDecls$ParseEventNum + } + if {![info exists options(namespaces)]} { + set options(namespaces) [namespace current]::Namespaces$ParseEventNum + } + + # For backward-compatibility + catch {set options(-baseuri) $options(-baseurl)} + + # Choose an external entity resolver + + if {![string length $options(-externalentitycommand)]} { + if {$options(-validate)} { + set options(-externalentitycommand) [namespace code ResolveEntity] + } else { + set options(-externalentitycommand) [namespace code noop] + } + } + + upvar #0 $options(-statevariable) state + upvar #0 $options(entities) entities + + # Mats: + # The problem is that the state is not maintained when -final 0 ! + # I've switched back to an older version here. + + if {![info exists state(line)]} { + # Initialise the state variable + array set state { + mode normal + haveXMLDecl 0 + haveDocElement 0 + inDTD 0 + context {} + stack {} + line 0 + defaultNS {} + defaultNSURI {} + } + } + + foreach {tag close param text} $sgml { + + # Keep track of lines in the input + incr state(line) [regsub -all \n $param {} discard] + incr state(line) [regsub -all \n $text {} discard] + + # If the current mode is cdata or comment then we must undo what the + # regsub has done to reconstitute the data + + set empty {} + switch $state(mode) { + comment { + # This had "[string length $param] && " as a guard - + # can't remember why :-( + if {[regexp ([cl ^-]*)--\$ $tag discard comm1]} { + # end of comment (in tag) + set tag {} + set close {} + set state(mode) normal + DeProtect1 $options(-commentcommand) $state(commentdata)<$comm1 + unset state(commentdata) + } elseif {[regexp ([cl ^-]*)--\$ $param discard comm1]} { + # end of comment (in attributes) + DeProtect1 $options(-commentcommand) $state(commentdata)<$close$tag>$comm1 + unset state(commentdata) + set tag {} + set param {} + set close {} + set state(mode) normal + } elseif {[regexp ([cl ^-]*)-->(.*) $text discard comm1 text]} { + # end of comment (in text) + DeProtect1 $options(-commentcommand) $state(commentdata)<$close$tag$param>$comm1 + unset state(commentdata) + set tag {} + set param {} + set close {} + set state(mode) normal + } else { + # comment continues + append state(commentdata) <$close$tag$param>$text + continue + } + } + cdata { + if {[string length $param] && [regexp ([cl ^\]]*)\]\][cl $Wsp]*\$ $tag discard cdata1]} { + # end of CDATA (in tag) + PCDATA [array get options] $state(cdata)<[subst -nocommand -novariable $close$cdata1] + set text [subst -novariable -nocommand $text] + set tag {} + unset state(cdata) + set state(mode) normal + } elseif {[regexp ([cl ^\]]*)\]\][cl $Wsp]*\$ $param discard cdata1]} { + # end of CDATA (in attributes) + PCDATA [array get options] $state(cdata)<[subst -nocommand -novariable $close$tag$cdata1] + set text [subst -novariable -nocommand $text] + set tag {} + set param {} + unset state(cdata) + set state(mode) normal + } elseif {[regexp (.*)\]\][cl $Wsp]*>(.*) $text discard cdata1 text]} { + # end of CDATA (in text) + PCDATA [array get options] $state(cdata)<[subst -nocommand -novariable $close$tag$param>$cdata1] + set text [subst -novariable -nocommand $text] + set tag {} + set param {} + set close {} + unset state(cdata) + set state(mode) normal + } else { + # CDATA continues + append state(cdata) [subst -nocommand -novariable <$close$tag$param>$text] + continue + } + } + continue { + # We're skipping elements looking for the close tag + switch -glob -- [string length $tag],[regexp {^\?|!.*} $tag],$close { + 0,* { + continue + } + *,0, { + if {![string compare $tag $state(continue:tag)]} { + set empty [uplevel #0 $options(-emptyelement) [list $tag $param $empty]] + if {![string length $empty]} { + incr state(continue:level) + } + } + continue + } + *,0,/ { + if {![string compare $tag $state(continue:tag)]} { + incr state(continue:level) -1 + } + if {!$state(continue:level)} { + unset state(continue:tag) + unset state(continue:level) + set state(mode) {} + } + } + default { + continue + } + } + } + default { + # The trailing slash on empty elements can't be automatically separated out + # in the RE, so we must do it here. + regexp (.*)(/)[cl $Wsp]*$ $param discard param empty + } + } + + # default: normal mode + + # Bug: if the attribute list has a right angle bracket then the empty + # element marker will not be seen + + set empty [uplevel #0 $options(-emptyelement) [list $tag $param $empty]] + + switch -glob -- [string length $tag],[regexp {^\?|!.*} $tag],$close,$empty { + + 0,0,, { + # Ignore empty tag - dealt with non-normal mode above + } + *,0,, { + + # Start tag for an element. + + # Check if the internal DTD entity is in an attribute value + regsub -all &xml:intdtd\; $param \[$options(-internaldtd)\] param + + set code [catch {ParseEvent:ElementOpen $tag $param [array get options]} msg] + set state(haveDocElement) 1 + switch $code { + 0 {# OK} + 3 { + # break + return {} + } + 4 { + # continue + # Remember this tag and look for its close + set state(continue:tag) $tag + set state(continue:level) 1 + set state(mode) continue + continue + } + default { + return -code $code -errorinfo $::errorInfo $msg + } + } + + } + + *,0,/, { + + # End tag for an element. + + set code [catch {ParseEvent:ElementClose $tag [array get options]} msg] + switch $code { + 0 {# OK} + 3 { + # break + return {} + } + 4 { + # continue + # skip sibling nodes + set state(continue:tag) [lindex $state(stack) end] + set state(continue:level) 1 + set state(mode) continue + continue + } + default { + return -code $code -errorinfo $::errorInfo $msg + } + } + + } + + *,0,,/ { + + # Empty element + + # The trailing slash sneaks through into the param variable + regsub -all /[cl $::sgml::Wsp]*\$ $param {} param + + set code [catch {ParseEvent:ElementOpen $tag $param [array get options] -empty 1} msg] + set state(haveDocElement) 1 + switch $code { + 0 {# OK} + 3 { + # break + return {} + } + 4 { + # continue + # Pretty useless since it closes straightaway + } + default { + return -code $code -errorinfo $::errorInfo $msg + } + } + set code [catch {ParseEvent:ElementClose $tag [array get options] -empty 1} msg] + switch $code { + 0 {# OK} + 3 { + # break + return {} + } + 4 { + # continue + # skip sibling nodes + set state(continue:tag) [lindex $state(stack) end] + set state(continue:level) 1 + set state(mode) continue + continue + } + default { + return -code $code -errorinfo $::errorInfo $msg + } + } + + } + + *,1,* { + # Processing instructions or XML declaration + switch -glob -- $tag { + + {\?xml} { + # XML Declaration + if {$state(haveXMLDecl)} { + uplevel #0 $options(-errorcommand) [list illegalcharacter "unexpected characters \"<$tag\" around line $state(line)"] + } elseif {![regexp {\?$} $param]} { + uplevel #0 $options(-errorcommand) [list missingcharacters "XML Declaration missing characters \"?>\" around line $state(line)"] + } else { + + # We can do the parsing in one step with Tcl 8.1 RE's + # This has the benefit of performing better WF checking + + set adv_re [format {^[%s]*version[%s]*=[%s]*("|')(-+|[a-zA-Z0-9_.:]+)\1([%s]+encoding[%s]*=[%s]*("|')([A-Za-z][-A-Za-z0-9._]*)\4)?([%s]*standalone[%s]*=[%s]*("|')(yes|no)\7)?[%s]*\?$} $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp] + + if {[catch {regexp $adv_re $param discard delimiter version discard delimiter encoding discard delimiter standalone} matches]} { + # Otherwise we must fallback to 8.0. + # This won't detect certain well-formedness errors + + # Get the version number + if {[regexp [format {[%s]*version[%s]*=[%s]*"(-+|[a-zA-Z0-9_.:]+)"[%s]*} $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp] $param discard version] || [regexp [format {[%s]*version[%s]*=[%s]*'(-+|[a-zA-Z0-9_.:]+)'[%s]*} $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp] $param discard version]} { + if {[string compare $version "1.0"]} { + # Should we support future versions? + # At least 1.X? + uplevel #0 $options(-errorcommand) [list versionincompatibility "document XML version \"$version\" is incompatible with XML version 1.0"] + } + } else { + uplevel #0 $options(-errorcommand) [list missingversion "XML Declaration missing version information around line $state(line)"] + } + + # Get the encoding declaration + set encoding {} + regexp [format {[%s]*encoding[%s]*=[%s]*"([A-Za-z]([A-Za-z0-9._]|-)*)"[%s]*} $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp] $param discard encoding + regexp [format {[%s]*encoding[%s]*=[%s]*'([A-Za-z]([A-Za-z0-9._]|-)*)'[%s]*} $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp] $param discard encoding + + # Get the standalone declaration + set standalone {} + regexp [format {[%s]*standalone[%s]*=[%s]*"(yes|no)"[%s]*} $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp] $param discard standalone + regexp [format {[%s]*standalone[%s]*=[%s]*'(yes|no)'[%s]*} $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp] $param discard standalone + + # Invoke the callback + uplevel #0 $options(-xmldeclcommand) [list $version $encoding $standalone] + + } elseif {$matches == 0} { + uplevel #0 $options(-errorcommand) [list illformeddeclaration "XML Declaration not well-formed around line $state(line)"] + } else { + + # Invoke the callback + uplevel #0 $options(-xmldeclcommand) [list $version $encoding $standalone] + + } + + } + + } + + {\?*} { + # Processing instruction + set tag [string range $tag 1 end] + if {[regsub {\?$} $tag {} tag]} { + if {[string length [string trim $param]]} { + uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$param\" in processing instruction around line $state(line)"] + } + } elseif {![regexp ^$Name\$ $tag]} { + uplevel #0 $options(-errorcommand) [list illegalcharacter "illegal character in processing instruction target \"$tag\""] + } elseif {[regexp {[xX][mM][lL]} $tag]} { + uplevel #0 $options(-errorcommand) [list illegalcharacters "characters \"xml\" not permitted in processing instruction target \"$tag\""] + } elseif {![regsub {\?$} $param {} param]} { + uplevel #0 $options(-errorcommand) [list missingquestion "PI: expected '?' character around line $state(line)"] + } + set code [catch {uplevel #0 $options(-processinginstructioncommand) [list $tag [string trimleft $param]]} msg] + switch $code { + 0 {# OK} + 3 { + # break + return {} + } + 4 { + # continue + # skip sibling nodes + set state(continue:tag) [lindex $state(stack) end] + set state(continue:level) 1 + set state(mode) continue + continue + } + default { + return -code $code -errorinfo $::errorInfo $msg + } + } + } + + !DOCTYPE { + # External entity reference + # This should move into xml.tcl + # Parse the params supplied. Looking for Name, ExternalID and MarkupDecl + set matched [regexp ^[cl $Wsp]*($Name)[cl $Wsp]*(.*) $param x state(doc_name) param] + set state(doc_name) [Normalize $state(doc_name) $options(-normalize)] + set externalID {} + set pubidlit {} + set systemlit {} + set externalID {} + if {[regexp -nocase ^[cl $Wsp]*(SYSTEM|PUBLIC)(.*) $param x id param]} { + switch [string toupper $id] { + SYSTEM { + if {[regexp ^[cl $Wsp]+"([cl ^"]*)"(.*) $param x systemlit param] || [regexp ^[cl $Wsp]+'([cl ^']*)'(.*) $param x systemlit param]} { + set externalID [list SYSTEM $systemlit] ;# " + } else { + uplevel #0 $options(-errorcommand) {syntaxerror {syntax error: SYSTEM identifier not followed by literal}} + } + } + PUBLIC { + if {[regexp ^[cl $Wsp]+"([cl ^"]*)"(.*) $param x pubidlit param] || [regexp ^[cl $Wsp]+'([cl ^']*)'(.*) $param x pubidlit param]} { + if {[regexp ^[cl $Wsp]+"([cl ^"]*)"(.*) $param x systemlit param] || [regexp ^[cl $Wsp]+'([cl ^']*)'(.*) $param x systemlit param]} { + set externalID [list PUBLIC $pubidlit $systemlit] + } else { + uplevel #0 $options(-errorcommand) [list syntaxerror "syntax error: PUBLIC identifier not followed by system literal around line $state(line)"] + } + } else { + uplevel #0 $options(-errorcommand) [list syntaxerror "syntax error: PUBLIC identifier not followed by literal around line $state(line)"] + } + } + } + if {[regexp -nocase ^[cl $Wsp]+NDATA[cl $Wsp]+($Name)(.*) $param x notation param]} { + lappend externalID $notation + } + } + + set state(inDTD) 1 + + ParseEvent:DocTypeDecl [array get options] $state(doc_name) $pubidlit $systemlit $options(-internaldtd) + + set state(inDTD) 0 + + } + + !--* { + + # Start of a comment + # See if it ends in the same tag, otherwise change the + # parsing mode + + regexp {!--(.*)} $tag discard comm1 + if {[regexp ([cl ^-]*)--[cl $Wsp]*\$ $comm1 discard comm1_1]} { + # processed comment (end in tag) + uplevel #0 $options(-commentcommand) [list $comm1_1] + } elseif {[regexp ([cl ^-]*)--[cl $Wsp]*\$ $param discard comm2]} { + # processed comment (end in attributes) + uplevel #0 $options(-commentcommand) [list $comm1$comm2] + } elseif {[regexp ([cl ^-]*)-->(.*) $text discard comm2 text]} { + # processed comment (end in text) + uplevel #0 $options(-commentcommand) [list $comm1$param$empty>$comm2] + } else { + # start of comment + set state(mode) comment + set state(commentdata) "$comm1$param$empty>$text" + continue + } + } + + {!\[CDATA\[*} { + + regexp {!\[CDATA\[(.*)} $tag discard cdata1 + if {[regexp {(.*)]]$} $cdata1 discard cdata2]} { + # processed CDATA (end in tag) + PCDATA [array get options] [subst -novariable -nocommand $cdata2] + set text [subst -novariable -nocommand $text] + } elseif {[regexp {(.*)]]$} $param discard cdata2]} { + # processed CDATA (end in attribute) + # Backslashes in param are quoted at this stage + PCDATA [array get options] $cdata1[subst -novariable -nocommand $cdata2] + set text [subst -novariable -nocommand $text] + } elseif {[regexp {(.*)]]>(.*)} $text discard cdata2 text]} { + # processed CDATA (end in text) + # Backslashes in param and text are quoted at this stage + PCDATA [array get options] $cdata1[subst -novariable -nocommand $param]$empty>[subst -novariable -nocommand $cdata2] + set text [subst -novariable -nocommand $text] + } else { + # start CDATA + set state(cdata) "$cdata1$param>$text" + set state(mode) cdata + continue + } + + } + + !ELEMENT - + !ATTLIST - + !ENTITY - + !NOTATION { + uplevel #0 $options(-errorcommand) [list illegaldeclaration "[string range $tag 1 end] declaration not expected in document instance around line $state(line)"] + } + + default { + uplevel #0 $options(-errorcommand) [list unknowninstruction "unknown processing instruction \"<$tag>\" around line $state(line)"] + } + } + } + *,1,* - + *,0,/,/ { + # Syntax error + uplevel #0 $options(-errorcommand) [list syntaxerror "syntax error: closed/empty tag: tag $tag param $param empty $empty close $close around line $state(line)"] + } + } + + # Process character data + + if {$state(haveDocElement) && [llength $state(stack)]} { + + # Check if the internal DTD entity is in the text + regsub -all &xml:intdtd\; $text \[$options(-internaldtd)\] text + + # Look for entity references + if {([array size entities] || \ + [string length $options(-entityreferencecommand)]) && \ + $options(-defaultexpandinternalentities) && \ + [regexp {&[^;]+;} $text]} { + + # protect Tcl specials + # NB. braces and backslashes may already be protected + regsub -all {\\({|}|\\)} $text {\1} text + regsub -all {([][$\\{}])} $text {\\\1} text + + # Mark entity references + regsub -all {&([^;]+);} $text [format {%s; %s {\1} ; %s %s} \}\} [namespace code [list Entity [array get options] $options(-entityreferencecommand) [namespace code [list PCDATA [array get options]]] $options(entities)]] [namespace code [list DeProtect [namespace code [list PCDATA [array get options]]]]] \{\{] text + set text "uplevel #0 [namespace code [list DeProtect1 [namespace code [list PCDATA [array get options]]]]] {{$text}}" + eval $text + } else { + + # Restore protected special characters + regsub -all {\\([][{}\\])} $text {\1} text + PCDATA [array get options] $text + } + } elseif {[string length [string trim $text]]} { + uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$text\" in document prolog around line $state(line)"] + } + + } + + # If this is the end of the document, close all open containers + if {$options(-final) && [llength $state(stack)]} { + eval $options(-errorcommand) [list unclosedelement "element [lindex $state(stack) end] remains unclosed around line $state(line)"] + } + + return {} +} + +# sgml::DeProtect -- +# +# Invoke given command after removing protecting backslashes +# from given text. +# +# Arguments: +# cmd Command to invoke +# text Text to deprotect +# +# Results: +# Depends on command + +proc sgml::DeProtect1 {cmd text} { + if {[string compare {} $text]} { + regsub -all {\\([]$[{}\\])} $text {\1} text + uplevel #0 $cmd [list $text] + } +} +proc sgml::DeProtect {cmd text} { + set text [lindex $text 0] + if {[string compare {} $text]} { + regsub -all {\\([]$[{}\\])} $text {\1} text + uplevel #0 $cmd [list $text] + } +} + +# sgml::ParserDelete -- +# +# Free all memory associated with parser +# +# Arguments: +# var global state array +# +# Results: +# Variables unset + +proc sgml::ParserDelete var { + upvar #0 $var state + + if {![info exists state]} { + return -code error "unknown parser" + } + + catch {unset $state(entities)} + catch {unset $state(parameterentities)} + catch {unset $state(elementdecls)} + catch {unset $state(attlistdecls)} + catch {unset $state(notationdecls)} + catch {unset $state(namespaces)} + + unset state + + return {} +} + +# sgml::ParseEvent:ElementOpen -- +# +# Start of an element. +# +# Arguments: +# tag Element name +# attr Attribute list +# opts Options +# args further configuration options +# +# Options: +# -empty boolean +# indicates whether the element was an empty element +# +# Results: +# Modify state and invoke callback + +proc sgml::ParseEvent:ElementOpen {tag attr opts args} { + variable Name + variable Wsp + + array set options $opts + upvar #0 $options(-statevariable) state + array set cfg {-empty 0} + array set cfg $args + set handleEmpty 0 + + if {$options(-normalize)} { + set tag [string toupper $tag] + } + + # Update state + lappend state(stack) $tag + + # Parse attribute list into a key-value representation + if {[string compare $options(-parseattributelistcommand) {}]} { + if {[catch {uplevel #0 $options(-parseattributelistcommand) [list $opts $attr]} attr]} { + if {[string compare [lindex $attr 0] "unterminated attribute value"]} { + uplevel #0 $options(-errorcommand) [list unterminatedattribute "$attr around line $state(line)"] + set attr {} + } else { + + # It is most likely that a ">" character was in an attribute value. + # This manifests itself by ">" appearing in the element's text. + # In this case the callback should return a three element list; + # the message "unterminated attribute value", the attribute list it + # did manage to parse and the remainder of the attribute list. + + foreach {msg attlist brokenattr} $attr break + + upvar text elemText + if {[string first > $elemText] >= 0} { + + # Now piece the attribute list back together + regexp [cl $Wsp]*($Name)[cl $Wsp]*=[cl $Wsp]*("|')(.*) $brokenattr discard attname delimiter attvalue + regexp (.*)>([cl ^>]*)\$ $elemText discard remattlist elemText + regexp ([cl ^$delimiter]*)${delimiter}(.*) $remattlist discard remattvalue remattlist + + # Gotcha: watch out for empty element syntax + if {[string match */ [string trimright $remattlist]]} { + set remattlist [string range $remattlist 0 end-1] + set handleEmpty 1 + set cfg(-empty) 1 + } + + append attvalue >$remattvalue + lappend attlist $attname $attvalue + + # Complete parsing the attribute list + if {[catch {uplevel #0 $options(-parseattributelistcommand) [list $options(-statevariable) $remattlist]} attr]} { + uplevel #0 $options(-errorcommand) [list unterminatedattribute "$attr around line $state(line)"] + set attr {} + set attlist {} + } else { + eval lappend attlist $attr + } + + set attr $attlist + + } else { + uplevel #0 $options(-errorcommand) [list unterminatedattribute "$attr around line $state(line)"] + set attr {} + } + } + } + } + + set empty {} + if {$cfg(-empty) && $options(-reportempty)} { + set empty {-empty 1} + } + + # Check for namespace declarations + upvar #0 $options(namespaces) namespaces + set nsdecls {} + if {[llength $attr]} { + array set attrlist $attr + foreach {attrName attrValue} [array get attrlist xmlns*] { + unset attrlist($attrName) + set colon [set prefix {}] + if {[regexp {^xmlns(:(.+))?$} $attrName discard colon prefix]} { + switch -glob [string length $colon],[string length $prefix] { + 0,0 { + # default NS declaration + lappend state(defaultNSURI) $attrValue + lappend state(defaultNS) [llength $state(stack)] + lappend nsdecls $attrValue {} + } + 0,* { + # Huh? + } + *,0 { + # Error + uplevel #0 $state(-warningcommand) "no prefix specified for namespace URI \"$attrValue\" in element \"$tag\"" + } + default { + set namespaces($prefix,[llength $state(stack)]) $attrValue + lappend nsdecls $attrValue $prefix + } + } + } + } + if {[llength $nsdecls]} { + set nsdecls [list -namespacedecls $nsdecls] + } + set attr [array get attrlist] + } + + # Check whether this element has an expanded name + set ns {} + if {[regexp {([^:]+):(.*)$} $tag discard prefix tag]} { + set nsspec [lsort -dictionary -decreasing [array names namespaces $prefix,*]] + if {[llength $nsspec]} { + set nsuri $namespaces([lindex $nsspec 0]) + set ns [list -namespace $nsuri] + } else { + uplevel #0 $options(-errorcommand) [list namespaceundeclared "no namespace declared for prefix \"$prefix\" in element $tag"] + } + } elseif {[llength $state(defaultNSURI)]} { + set ns [list -namespace [lindex $state(defaultNSURI) end]] + } + + # Invoke callback + set code [catch {uplevel #0 $options(-elementstartcommand) [list $tag $attr] $empty $ns $nsdecls} msg] + + # Sometimes empty elements must be handled here (see above) + if {$code == 0 && $handleEmpty} { + ParseEvent:ElementClose $tag $opts -empty 1 + } + + return -code $code -errorinfo $::errorInfo $msg +} + +# sgml::ParseEvent:ElementClose -- +# +# End of an element. +# +# Arguments: +# tag Element name +# opts Options +# args further configuration options +# +# Options: +# -empty boolean +# indicates whether the element as an empty element +# +# Results: +# Modify state and invoke callback + +proc sgml::ParseEvent:ElementClose {tag opts args} { + array set options $opts + upvar #0 $options(-statevariable) state + array set cfg {-empty 0} + array set cfg $args + + # WF check + if {[string compare $tag [lindex $state(stack) end]]} { + uplevel #0 $options(-errorcommand) [list illegalendtag "end tag \"$tag\" does not match open element \"[lindex $state(stack) end]\" around line $state(line)"] + return + } + + # Check whether this element has an expanded name + upvar #0 $options(namespaces) namespaces + set ns {} + if {[regexp {([^:]+):(.*)$} $tag discard prefix tag]} { + set nsuri $namespaces([lindex [lsort -dictionary -decreasing [array names namespaces $prefix,*]] 0]) + set ns [list -namespace $nsuri] + } elseif {[llength $state(defaultNSURI)]} { + set ns [list -namespace [lindex $state(defaultNSURI) end]] + } + + # Pop namespace stacks, if any + if {[llength $state(defaultNS)]} { + if {[llength $state(stack)] == [lindex $state(defaultNS) end]} { + set state(defaultNS) [lreplace $state(defaultNS) end end] + } + } + foreach nsspec [array names namespaces *,[llength $state(stack)]] { + unset namespaces($nsspec) + } + + # Update state + set state(stack) [lreplace $state(stack) end end] + + set empty {} + if {$cfg(-empty) && $options(-reportempty)} { + set empty {-empty 1} + } + + # Invoke callback + # Mats: Shall be same as sgml::ParseEvent:ElementOpen to handle exceptions in callback. + set code [catch {uplevel #0 $options(-elementendcommand) [list $tag] $empty $ns} msg] + return -code $code -errorinfo $::errorInfo $msg +} + +# sgml::PCDATA -- +# +# Process PCDATA before passing to application +# +# Arguments: +# opts options +# pcdata Character data to be processed +# +# Results: +# Checks that characters are legal, +# checks -ignorewhitespace setting. + +proc sgml::PCDATA {opts pcdata} { + array set options $opts + + if {$options(-ignorewhitespace) && \ + ![string length [string trim $pcdata]]} { + return {} + } + + if {![regexp ^[cl $::sgml::Char]*\$ $pcdata]} { + upvar \#0 $options(-statevariable) state + uplevel \#0 $options(-errorcommand) [list illegalcharacters "illegal, non-Unicode characters found in text \"$pcdata\" around line $state(line)"] + } + + uplevel \#0 $options(-characterdatacommand) [list $pcdata] +} + +# sgml::Normalize -- +# +# Perform name normalization if required +# +# Arguments: +# name name to normalize +# req normalization required +# +# Results: +# Name returned as upper-case if normalization required + +proc sgml::Normalize {name req} { + if {$req} { + return [string toupper $name] + } else { + return $name + } +} + +# sgml::Entity -- +# +# Resolve XML entity references (syntax: &xxx;). +# +# Arguments: +# opts options +# entityrefcmd application callback for entity references +# pcdatacmd application callback for character data +# entities name of array containing entity definitions. +# ref entity reference (the "xxx" bit) +# +# Results: +# Returns substitution text for given entity. + +proc sgml::Entity {opts entityrefcmd pcdatacmd entities ref} { + array set options $opts + upvar #0 $options(-statevariable) state + + if {![string length $entities]} { + set entities [namespace current]::EntityPredef + } + + # SRB: Bug fix 2008-11-18 #812051: surround case labels in braces for compatibility with Freewrap + switch -glob -- $ref { + {%*} { + # Parameter entity - not recognised outside of a DTD + } + {#x*} { + # Character entity - hex + if {[catch {format %c [scan [string range $ref 2 end] %x tmp; set tmp]} char]} { + return -code error "malformed character entity \"$ref\"" + } + uplevel #0 $pcdatacmd [list $char] + + return {} + + } + {#*} { + # Character entity - decimal + if {[catch {format %c [scan [string range $ref 1 end] %d tmp; set tmp]} char]} { + return -code error "malformed character entity \"$ref\"" + } + uplevel #0 $pcdatacmd [list $char] + + return {} + + } + default { + # General entity + upvar #0 $entities map + if {[info exists map($ref)]} { + + if {![regexp {<|&} $map($ref)]} { + + # Simple text replacement - optimise + uplevel #0 $pcdatacmd [list $map($ref)] + + return {} + + } + + # Otherwise an additional round of parsing is required. + # This only applies to XML, since HTML doesn't have general entities + + # Must parse the replacement text for start & end tags, etc + # This text must be self-contained: balanced closing tags, and so on + + set tokenised [tokenise $map($ref) $::xml::tokExpr $::xml::substExpr] + set options(-final) 0 + eval parseEvent [list $tokenised] [array get options] + + return {} + + } elseif {[string compare $entityrefcmd "::sgml::noop"]} { + + set result [uplevel #0 $entityrefcmd [list $ref]] + + if {[string length $result]} { + uplevel #0 $pcdatacmd [list $result] + } + + return {} + + } else { + + # Reconstitute entity reference + + uplevel #0 $options(-errorcommand) [list illegalentity "undefined entity reference \"$ref\""] + + return {} + + } + } + } + + # If all else fails leave the entity reference untouched + uplevel #0 $pcdatacmd [list &$ref\;] + + return {} +} + +#################################### +# +# DTD parser for SGML (XML). +# +# This DTD actually only handles XML DTDs. Other language's +# DTD's, such as HTML, must be written in terms of a XML DTD. +# +#################################### + +# sgml::ParseEvent:DocTypeDecl -- +# +# Entry point for DTD parsing +# +# Arguments: +# opts configuration options +# docEl document element name +# pubId public identifier +# sysId system identifier (a URI) +# intSSet internal DTD subset + +proc sgml::ParseEvent:DocTypeDecl {opts docEl pubId sysId intSSet} { + array set options {} + array set options $opts + + set code [catch {uplevel #0 $options(-doctypecommand) [list $docEl $pubId $sysId $intSSet]} err] + switch $code { + 3 { + # break + return {} + } + 0 - + 4 { + # continue + } + default { + return -code $code $err + } + } + + # Otherwise we'll parse the DTD and report it piecemeal + + # The internal DTD subset is processed first (XML 2.8) + # During this stage, parameter entities are only allowed + # between markup declarations + + ParseDTD:Internal [array get options] $intSSet + + # The external DTD subset is processed last (XML 2.8) + # During this stage, parameter entities may occur anywhere + + # We must resolve the external identifier to obtain the + # DTD data. The application may supply its own resolver. + + if {[string length $pubId] || [string length $sysId]} { + uplevel #0 $options(-externalentitycommand) [list $options(-cmd) $options(-baseuri) $sysId $pubId] + } + + return {} +} + +# sgml::ParseDTD:Internal -- +# +# Parse the internal DTD subset. +# +# Parameter entities are only allowed between markup declarations. +# +# Arguments: +# opts configuration options +# dtd DTD data +# +# Results: +# Markup declarations parsed may cause callback invocation + +proc sgml::ParseDTD:Internal {opts dtd} { + variable MarkupDeclExpr + variable MarkupDeclSub + + array set options {} + array set options $opts + + upvar #0 $options(-statevariable) state + upvar #0 $options(parameterentities) PEnts + upvar #0 $options(externalparameterentities) ExtPEnts + + # Bug 583947: remove comments before further processing + regsub -all {<!--.*?-->} $dtd {} dtd + + # Tokenize the DTD + + # Protect Tcl special characters + regsub -all {([{}\\])} $dtd {\\\1} dtd + + regsub -all $MarkupDeclExpr $dtd $MarkupDeclSub dtd + + # Entities may have angle brackets in their replacement + # text, which breaks the RE processing. So, we must + # use a similar technique to processing doc instances + # to rebuild the declarations from the pieces + + set mode {} ;# normal + set delimiter {} + set name {} + set param {} + + set state(inInternalDTD) 1 + + # Process the tokens + foreach {decl value text} [lrange "{} {} \{$dtd\}" 3 end] { + + # Keep track of line numbers + incr state(line) [regsub -all \n $text {} discard] + + ParseDTD:EntityMode [array get options] mode replText decl value text $delimiter $name $param + + ParseDTD:ProcessMarkupDecl [array get options] decl value delimiter name mode replText text param + + # There may be parameter entity references between markup decls + + if {[regexp {%.*;} $text]} { + + # Protect Tcl special characters + regsub -all {([{}\\])} $text {\\\1} text + + regsub -all %($::sgml::Name)\; $text "\} {\\1} \{" text + + set PElist "\{$text\}" + set PElist [lreplace $PElist end end] + foreach {text entref} $PElist { + if {[string length [string trim $text]]} { + uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text in internal DTD subset around line $state(line)"] + } + + # Expand parameter entity and recursively parse + # BUG: no checks yet for recursive entity references + + if {[info exists PEnts($entref)]} { + set externalParser [$options(-cmd) entityparser] + $externalParser parse $PEnts($entref) -dtdsubset internal + } elseif {[info exists ExtPEnts($entref)]} { + set externalParser [$options(-cmd) entityparser] + $externalParser parse $ExtPEnts($entref) -dtdsubset external + #$externalParser free + } else { + uplevel #0 $options(-errorcommand) [list illegalreference "reference to undeclared parameter entity \"$entref\""] + } + } + + } + + } + + return {} +} + +# sgml::ParseDTD:EntityMode -- +# +# Perform special processing for various parser modes +# +# Arguments: +# opts configuration options +# modeVar pass-by-reference mode variable +# replTextVar pass-by-ref +# declVar pass-by-ref +# valueVar pass-by-ref +# textVar pass-by-ref +# delimiter delimiter currently in force +# name +# param +# +# Results: +# Depends on current mode + +proc sgml::ParseDTD:EntityMode {opts modeVar replTextVar declVar valueVar textVar delimiter name param} { + upvar 1 $modeVar mode + upvar 1 $replTextVar replText + upvar 1 $declVar decl + upvar 1 $valueVar value + upvar 1 $textVar text + array set options $opts + + switch $mode { + {} { + # Pass through to normal processing section + } + entity { + # Look for closing delimiter + if {[regexp ([cl ^$delimiter]*)${delimiter}(.*) $decl discard val1 remainder]} { + append replText <$val1 + DTD:ENTITY [array get options] $name [string trim $param] $delimiter$replText$delimiter + set decl / + set text $remainder\ $value>$text + set value {} + set mode {} + } elseif {[regexp ([cl ^$delimiter]*)${delimiter}(.*) $value discard val2 remainder]} { + append replText <$decl\ $val2 + DTD:ENTITY [array get options] $name [string trim $param] $delimiter$replText$delimiter + set decl / + set text $remainder>$text + set value {} + set mode {} + } elseif {[regexp ([cl ^$delimiter]*)${delimiter}(.*) $text discard val3 remainder]} { + append replText <$decl\ $value>$val3 + DTD:ENTITY [array get options] $name [string trim $param] $delimiter$replText$delimiter + set decl / + set text $remainder + set value {} + set mode {} + } else { + + # Remain in entity mode + append replText <$decl\ $value>$text + return -code continue + + } + } + + ignore { + upvar #0 $options(-statevariable) state + + if {[regexp {]](.*)$} $decl discard remainder]} { + set state(condSections) [lreplace $state(condSections) end end] + set decl $remainder + set mode {} + } elseif {[regexp {]](.*)$} $value discard remainder]} { + set state(condSections) [lreplace $state(condSections) end end] + regexp <[cl $::sgml::Wsp]*($::sgml::Name)(.*) $remainder discard decl value + set mode {} + } elseif {[regexp {]]>(.*)$} $text discard remainder]} { + set state(condSections) [lreplace $state(condSections) end end] + set decl / + set value {} + set text $remainder + #regexp <[cl $::sgml::Wsp]*($::sgml::Name)([cl ^>]*)>(.*) $remainder discard decl value text + set mode {} + } else { + set decl / + } + + } + + comment { + # Look for closing comment delimiter + + upvar #0 $options(-statevariable) state + + if {[regexp (.*?)--(.*)\$ $decl discard data1 remainder]} { + } elseif {[regexp (.*?)--(.*)\$ $value discard data1 remainder]} { + } elseif {[regexp (.*?)--(.*)\$ $text discard data1 remainder]} { + } else { + # comment continues + append state(commentdata) <$decl\ $value>$text + set decl / + set value {} + set text {} + } + } + + } + + return {} +} + +# sgml::ParseDTD:ProcessMarkupDecl -- +# +# Process a single markup declaration +# +# Arguments: +# opts configuration options +# declVar pass-by-ref +# valueVar pass-by-ref +# delimiterVar pass-by-ref for current delimiter in force +# nameVar pass-by-ref +# modeVar pass-by-ref for current parser mode +# replTextVar pass-by-ref +# textVar pass-by-ref +# paramVar pass-by-ref +# +# Results: +# Depends on markup declaration. May change parser mode + +proc sgml::ParseDTD:ProcessMarkupDecl {opts declVar valueVar delimiterVar nameVar modeVar replTextVar textVar paramVar} { + upvar 1 $modeVar mode + upvar 1 $replTextVar replText + upvar 1 $textVar text + upvar 1 $declVar decl + upvar 1 $valueVar value + upvar 1 $nameVar name + upvar 1 $delimiterVar delimiter + upvar 1 $paramVar param + + variable declExpr + variable ExternalEntityExpr + + array set options $opts + upvar #0 $options(-statevariable) state + + switch -glob -- $decl { + + / { + # continuation from entity processing + } + + !ELEMENT { + # Element declaration + if {[regexp $declExpr $value discard tag cmodel]} { + DTD:ELEMENT [array get options] $tag $cmodel + } else { + uplevel #0 $options(-errorcommand) [list illegaldeclaration "malformed element declaration around line $state(line)"] + } + } + + !ATTLIST { + # Attribute list declaration + variable declExpr + if {[regexp $declExpr $value discard tag attdefns]} { + if {[catch {DTD:ATTLIST [array get options] $tag $attdefns} err]} { + #puts stderr "Stack trace: $::errorInfo\n***\n" + # Atttribute parsing has bugs at the moment + #return -code error "$err around line $state(line)" + return {} + } + } else { + uplevel #0 $options(-errorcommand) [list illegaldeclaration "malformed attribute list declaration around line $state(line)"] + } + } + + !ENTITY { + # Entity declaration + variable EntityExpr + + if {[regexp $EntityExpr $value discard param name value]} { + + # Entity replacement text may have a '>' character. + # In this case, the real delimiter will be in the following + # text. This is complicated by the possibility of there + # being several '<','>' pairs in the replacement text. + # At this point, we are searching for the matching quote delimiter. + + if {[regexp $ExternalEntityExpr $value]} { + DTD:ENTITY [array get options] $name [string trim $param] $value + } elseif {[regexp ("|')(.*?)\\1(.*) $value discard delimiter replText value]} { + + if {[string length [string trim $value]]} { + uplevel #0 $options(-errorcommand) [list illegaldeclaration "malformed entity declaration around line $state(line)"] + } else { + DTD:ENTITY [array get options] $name [string trim $param] $delimiter$replText$delimiter + } + } elseif {[regexp ("|')(.*) $value discard delimiter replText]} { + append replText >$text + set text {} + set mode entity + } else { + uplevel #0 $options(-errorcommand) [list illegaldeclaration "no delimiter for entity declaration around line $state(line)"] + } + + } else { + uplevel #0 $options(-errorcommand) [list illegaldeclaration "malformed entity declaration around line $state(line)"] + } + } + + !NOTATION { + # Notation declaration + if {[regexp $declExpr param discard tag notation]} { + DTD:ENTITY [array get options] $tag $notation + } else { + uplevel #0 $options(-errorcommand) [list illegaldeclaration "malformed entity declaration around line $state(line)"] + } + } + + !--* { + # Start of a comment + + if {[regexp !--(.*?)--\$ $decl discard data]} { + if {[string length [string trim $value]]} { + uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$value\""] + } + uplevel #0 $options(-commentcommand) [list $data] + set decl / + set value {} + } elseif {[regexp -- ^(.*?)--\$ $value discard data2]} { + regexp !--(.*)\$ $decl discard data1 + uplevel #0 $options(-commentcommand) [list $data1\ $data2] + set decl / + set value {} + } elseif {[regexp (.*?)-->(.*)\$ $text discard data3 remainder]} { + regexp !--(.*)\$ $decl discard data1 + uplevel #0 $options(-commentcommand) [list $data1\ $value>$data3] + set decl / + set value {} + set text $remainder + } else { + regexp !--(.*)\$ $decl discard data1 + set state(commentdata) $data1\ $value>$text + set decl / + set value {} + set text {} + set mode comment + } + } + + !*INCLUDE* - + !*IGNORE* { + if {$state(inInternalDTD)} { + uplevel #0 $options(-errorcommand) [list illegalsection "conditional section not permitted in internal DTD subset around line $state(line)"] + } + + if {[regexp {^!\[INCLUDE\[(.*)} $decl discard remainder]} { + # Push conditional section stack, popped by ]]> sequence + + if {[regexp {(.*?)]]$} $remainder discard r2]} { + # section closed immediately + if {[string length [string trim $r2]]} { + uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$r2\" in conditional section"] + } + } elseif {[regexp {(.*?)]](.*)} $value discard r2 r3]} { + # section closed immediately + if {[string length [string trim $r2]]} { + uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$r2\" in conditional section"] + } + if {[string length [string trim $r3]]} { + uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$r3\" in conditional section"] + } + } else { + + lappend state(condSections) INCLUDE + + set parser [$options(-cmd) entityparser] + $parser parse $remainder\ $value> -dtdsubset external + #$parser free + + if {[regexp {(.*?)]]>(.*)} $text discard t1 t2]} { + if {[string length [string trim $t1]]} { + uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$t1\""] + } + if {![llength $state(condSections)]} { + uplevel #0 $options(-errorcommand) [list illegalsection "extraneous conditional section close"] + } + set state(condSections) [lreplace $state(condSections) end end] + set text $t2 + } + + } + } elseif {[regexp {^!\[IGNORE\[(.*)} $decl discard remainder]} { + # Set ignore mode. Still need a stack + set mode ignore + + if {[regexp {(.*?)]]$} $remainder discard r2]} { + # section closed immediately + if {[string length [string trim $r2]]} { + uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$r2\" in conditional section"] + } + } elseif {[regexp {(.*?)]](.*)} $value discard r2 r3]} { + # section closed immediately + if {[string length [string trim $r2]]} { + uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$r2\" in conditional section"] + } + if {[string length [string trim $r3]]} { + uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$r3\" in conditional section"] + } + } else { + + lappend state(condSections) IGNORE + + if {[regexp {(.*?)]]>(.*)} $text discard t1 t2]} { + if {[string length [string trim $t1]]} { + uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$t1\""] + } + if {![llength $state(condSections)]} { + uplevel #0 $options(-errorcommand) [list illegalsection "extraneous conditional section close"] + } + set state(condSections) [lreplace $state(condSections) end end] + set text $t2 + } + + } + } else { + uplevel #0 $options(-errorcommand) [list illegaldeclaration "illegal markup declaration \"$decl\" around line $state(line)"] + } + + } + + default { + if {[regexp {^\?(.*)} $decl discard target]} { + # Processing instruction + } else { + uplevel #0 $options(-errorcommand) [list illegaldeclaration "illegal markup declaration \"$decl\""] + } + } + } + + return {} +} + +# sgml::ParseDTD:External -- +# +# Parse the external DTD subset. +# +# Parameter entities are allowed anywhere. +# +# Arguments: +# opts configuration options +# dtd DTD data +# +# Results: +# Markup declarations parsed may cause callback invocation + +proc sgml::ParseDTD:External {opts dtd} { + variable MarkupDeclExpr + variable MarkupDeclSub + variable declExpr + + array set options $opts + upvar #0 $options(parameterentities) PEnts + upvar #0 $options(externalparameterentities) ExtPEnts + upvar #0 $options(-statevariable) state + + # As with the internal DTD subset, watch out for + # entities with angle brackets + set mode {} ;# normal + set delimiter {} + set name {} + set param {} + + set oldState 0 + catch {set oldState $state(inInternalDTD)} + set state(inInternalDTD) 0 + + # Initialise conditional section stack + if {![info exists state(condSections)]} { + set state(condSections) {} + } + set startCondSectionDepth [llength $state(condSections)] + + while {[string length $dtd]} { + set progress 0 + set PEref {} + if {![string compare $mode "ignore"]} { + set progress 1 + if {[regexp {]]>(.*)} $dtd discard dtd]} { + set remainder {} + set mode {} ;# normal + set state(condSections) [lreplace $state(condSections) end end] + continue + } else { + uplevel #0 $options(-errorcommand) [list missingdelimiter "IGNORE conditional section closing delimiter not found"] + } + } elseif {[regexp ^(.*?)%($::sgml::Name)\;(.*)\$ $dtd discard data PEref remainder]} { + set progress 1 + } else { + set data $dtd + set dtd {} + set remainder {} + } + + # Tokenize the DTD (so far) + + # Protect Tcl special characters + regsub -all {([{}\\])} $data {\\\1} dataP + + set n [regsub -all $MarkupDeclExpr $dataP $MarkupDeclSub dataP] + + if {$n} { + set progress 1 + # All but the last markup declaration should have no text + set dataP [lrange "{} {} \{$dataP\}" 3 end] + if {[llength $dataP] > 3} { + foreach {decl value text} [lrange $dataP 0 [expr [llength $dataP] - 4]] { + ParseDTD:EntityMode [array get options] mode replText decl value text $delimiter $name $param + ParseDTD:ProcessMarkupDecl [array get options] decl value delimiter name mode repltextVar text param + + if {[string length [string trim $text]]} { + # check for conditional section close + if {[regexp {]]>(.*)$} $text discard text]} { + if {[string length [string trim $text]]} { + uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$text\""] + } + if {![llength $state(condSections)]} { + uplevel #0 $options(-errorcommand) [list illegalsection "extraneous conditional section close"] + } + set state(condSections) [lreplace $state(condSections) end end] + if {![string compare $mode "ignore"]} { + set mode {} ;# normal + } + } else { + uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$text\""] + } + } + } + } + # Do the last declaration + foreach {decl value text} [lrange $dataP [expr [llength $dataP] - 3] end] { + ParseDTD:EntityMode [array get options] mode replText decl value text $delimiter $name $param + ParseDTD:ProcessMarkupDecl [array get options] decl value delimiter name mode repltextVar text param + } + } + + # Now expand the PE reference, if any + switch -glob $mode,[string length $PEref],$n { + ignore,0,* { + set dtd $text + } + ignore,*,* { + set dtd $text$remainder + } + *,0,0 { + set dtd $data + } + *,0,* { + set dtd $text + } + *,*,0 { + if {[catch {append data $PEnts($PEref)}]} { + if {[info exists ExtPEnts($PEref)]} { + set externalParser [$options(-cmd) entityparser] + $externalParser parse $ExtPEnts($PEref) -dtdsubset external + #$externalParser free + } else { + uplevel #0 $options(-errorcommand) [list entityundeclared "parameter entity \"$PEref\" not declared"] + } + } + set dtd $data$remainder + } + default { + if {[catch {append text $PEnts($PEref)}]} { + if {[info exists ExtPEnts($PEref)]} { + set externalParser [$options(-cmd) entityparser] + $externalParser parse $ExtPEnts($PEref) -dtdsubset external + #$externalParser free + } else { + uplevel #0 $options(-errorcommand) [list entityundeclared "parameter entity \"$PEref\" not declared"] + } + } + set dtd $text$remainder + } + } + + # Check whether a conditional section has been terminated + if {[regexp {^(.*?)]]>(.*)$} $dtd discard t1 t2]} { + if {![regexp <.*> $t1]} { + if {[string length [string trim $t1]]} { + uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$t1\""] + } + if {![llength $state(condSections)]} { + uplevel #0 $options(-errorcommand) [list illegalsection "extraneous conditional section close"] + } + set state(condSections) [lreplace $state(condSections) end end] + if {![string compare $mode "ignore"]} { + set mode {} ;# normal + } + set dtd $t2 + set progress 1 + } + } + + if {!$progress} { + # No parameter entity references were found and + # the text does not contain a well-formed markup declaration + # Avoid going into an infinite loop + upvar #0 $options(-errorcommand) [list syntaxerror "external entity does not contain well-formed markup declaration"] + break + } + } + + set state(inInternalDTD) $oldState + + # Check that conditional sections have been closed properly + if {[llength $state(condSections)] > $startCondSectionDepth} { + uplevel #0 $options(-errorcommand) [list syntaxerror "[lindex $state(condSections) end] conditional section not closed"] + } + if {[llength $state(condSections)] < $startCondSectionDepth} { + uplevel #0 $options(-errorcommand) [list syntaxerror "too many conditional section closures"] + } + + return {} +} + +# Procedures for handling the various declarative elements in a DTD. +# New elements may be added by creating a procedure of the form +# parse:DTD:_element_ + +# For each of these procedures, the various regular expressions they use +# are created outside of the proc to avoid overhead at runtime + +# sgml::DTD:ELEMENT -- +# +# <!ELEMENT ...> defines an element. +# +# The content model for the element is stored in the contentmodel array, +# indexed by the element name. The content model is parsed into the +# following list form: +# +# {} Content model is EMPTY. +# Indicated by an empty list. +# * Content model is ANY. +# Indicated by an asterix. +# {ELEMENT ...} +# Content model is element-only. +# {MIXED {element1 element2 ...}} +# Content model is mixed (PCDATA and elements). +# The second element of the list contains the +# elements that may occur. #PCDATA is assumed +# (ie. the list is normalised). +# +# Arguments: +# opts configuration options +# name element GI +# modspec unparsed content model specification + +proc sgml::DTD:ELEMENT {opts name modspec} { + variable Wsp + array set options $opts + + upvar #0 $options(elementdecls) elements + + if {$options(-validate) && [info exists elements($name)]} { + eval $options(-errorcommand) [list elementdeclared "element \"$name\" already declared"] + } else { + switch -- $modspec { + EMPTY { + set elements($name) {} + uplevel #0 $options(-elementdeclcommand) $name {{}} + } + ANY { + set elements($name) * + uplevel #0 $options(-elementdeclcommand) $name * + } + default { + # Don't parse the content model for now, + # just pass the model to the application + if {0 && [regexp [format {^\([%s]*#PCDATA[%s]*(\|([^)]+))?[%s]*\)*[%s]*$} $Wsp $Wsp $Wsp $Wsp] discard discard mtoks]} { + set cm($name) [list MIXED [split $mtoks |]] + } elseif {0} { + if {[catch {CModelParse $state(state) $value} result]} { + eval $options(-errorcommand) [list element? $result] + } else { + set cm($id) [list ELEMENT $result] + } + } else { + set elements($name) $modspec + uplevel #0 $options(-elementdeclcommand) $name [list $modspec] + } + } + } + } +} + +# sgml::CModelParse -- +# +# Parse an element content model (non-mixed). +# A syntax tree is constructed. +# A transition table is built next. +# +# This is going to need alot of work! +# +# Arguments: +# state state array variable +# value the content model data +# +# Results: +# A Tcl list representing the content model. + +proc sgml::CModelParse {state value} { + upvar #0 $state var + + # First build syntax tree + set syntaxTree [CModelMakeSyntaxTree $state $value] + + # Build transition table + set transitionTable [CModelMakeTransitionTable $state $syntaxTree] + + return [list $syntaxTree $transitionTable] +} + +# sgml::CModelMakeSyntaxTree -- +# +# Construct a syntax tree for the regular expression. +# +# Syntax tree is represented as a Tcl list: +# rep {:choice|:seq {{rep list1} {rep list2} ...}} +# where: rep is repetition character, *, + or ?. {} for no repetition +# listN is nested expression or Name +# +# Arguments: +# spec Element specification +# +# Results: +# Syntax tree for element spec as nested Tcl list. +# +# Examples: +# (memo) +# {} {:seq {{} memo}} +# (front, body, back?) +# {} {:seq {{} front} {{} body} {? back}} +# (head, (p | list | note)*, div2*) +# {} {:seq {{} head} {* {:choice {{} p} {{} list} {{} note}}} {* div2}} +# (p | a | ul)+ +# + {:choice {{} p} {{} a} {{} ul}} + +proc sgml::CModelMakeSyntaxTree {state spec} { + upvar #0 $state var + variable Wsp + variable name + + # Translate the spec into a Tcl list. + + # None of the Tcl special characters are allowed in a content model spec. + if {[regexp {\$|\[|\]|\{|\}} $spec]} { + return -code error "illegal characters in specification" + } + + regsub -all [format {(%s)[%s]*(\?|\*|\+)?[%s]*(,|\|)?} $name $Wsp $Wsp] $spec [format {%sCModelSTname %s {\1} {\2} {\3}} \n $state] spec + regsub -all {\(} $spec "\nCModelSTopenParen $state " spec + regsub -all [format {\)[%s]*(\?|\*|\+)?[%s]*(,|\|)?} $Wsp $Wsp] $spec [format {%sCModelSTcloseParen %s {\1} {\2}} \n $state] spec + + array set var {stack {} state start} + eval $spec + + # Peel off the outer seq, its redundant + return [lindex [lindex $var(stack) 1] 0] +} + +# sgml::CModelSTname -- +# +# Processes a name in a content model spec. +# +# Arguments: +# state state array variable +# name name specified +# rep repetition operator +# cs choice or sequence delimiter +# +# Results: +# See CModelSTcp. + +proc sgml::CModelSTname {state name rep cs args} { + if {[llength $args]} { + return -code error "syntax error in specification: \"$args\"" + } + + CModelSTcp $state $name $rep $cs +} + +# sgml::CModelSTcp -- +# +# Process a content particle. +# +# Arguments: +# state state array variable +# name name specified +# rep repetition operator +# cs choice or sequence delimiter +# +# Results: +# The content particle is added to the current group. + +proc sgml::CModelSTcp {state cp rep cs} { + upvar #0 $state var + + switch -glob -- [lindex $var(state) end]=$cs { + start= { + set var(state) [lreplace $var(state) end end end] + # Add (dummy) grouping, either choice or sequence will do + CModelSTcsSet $state , + CModelSTcpAdd $state $cp $rep + } + :choice= - + :seq= { + set var(state) [lreplace $var(state) end end end] + CModelSTcpAdd $state $cp $rep + } + start=| - + start=, { + set var(state) [lreplace $var(state) end end [expr {$cs == "," ? ":seq" : ":choice"}]] + CModelSTcsSet $state $cs + CModelSTcpAdd $state $cp $rep + } + :choice=| - + :seq=, { + CModelSTcpAdd $state $cp $rep + } + :choice=, - + :seq=| { + return -code error "syntax error in specification: incorrect delimiter after \"$cp\", should be \"[expr {$cs == "," ? "|" : ","}]\"" + } + end=* { + return -code error "syntax error in specification: no delimiter before \"$cp\"" + } + default { + return -code error "syntax error" + } + } + +} + +# sgml::CModelSTcsSet -- +# +# Start a choice or sequence on the stack. +# +# Arguments: +# state state array +# cs choice oir sequence +# +# Results: +# state is modified: end element of state is appended. + +proc sgml::CModelSTcsSet {state cs} { + upvar #0 $state var + + set cs [expr {$cs == "," ? ":seq" : ":choice"}] + + if {[llength $var(stack)]} { + set var(stack) [lreplace $var(stack) end end $cs] + } else { + set var(stack) [list $cs {}] + } +} + +# sgml::CModelSTcpAdd -- +# +# Append a content particle to the top of the stack. +# +# Arguments: +# state state array +# cp content particle +# rep repetition +# +# Results: +# state is modified: end element of state is appended. + +proc sgml::CModelSTcpAdd {state cp rep} { + upvar #0 $state var + + if {[llength $var(stack)]} { + set top [lindex $var(stack) end] + lappend top [list $rep $cp] + set var(stack) [lreplace $var(stack) end end $top] + } else { + set var(stack) [list $rep $cp] + } +} + +# sgml::CModelSTopenParen -- +# +# Processes a '(' in a content model spec. +# +# Arguments: +# state state array +# +# Results: +# Pushes stack in state array. + +proc sgml::CModelSTopenParen {state args} { + upvar #0 $state var + + if {[llength $args]} { + return -code error "syntax error in specification: \"$args\"" + } + + lappend var(state) start + lappend var(stack) [list {} {}] +} + +# sgml::CModelSTcloseParen -- +# +# Processes a ')' in a content model spec. +# +# Arguments: +# state state array +# rep repetition +# cs choice or sequence delimiter +# +# Results: +# Stack is popped, and former top of stack is appended to previous element. + +proc sgml::CModelSTcloseParen {state rep cs args} { + upvar #0 $state var + + if {[llength $args]} { + return -code error "syntax error in specification: \"$args\"" + } + + set cp [lindex $var(stack) end] + set var(stack) [lreplace $var(stack) end end] + set var(state) [lreplace $var(state) end end] + CModelSTcp $state $cp $rep $cs +} + +# sgml::CModelMakeTransitionTable -- +# +# Given a content model's syntax tree, constructs +# the transition table for the regular expression. +# +# See "Compilers, Principles, Techniques, and Tools", +# Aho, Sethi and Ullman. Section 3.9, algorithm 3.5. +# +# Arguments: +# state state array variable +# st syntax tree +# +# Results: +# The transition table is returned, as a key/value Tcl list. + +proc sgml::CModelMakeTransitionTable {state st} { + upvar #0 $state var + + # Construct nullable, firstpos and lastpos functions + array set var {number 0} + foreach {nullable firstpos lastpos} [ \ + TraverseDepth1st $state $st { + # Evaluated for leaf nodes + # Compute nullable(n) + # Compute firstpos(n) + # Compute lastpos(n) + set nullable [nullable leaf $rep $name] + set firstpos [list {} $var(number)] + set lastpos [list {} $var(number)] + set var(pos:$var(number)) $name + } { + # Evaluated for nonterminal nodes + # Compute nullable, firstpos, lastpos + set firstpos [firstpos $cs $firstpos $nullable] + set lastpos [lastpos $cs $lastpos $nullable] + set nullable [nullable nonterm $rep $cs $nullable] + } \ + ] break + + set accepting [incr var(number)] + set var(pos:$accepting) # + + # var(pos:N) maps from position to symbol. + # Construct reverse map for convenience. + # NB. A symbol may appear in more than one position. + # var is about to be reset, so use different arrays. + + foreach {pos symbol} [array get var pos:*] { + set pos [lindex [split $pos :] 1] + set pos2symbol($pos) $symbol + lappend sym2pos($symbol) $pos + } + + # Construct the followpos functions + catch {unset var} + followpos $state $st $firstpos $lastpos + + # Construct transition table + # Dstates is [union $marked $unmarked] + set unmarked [list [lindex $firstpos 1]] + while {[llength $unmarked]} { + set T [lindex $unmarked 0] + lappend marked $T + set unmarked [lrange $unmarked 1 end] + + # Find which input symbols occur in T + set symbols {} + foreach pos $T { + if {$pos != $accepting && [lsearch $symbols $pos2symbol($pos)] < 0} { + lappend symbols $pos2symbol($pos) + } + } + foreach a $symbols { + set U {} + foreach pos $sym2pos($a) { + if {[lsearch $T $pos] >= 0} { + # add followpos($pos) + if {$var($pos) == {}} { + lappend U $accepting + } else { + eval lappend U $var($pos) + } + } + } + set U [makeSet $U] + if {[llength $U] && [lsearch $marked $U] < 0 && [lsearch $unmarked $U] < 0} { + lappend unmarked $U + } + set Dtran($T,$a) $U + } + + } + + return [list [array get Dtran] [array get sym2pos] $accepting] +} + +# sgml::followpos -- +# +# Compute the followpos function, using the already computed +# firstpos and lastpos. +# +# Arguments: +# state array variable to store followpos functions +# st syntax tree +# firstpos firstpos functions for the syntax tree +# lastpos lastpos functions +# +# Results: +# followpos functions for each leaf node, in name/value format + +proc sgml::followpos {state st firstpos lastpos} { + upvar #0 $state var + + switch -- [lindex [lindex $st 1] 0] { + :seq { + for {set i 1} {$i < [llength [lindex $st 1]]} {incr i} { + followpos $state [lindex [lindex $st 1] $i] \ + [lindex [lindex $firstpos 0] [expr $i - 1]] \ + [lindex [lindex $lastpos 0] [expr $i - 1]] + foreach pos [lindex [lindex [lindex $lastpos 0] [expr $i - 1]] 1] { + eval lappend var($pos) [lindex [lindex [lindex $firstpos 0] $i] 1] + set var($pos) [makeSet $var($pos)] + } + } + } + :choice { + for {set i 1} {$i < [llength [lindex $st 1]]} {incr i} { + followpos $state [lindex [lindex $st 1] $i] \ + [lindex [lindex $firstpos 0] [expr $i - 1]] \ + [lindex [lindex $lastpos 0] [expr $i - 1]] + } + } + default { + # No action at leaf nodes + } + } + + switch -- [lindex $st 0] { + ? { + # We having nothing to do here ! Doing the same as + # for * effectively converts this qualifier into the other. + } + * { + foreach pos [lindex $lastpos 1] { + eval lappend var($pos) [lindex $firstpos 1] + set var($pos) [makeSet $var($pos)] + } + } + } + +} + +# sgml::TraverseDepth1st -- +# +# Perform depth-first traversal of a tree. +# A new tree is constructed, with each node computed by f. +# +# Arguments: +# state state array variable +# t The tree to traverse, a Tcl list +# leaf Evaluated at a leaf node +# nonTerm Evaluated at a nonterminal node +# +# Results: +# A new tree is returned. + +proc sgml::TraverseDepth1st {state t leaf nonTerm} { + upvar #0 $state var + + set nullable {} + set firstpos {} + set lastpos {} + + switch -- [lindex [lindex $t 1] 0] { + :seq - + :choice { + set rep [lindex $t 0] + set cs [lindex [lindex $t 1] 0] + + foreach child [lrange [lindex $t 1] 1 end] { + foreach {childNullable childFirstpos childLastpos} \ + [TraverseDepth1st $state $child $leaf $nonTerm] break + lappend nullable $childNullable + lappend firstpos $childFirstpos + lappend lastpos $childLastpos + } + + eval $nonTerm + } + default { + incr var(number) + set rep [lindex [lindex $t 0] 0] + set name [lindex [lindex $t 1] 0] + eval $leaf + } + } + + return [list $nullable $firstpos $lastpos] +} + +# sgml::firstpos -- +# +# Computes the firstpos function for a nonterminal node. +# +# Arguments: +# cs node type, choice or sequence +# firstpos firstpos functions for the subtree +# nullable nullable functions for the subtree +# +# Results: +# firstpos function for this node is returned. + +proc sgml::firstpos {cs firstpos nullable} { + switch -- $cs { + :seq { + set result [lindex [lindex $firstpos 0] 1] + for {set i 0} {$i < [llength $nullable]} {incr i} { + if {[lindex [lindex $nullable $i] 1]} { + eval lappend result [lindex [lindex $firstpos [expr $i + 1]] 1] + } else { + break + } + } + } + :choice { + foreach child $firstpos { + eval lappend result $child + } + } + } + + return [list $firstpos [makeSet $result]] +} + +# sgml::lastpos -- +# +# Computes the lastpos function for a nonterminal node. +# Same as firstpos, only logic is reversed +# +# Arguments: +# cs node type, choice or sequence +# lastpos lastpos functions for the subtree +# nullable nullable functions forthe subtree +# +# Results: +# lastpos function for this node is returned. + +proc sgml::lastpos {cs lastpos nullable} { + switch -- $cs { + :seq { + set result [lindex [lindex $lastpos end] 1] + for {set i [expr [llength $nullable] - 1]} {$i >= 0} {incr i -1} { + if {[lindex [lindex $nullable $i] 1]} { + eval lappend result [lindex [lindex $lastpos $i] 1] + } else { + break + } + } + } + :choice { + foreach child $lastpos { + eval lappend result $child + } + } + } + + return [list $lastpos [makeSet $result]] +} + +# sgml::makeSet -- +# +# Turn a list into a set, ie. remove duplicates. +# +# Arguments: +# s a list +# +# Results: +# A set is returned, which is a list with duplicates removed. + +proc sgml::makeSet s { + foreach r $s { + if {[llength $r]} { + set unique($r) {} + } + } + return [array names unique] +} + +# sgml::nullable -- +# +# Compute the nullable function for a node. +# +# Arguments: +# nodeType leaf or nonterminal +# rep repetition applying to this node +# name leaf node: symbol for this node, nonterm node: choice or seq node +# subtree nonterm node: nullable functions for the subtree +# +# Results: +# Returns nullable function for this branch of the tree. + +proc sgml::nullable {nodeType rep name {subtree {}}} { + switch -glob -- $rep:$nodeType { + :leaf - + +:leaf { + return [list {} 0] + } + \\*:leaf - + \\?:leaf { + return [list {} 1] + } + \\*:nonterm - + \\?:nonterm { + return [list $subtree 1] + } + :nonterm - + +:nonterm { + switch -- $name { + :choice { + set result 0 + foreach child $subtree { + set result [expr $result || [lindex $child 1]] + } + } + :seq { + set result 1 + foreach child $subtree { + set result [expr $result && [lindex $child 1]] + } + } + } + return [list $subtree $result] + } + } +} + +# sgml::DTD:ATTLIST -- +# +# <!ATTLIST ...> defines an attribute list. +# +# Arguments: +# opts configuration opions +# name Element GI +# attspec unparsed attribute definitions +# +# Results: +# Attribute list variables are modified. + +proc sgml::DTD:ATTLIST {opts name attspec} { + variable attlist_exp + variable attlist_enum_exp + variable attlist_fixed_exp + + array set options $opts + + # Parse the attribute list. If it were regular, could just use foreach, + # but some attributes may have values. + regsub -all {([][$\\])} $attspec {\\\1} attspec + regsub -all $attlist_exp $attspec "\}\nDTDAttribute {$options(-attlistdeclcommand)} $name $options(attlistdecls) {\\1} {\\2} {\\3} {} \{" attspec + regsub -all $attlist_enum_exp $attspec "\}\nDTDAttribute {$options(-attlistdeclcommand)} $name $options(attlistdecls) {\\1} {\\2} {} {\\4} \{" attspec + regsub -all $attlist_fixed_exp $attspec "\}\nDTDAttribute {$options(-attlistdeclcommand)} $name $options(attlistdecls) {\\1} {\\2} {\\3} {\\4} \{" attspec + + eval "noop \{$attspec\}" + + return {} +} + +# sgml::DTDAttribute -- +# +# Parse definition of a single attribute. +# +# Arguments: +# callback attribute defn callback +# name element name +# var array variable +# att attribute name +# type type of this attribute +# default default value of the attribute +# value other information +# text other text (should be empty) +# +# Results: +# Attribute defn added to array, unless it already exists + +proc sgml::DTDAttribute args { + # BUG: Some problems with parameter passing - deal with it later + foreach {callback name var att type default value text} $args break + + upvar #0 $var atts + + if {[string length [string trim $text]]} { + return -code error "unexpected text \"$text\" in attribute definition" + } + + # What about overridden attribute defns? + # A non-validating app may want to know about them + # (eg. an editor) + if {![info exists atts($name/$att)]} { + set atts($name/$att) [list $type $default $value] + uplevel #0 $callback [list $name $att $type $default $value] + } + + return {} +} + +# sgml::DTD:ENTITY -- +# +# <!ENTITY ...> declaration. +# +# Callbacks: +# -entitydeclcommand for general entity declaration +# -unparsedentitydeclcommand for unparsed external entity declaration +# -parameterentitydeclcommand for parameter entity declaration +# +# Arguments: +# opts configuration options +# name name of entity being defined +# param whether a parameter entity is being defined +# value unparsed replacement text +# +# Results: +# Modifies the caller's entities array variable + +proc sgml::DTD:ENTITY {opts name param value} { + + array set options $opts + + if {[string compare % $param]} { + # Entity declaration - general or external + upvar #0 $options(entities) ents + upvar #0 $options(extentities) externals + + if {[info exists ents($name)] || [info exists externals($name)]} { + eval $options(-warningcommand) entity [list "entity \"$name\" already declared"] + } else { + if {[catch {uplevel #0 $options(-parseentitydeclcommand) [list $value]} value]} { + return -code error "unable to parse entity declaration due to \"$value\"" + } + switch -glob [lindex $value 0],[lindex $value 3] { + internal, { + set ents($name) [EntitySubst [array get options] [lindex $value 1]] + uplevel #0 $options(-entitydeclcommand) [list $name $ents($name)] + } + internal,* { + return -code error "unexpected NDATA declaration" + } + external, { + set externals($name) [lrange $value 1 2] + uplevel #0 $options(-entitydeclcommand) [eval list $name [lrange $value 1 2]] + } + external,* { + set externals($name) [lrange $value 1 3] + uplevel #0 $options(-unparsedentitydeclcommand) [eval list $name [lrange $value 1 3]] + } + default { + return -code error "internal error: unexpected parser state" + } + } + } + } else { + # Parameter entity declaration + upvar #0 $options(parameterentities) PEnts + upvar #0 $options(externalparameterentities) ExtPEnts + + if {[info exists PEnts($name)] || [info exists ExtPEnts($name)]} { + eval $options(-warningcommand) parameterentity [list "parameter entity \"$name\" already declared"] + } else { + if {[catch {uplevel #0 $options(-parseentitydeclcommand) [list $value]} value]} { + return -code error "unable to parse parameter entity declaration due to \"$value\"" + } + if {[string length [lindex $value 3]]} { + return -code error "NDATA illegal in parameter entity declaration" + } + switch [lindex $value 0] { + internal { + # Substitute character references and PEs (XML: 4.5) + set value [EntitySubst [array get options] [lindex $value 1]] + + set PEnts($name) $value + uplevel #0 $options(-parameterentitydeclcommand) [list $name $value] + } + external - + default { + # Get the replacement text now. + # Could wait until the first reference, but easier + # to just do it now. + + set token [uri::geturl [uri::resolve $options(-baseuri) [lindex $value 1]]] + + set ExtPEnts($name) [lindex [array get $token data] 1] + uplevel #0 $options(-parameterentitydeclcommand) [eval list $name [lrange $value 1 2]] + } + } + } + } +} + +# sgml::EntitySubst -- +# +# Perform entity substitution on an entity replacement text. +# This differs slightly from other substitution procedures, +# because only parameter and character entity substitution +# is performed, not general entities. +# See XML Rec. section 4.5. +# +# Arguments: +# opts configuration options +# value Literal entity value +# +# Results: +# Expanded replacement text + +proc sgml::EntitySubst {opts value} { + array set options $opts + + # Protect Tcl special characters + regsub -all {([{}\\])} $value {\\\1} value + + # Find entity references + regsub -all (&#\[0-9\]+|&#x\[0-9a-fA-F\]+|%${::sgml::Name})\; $value "\[EntitySubstValue [list $options(parameterentities)] {\\1}\]" value + + set result [subst $value] + + return $result +} + +# sgml::EntitySubstValue -- +# +# Handle a single character or parameter entity substitution +# +# Arguments: +# PEvar array variable containing PE declarations +# ref character or parameter entity reference +# +# Results: +# Replacement text + +proc sgml::EntitySubstValue {PEvar ref} { + # SRB: Bug fix 2008-11-18 #812051: surround case labels in braces for compatibility with Freewrap + switch -glob -- $ref { + {&#x*} { + scan [string range $ref 3 end] %x hex + return [format %c $hex] + } + {&#*} { + return [format %c [string range $ref 2 end]] + } + {%*} { + upvar #0 $PEvar PEs + set ref [string range $ref 1 end] + if {[info exists PEs($ref)]} { + return $PEs($ref) + } else { + return -code error "parameter entity \"$ref\" not declared" + } + } + default { + return -code error "internal error - unexpected entity reference" + } + } + return {} +} + +# sgml::DTD:NOTATION -- +# +# Process notation declaration +# +# Arguments: +# opts configuration options +# name notation name +# value unparsed notation spec + +proc sgml::DTD:NOTATION {opts name value} { + return {} + + variable notation_exp + upvar opts state + + if {[regexp $notation_exp $value x scheme data] == 2} { + } else { + eval $state(-errorcommand) [list notationvalue "notation value \"$value\" incorrectly specified"] + } +} + +# sgml::ResolveEntity -- +# +# Default entity resolution routine +# +# Arguments: +# cmd command of parent parser +# base base URL for relative URLs +# sysId system identifier +# pubId public identifier + +proc sgml::ResolveEntity {cmd base sysId pubId} { + variable ParseEventNum + + if {[catch {uri::resolve $base $sysId} url]} { + return -code error "unable to resolve system identifier \"$sysId\"" + } + if {[catch {uri::geturl $url} token]} { + return -code error "unable to retrieve external entity \"$url\" for system identifier \"$sysId\"" + } + + upvar #0 $token data + + set parser [uplevel #0 $cmd entityparser] + + set body {} + catch {set body $data(body)} + catch {set body $data(data)} + if {[string length $body]} { + uplevel #0 $parser parse [list $body] -dtdsubset external + } + $parser free + + return {} +} diff --git a/tclxml/tclxml-tcl/tclparser-8.0.tcl b/tclxml/tclxml-tcl/tclparser-8.0.tcl new file mode 100755 index 0000000..e2573f8 --- /dev/null +++ b/tclxml/tclxml-tcl/tclparser-8.0.tcl @@ -0,0 +1,359 @@ +# tclparser-8.0.tcl -- +# +# This file provides a Tcl implementation of a XML parser. +# This file supports Tcl 8.0. +# +# See xml-8.[01].tcl for definitions of character sets and +# regular expressions. +# +# Copyright (c) 2005-2008 by Explain. +# http://www.explain.com.au/ +# Copyright (c) 1998-2004 Zveno Pty Ltd +# http://www.zveno.com/ +# +# See the file "LICENSE" in this distribution for information on usage and +# redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES. +# +# $Id: tclparser-8.0.tcl,v 1.1.1.1 2009/01/16 22:11:49 joye Exp $ + +package require -exact Tcl 8.0 + +package require xmldefs 3.2 + +package require sgmlparser 1.0 + +package provide xml::tclparser 3.2 + +namespace eval xml { + + # Procedures for parsing XML documents + namespace export parser + # Procedures for parsing XML DTDs + namespace export DTDparser + + # Counter for creating unique parser objects + variable ParserCounter 0 + +} + +# xml::parser -- +# +# Creates XML parser object. +# +# Arguments: +# args Unique name for parser object +# plus option/value pairs +# +# Recognised Options: +# -final Indicates end of document data +# -elementstartcommand Called when an element starts +# -elementendcommand Called when an element ends +# -characterdatacommand Called when character data occurs +# -processinginstructioncommand Called when a PI occurs +# -externalentityrefcommand Called for an external entity reference +# +# (Not compatible with expat) +# -xmldeclcommand Called when the XML declaration occurs +# -doctypecommand Called when the document type declaration occurs +# +# -errorcommand Script to evaluate for a fatal error +# -warningcommand Script to evaluate for a reportable warning +# -statevariable global state variable +# -reportempty whether to provide empty element indication +# +# Results: +# The state variable is initialised. + +proc xml::parser {args} { + variable ParserCounter + + if {[llength $args] > 0} { + set name [lindex $args 0] + set args [lreplace $args 0 0] + } else { + set name parser[incr ParserCounter] + } + + if {[info command [namespace current]::$name] != {}} { + return -code error "unable to create parser object \"[namespace current]::$name\" command" + } + + # Initialise state variable and object command + upvar \#0 [namespace current]::$name parser + set sgml_ns [namespace parent]::sgml + array set parser [list name $name \ + -final 1 \ + -elementstartcommand ${sgml_ns}::noop \ + -elementendcommand ${sgml_ns}::noop \ + -characterdatacommand ${sgml_ns}::noop \ + -processinginstructioncommand ${sgml_ns}::noop \ + -externalentityrefcommand ${sgml_ns}::noop \ + -xmldeclcommand ${sgml_ns}::noop \ + -doctypecommand ${sgml_ns}::noop \ + -warningcommand ${sgml_ns}::noop \ + -statevariable [namespace current]::$name \ + -reportempty 0 \ + internaldtd {} \ + ] + + proc [namespace current]::$name {method args} \ + "eval ParseCommand $name \$method \$args" + + eval ParseCommand [list $name] configure $args + + return [namespace current]::$name +} + +# xml::ParseCommand -- +# +# Handles parse object command invocations +# +# Valid Methods: +# cget +# configure +# parse +# reset +# +# Arguments: +# parser parser object +# method minor command +# args other arguments +# +# Results: +# Depends on method + +proc xml::ParseCommand {parser method args} { + upvar \#0 [namespace current]::$parser state + + switch -- $method { + cget { + return $state([lindex $args 0]) + } + configure { + foreach {opt value} $args { + set state($opt) $value + } + } + parse { + ParseCommand_parse $parser [lindex $args 0] + } + reset { + if {[llength $args]} { + return -code error "too many arguments" + } + ParseCommand_reset $parser + } + default { + return -code error "unknown method \"$method\"" + } + } + + return {} +} + +# xml::ParseCommand_parse -- +# +# Parses document instance data +# +# Arguments: +# object parser object +# xml data +# +# Results: +# Callbacks are invoked, if any are defined + +proc xml::ParseCommand_parse {object xml} { + upvar \#0 [namespace current]::$object parser + variable Wsp + variable tokExpr + variable substExpr + + set parent [namespace parent] + if {![string compare :: $parent]} { + set parent {} + } + + set tokenised [lrange \ + [${parent}::sgml::tokenise $xml \ + $tokExpr \ + $substExpr \ + -internaldtdvariable [namespace current]::${object}(internaldtd)] \ + 4 end] + + eval ${parent}::sgml::parseEvent \ + [list $tokenised \ + -emptyelement [namespace code ParseEmpty] \ + -parseattributelistcommand [namespace code ParseAttrs]] \ + [array get parser -*command] \ + [array get parser -entityvariable] \ + [array get parser -reportempty] \ + [array get parser -final] \ + -normalize 0 \ + -internaldtd [list $parser(internaldtd)] + + return {} +} + +# xml::ParseEmpty -- Tcl 8.0 version +# +# Used by parser to determine whether an element is empty. +# This should be dead easy in XML. The only complication is +# that the RE above can't catch the trailing slash, so we have +# to dig it out of the tag name or attribute list. +# +# Tcl 8.1 REs should fix this. +# +# Arguments: +# tag element name +# attr attribute list (raw) +# e End tag delimiter. +# +# Results: +# "/" if the trailing slash is found. Optionally, return a list +# containing new values for the tag name and/or attribute list. + +proc xml::ParseEmpty {tag attr e} { + + if {[string match */ [string trimright $tag]] && \ + ![string length $attr]} { + regsub {/$} $tag {} tag + return [list / $tag $attr] + } elseif {[string match */ [string trimright $attr]]} { + regsub {/$} [string trimright $attr] {} attr + return [list / $tag $attr] + } else { + return {} + } + +} + +# xml::ParseAttrs -- +# +# Parse element attributes. +# +# There are two forms for name-value pairs: +# +# name="value" +# name='value' +# +# Watch out for the trailing slash on empty elements. +# +# Arguments: +# attrs attribute string given in a tag +# +# Results: +# Returns a Tcl list representing the name-value pairs in the +# attribute string + +proc xml::ParseAttrs attrs { + variable Wsp + variable Name + + # First check whether there's any work to do + if {![string compare {} [string trim $attrs]]} { + return {} + } + + # Strip the trailing slash on empty elements + regsub [format {/[%s]*$} " \t\n\r"] $attrs {} atList + + set mode name + set result {} + foreach component [split $atList =] { + switch $mode { + name { + set component [string trim $component] + if {[regexp $Name $component]} { + lappend result $component + } else { + return -code error "invalid attribute name \"$component\"" + } + set mode value:start + } + value:start { + set component [string trimleft $component] + set delimiter [string index $component 0] + set value {} + switch -- $delimiter { + \" - + ' { + if {[regexp [format {%s([^%s]*)%s(.*)} $delimiter $delimiter $delimiter] $component discard value remainder]} { + lappend result $value + set remainder [string trim $remainder] + if {[string length $remainder]} { + if {[regexp $Name $remainder]} { + lappend result $remainder + set mode value:start + } else { + return -code error "invalid attribute name \"$remainder\"" + } + } else { + set mode end + } + } else { + set value [string range $component 1 end] + set mode value:continue + } + } + default { + return -code error "invalid value for attribute \"[lindex $result end]\"" + } + } + } + value:continue { + if {[regexp [format {([^%s]*)%s(.*)} $delimiter $delimiter] $component discard valuepart remainder]} { + append value = $valuepart + lappend result $value + set remainder [string trim $remainder] + if {[string length $remainder]} { + if {[regexp $Name $remainder]} { + lappend result $remainder + set mode value:start + } else { + return -code error "invalid attribute name \"$remainder\"" + } + } else { + set mode end + } + } else { + append value = $component + } + } + end { + return -code error "unexpected data found after end of attribute list" + } + } + } + + switch $mode { + name - + end { + # This is normal + } + default { + return -code error "unexpected end of attribute list" + } + } + + return $result +} + +# xml::ParseCommand_reset -- +# +# Initialize parser data +# +# Arguments: +# object parser object +# +# Results: +# Parser data structure initialised + +proc xml::ParseCommand_reset object { + upvar \#0 [namespace current]::$object parser + + array set parser [list \ + -final 1 \ + internaldtd {} \ + ] +} + diff --git a/tclxml/tclxml-tcl/tclparser-8.1.tcl b/tclxml/tclxml-tcl/tclparser-8.1.tcl new file mode 100755 index 0000000..40a0af9 --- /dev/null +++ b/tclxml/tclxml-tcl/tclparser-8.1.tcl @@ -0,0 +1,614 @@ +# tclparser-8.1.tcl -- +# +# This file provides a Tcl implementation of a XML parser. +# This file supports Tcl 8.1. +# +# See xml-8.[01].tcl for definitions of character sets and +# regular expressions. +# +# Copyright (c) 2005-2008 by Explain. +# http://www.explain.com.au/ +# Copyright (c) 1998-2003 Zveno Pty Ltd +# http://www.zveno.com/ +# +# See the file "LICENSE" in this distribution for information on usage and +# redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES. +# +# $Id: tclparser-8.1.tcl,v 1.1.1.1 2009/01/16 22:11:49 joye Exp $ + +package require Tcl 8.1 + +package provide xml::tclparser 3.2 + +package require xmldefs 3.2 + +package require sgmlparser 1.0 + +namespace eval xml::tclparser { + + namespace export create createexternal externalentity parse configure get delete + + # Tokenising expressions + + variable tokExpr $::xml::tokExpr + variable substExpr $::xml::substExpr + + # Register this parser class + + ::xml::parserclass create tcl \ + -createcommand [namespace code create] \ + -createentityparsercommand [namespace code createentityparser] \ + -parsecommand [namespace code parse] \ + -configurecommand [namespace code configure] \ + -deletecommand [namespace code delete] \ + -resetcommand [namespace code reset] +} + +# xml::tclparser::create -- +# +# Creates XML parser object. +# +# Arguments: +# name unique identifier for this instance +# +# Results: +# The state variable is initialised. + +proc xml::tclparser::create name { + + # Initialise state variable + upvar \#0 [namespace current]::$name parser + array set parser [list -name $name \ + -cmd [uplevel 3 namespace current]::$name \ + -final 1 \ + -validate 0 \ + -statevariable [namespace current]::$name \ + -baseuri {} \ + internaldtd {} \ + entities [namespace current]::Entities$name \ + extentities [namespace current]::ExtEntities$name \ + parameterentities [namespace current]::PEntities$name \ + externalparameterentities [namespace current]::ExtPEntities$name \ + elementdecls [namespace current]::ElDecls$name \ + attlistdecls [namespace current]::AttlistDecls$name \ + notationdecls [namespace current]::NotDecls$name \ + depth 0 \ + leftover {} \ + ] + + # Initialise entities with predefined set + array set [namespace current]::Entities$name [array get ::sgml::EntityPredef] + + return $parser(-cmd) +} + +# xml::tclparser::createentityparser -- +# +# Creates XML parser object for an entity. +# +# Arguments: +# name name for the new parser +# parent name of parent parser +# +# Results: +# The state variable is initialised. + +proc xml::tclparser::createentityparser {parent name} { + upvar #0 [namespace current]::$parent p + + # Initialise state variable + upvar \#0 [namespace current]::$name external + array set external [array get p] + + regsub $parent $p(-cmd) {} parentns + + array set external [list -name $name \ + -cmd $parentns$name \ + -statevariable [namespace current]::$name \ + internaldtd {} \ + line 0 \ + ] + incr external(depth) + + return $external(-cmd) +} + +# xml::tclparser::configure -- +# +# Configures a XML parser object. +# +# Arguments: +# name unique identifier for this instance +# args option name/value pairs +# +# Results: +# May change values of config options + +proc xml::tclparser::configure {name args} { + upvar \#0 [namespace current]::$name parser + + # BUG: very crude, no checks for illegal args + # Mats: Should be synced with sgmlparser.tcl + set options {-elementstartcommand -elementendcommand \ + -characterdatacommand -processinginstructioncommand \ + -externalentitycommand -xmldeclcommand \ + -doctypecommand -commentcommand \ + -entitydeclcommand -unparsedentitydeclcommand \ + -parameterentitydeclcommand -notationdeclcommand \ + -elementdeclcommand -attlistdeclcommand \ + -paramentityparsing -defaultexpandinternalentities \ + -startdoctypedeclcommand -enddoctypedeclcommand \ + -entityreferencecommand -warningcommand \ + -defaultcommand -unknownencodingcommand -notstandalonecommand \ + -startcdatasectioncommand -endcdatasectioncommand \ + -errorcommand -final \ + -validate -baseuri -baseurl \ + -name -cmd -emptyelement \ + -parseattributelistcommand -parseentitydeclcommand \ + -normalize -internaldtd -dtdsubset \ + -reportempty -ignorewhitespace \ + -reportempty \ + } + set usage [join $options ", "] + regsub -all -- - $options {} options + set pat ^-([join $options |])$ + foreach {flag value} $args { + if {[regexp $pat $flag]} { + # Validate numbers + if {[info exists parser($flag)] && \ + [string is integer -strict $parser($flag)] && \ + ![string is integer -strict $value]} { + return -code error "Bad value for $flag ($value), must be integer" + } + set parser($flag) $value + } else { + return -code error "Unknown option $flag, can be: $usage" + } + } + + # Backward-compatibility: -baseuri is a synonym for -baseurl + catch {set parser(-baseuri) $parser(-baseurl)} + + return {} +} + +# xml::tclparser::parse -- +# +# Parses document instance data +# +# Arguments: +# name parser object +# xml data +# args configuration options +# +# Results: +# Callbacks are invoked + +proc xml::tclparser::parse {name xml args} { + + array set options $args + upvar \#0 [namespace current]::$name parser + variable tokExpr + variable substExpr + + # Mats: + if {[llength $args]} { + eval {configure $name} $args + } + + set parseOptions [list \ + -emptyelement [namespace code ParseEmpty] \ + -parseattributelistcommand [namespace code ParseAttrs] \ + -parseentitydeclcommand [namespace code ParseEntity] \ + -normalize 0] + eval lappend parseOptions \ + [array get parser -*command] \ + [array get parser -reportempty] \ + [array get parser -ignorewhitespace] \ + [array get parser -name] \ + [array get parser -cmd] \ + [array get parser -baseuri] \ + [array get parser -validate] \ + [array get parser -final] \ + [array get parser -defaultexpandinternalentities] \ + [array get parser entities] \ + [array get parser extentities] \ + [array get parser parameterentities] \ + [array get parser externalparameterentities] \ + [array get parser elementdecls] \ + [array get parser attlistdecls] \ + [array get parser notationdecls] + + # Mats: + # If -final 0 we also need to maintain the state with a -statevariable ! + if {!$parser(-final)} { + eval lappend parseOptions [array get parser -statevariable] + } + + set dtdsubset no + catch {set dtdsubset $options(-dtdsubset)} + switch -- $dtdsubset { + internal { + # Bypass normal parsing + lappend parseOptions -statevariable $parser(-statevariable) + array set intOptions [array get ::sgml::StdOptions] + array set intOptions $parseOptions + ::sgml::ParseDTD:Internal [array get intOptions] $xml + return {} + } + external { + # Bypass normal parsing + lappend parseOptions -statevariable $parser(-statevariable) + array set intOptions [array get ::sgml::StdOptions] + array set intOptions $parseOptions + ::sgml::ParseDTD:External [array get intOptions] $xml + return {} + } + default { + # Pass through to normal processing + } + } + + lappend tokenOptions \ + -internaldtdvariable [namespace current]::${name}(internaldtd) + + # Mats: If -final 0 we also need to maintain the state with a -statevariable ! + if {!$parser(-final)} { + eval lappend tokenOptions [array get parser -statevariable] \ + [array get parser -final] + } + + # Mats: + # Why not the first four? Just padding? Lrange undos \n interp. + # It is necessary to have the first four as well if chopped off in + # middle of pcdata. + set tokenised [lrange \ + [eval {::sgml::tokenise $xml $tokExpr $substExpr} $tokenOptions] \ + 0 end] + + lappend parseOptions -internaldtd [list $parser(internaldtd)] + eval ::sgml::parseEvent [list $tokenised] $parseOptions + + return {} +} + +# xml::tclparser::ParseEmpty -- Tcl 8.1+ version +# +# Used by parser to determine whether an element is empty. +# This is usually dead easy in XML, but as always not quite. +# Have to watch out for empty element syntax +# +# Arguments: +# tag element name +# attr attribute list (raw) +# e End tag delimiter. +# +# Results: +# Return value of e + +proc xml::tclparser::ParseEmpty {tag attr e} { + switch -glob [string length $e],[regexp "/[::xml::cl $::xml::Wsp]*$" $attr] { + 0,0 { + return {} + } + 0,* { + return / + } + default { + return $e + } + } +} + +# xml::tclparser::ParseAttrs -- Tcl 8.1+ version +# +# Parse element attributes. +# +# There are two forms for name-value pairs: +# +# name="value" +# name='value' +# +# Arguments: +# opts parser options +# attrs attribute string given in a tag +# +# Results: +# Returns a Tcl list representing the name-value pairs in the +# attribute string +# +# A ">" occurring in the attribute list causes problems when parsing +# the XML. This manifests itself by an unterminated attribute value +# and a ">" appearing the element text. +# In this case return a three element list; +# the message "unterminated attribute value", the attribute list it +# did manage to parse and the remainder of the attribute list. + +proc xml::tclparser::ParseAttrs {opts attrs} { + + set result {} + + while {[string length [string trim $attrs]]} { + if {[regexp [::sgml::cl $::xml::Wsp]*($::xml::Name)[::sgml::cl $::xml::Wsp]*=[::sgml::cl $::xml::Wsp]*("|')([::sgml::cl ^<]*?)\\2(.*) $attrs discard attrName delimiter value attrs]} { + lappend result $attrName [NormalizeAttValue $opts $value] + } elseif {[regexp [::sgml::cl $::xml::Wsp]*$::xml::Name[::sgml::cl $::xml::Wsp]*=[::sgml::cl $::xml::Wsp]*("|')[::sgml::cl ^<]*\$ $attrs]} { + return -code error [list {unterminated attribute value} $result $attrs] + } else { + return -code error "invalid attribute list" + } + } + + return $result +} + +# xml::tclparser::NormalizeAttValue -- +# +# Perform attribute value normalisation. This involves: +# . character references are appended to the value +# . entity references are recursively processed and replacement value appended +# . whitespace characters cause a space to be appended +# . other characters appended as-is +# +# Arguments: +# opts parser options +# value unparsed attribute value +# +# Results: +# Normalised value returned. + +proc xml::tclparser::NormalizeAttValue {opts value} { + + # sgmlparser already has backslashes protected + # Protect Tcl specials + regsub -all {([][$])} $value {\\\1} value + + # Deal with white space + regsub -all "\[$::xml::Wsp\]" $value { } value + + # Find entity refs + regsub -all {&([^;]+);} $value {[NormalizeAttValue:DeRef $opts {\1}]} value + + return [subst $value] +} + +# xml::tclparser::NormalizeAttValue:DeRef -- +# +# Handler to normalize attribute values +# +# Arguments: +# opts parser options +# ref entity reference +# +# Results: +# Returns character + +proc xml::tclparser::NormalizeAttValue:DeRef {opts ref} { + # SRB: Bug fix 2008-11-18 #812051: surround case labels in braces for compatibility with Freewrap + switch -glob -- $ref { + {#x*} { + scan [string range $ref 2 end] %x value + set char [format %c $value] + # Check that the char is legal for XML + if {[regexp [format {^[%s]$} $::xml::Char] $char]} { + return $char + } else { + return -code error "illegal character" + } + } + {#*} { + scan [string range $ref 1 end] %d value + set char [format %c $value] + # Check that the char is legal for XML + if {[regexp [format {^[%s]$} $::xml::Char] $char]} { + return $char + } else { + return -code error "illegal character" + } + } + lt - + gt - + amp - + quot - + apos { + array set map {lt < gt > amp & quot \" apos '} + return $map($ref) + } + default { + # A general entity. Must resolve to a text value - no element structure. + + array set options $opts + upvar #0 $options(entities) map + + if {[info exists map($ref)]} { + + if {[regexp < $map($ref)]} { + return -code error "illegal character \"<\" in attribute value" + } + + if {![regexp & $map($ref)]} { + # Simple text replacement + return $map($ref) + } + + # There are entity references in the replacement text. + # Can't use child entity parser since must catch element structures + + return [NormalizeAttValue $opts $map($ref)] + + } elseif {[string compare $options(-entityreferencecommand) "::sgml::noop"]} { + + set result [uplevel #0 $options(-entityreferencecommand) [list $ref]] + + return $result + + } else { + return -code error "unable to resolve entity reference \"$ref\"" + } + } + } +} + +# xml::tclparser::ParseEntity -- +# +# Parse general entity declaration +# +# Arguments: +# data text to parse +# +# Results: +# Tcl list containing entity declaration + +proc xml::tclparser::ParseEntity data { + set data [string trim $data] + if {[regexp $::sgml::ExternalEntityExpr $data discard type delimiter1 id1 discard delimiter2 id2 optNDATA ndata]} { + switch $type { + PUBLIC { + return [list external $id2 $id1 $ndata] + } + SYSTEM { + return [list external $id1 {} $ndata] + } + } + } elseif {[regexp {^("|')(.*?)\1$} $data discard delimiter value]} { + return [list internal $value] + } else { + return -code error "badly formed entity declaration" + } +} + +# xml::tclparser::delete -- +# +# Destroy parser data +# +# Arguments: +# name parser object +# +# Results: +# Parser data structure destroyed + +proc xml::tclparser::delete name { + upvar \#0 [namespace current]::$name parser + catch {::sgml::ParserDelete $parser(-statevariable)} + catch {unset parser} + return {} +} + +# xml::tclparser::get -- +# +# Retrieve additional information from the parser +# +# Arguments: +# name parser object +# method info to retrieve +# args additional arguments for method +# +# Results: +# Depends on method + +proc xml::tclparser::get {name method args} { + upvar #0 [namespace current]::$name parser + + switch -- $method { + + elementdecl { + switch [llength $args] { + + 0 { + # Return all element declarations + upvar #0 $parser(elementdecls) elements + return [array get elements] + } + + 1 { + # Return specific element declaration + upvar #0 $parser(elementdecls) elements + if {[info exists elements([lindex $args 0])]} { + return [array get elements [lindex $args 0]] + } else { + return -code error "element \"[lindex $args 0]\" not declared" + } + } + + default { + return -code error "wrong number of arguments: should be \"elementdecl ?element?\"" + } + } + } + + attlist { + if {[llength $args] != 1} { + return -code error "wrong number of arguments: should be \"get attlist element\"" + } + + upvar #0 $parser(attlistdecls) + + return {} + } + + entitydecl { + } + + parameterentitydecl { + } + + notationdecl { + } + + default { + return -code error "unknown method \"$method\"" + } + } + + return {} +} + +# xml::tclparser::ExternalEntity -- +# +# Resolve and parse external entity +# +# Arguments: +# name parser object +# base base URL +# sys system identifier +# pub public identifier +# +# Results: +# External entity is fetched and parsed + +proc xml::tclparser::ExternalEntity {name base sys pub} { +} + +# xml::tclparser:: -- +# +# Reset a parser instance, ready to parse another document +# +# Arguments: +# name parser object +# +# Results: +# Variables unset + +proc xml::tclparser::reset {name} { + upvar \#0 [namespace current]::$name parser + + # Has this parser object been properly initialised? + if {![info exists parser] || \ + ![info exists parser(-name)]} { + return [create $name] + } + + array set parser { + -final 1 + depth 0 + leftover {} + } + + foreach var {Entities ExtEntities PEntities ExtPEntities ElDecls AttlistDecls NotDecls} { + catch {unset [namespace current]::${var}$name} + } + + # Initialise entities with predefined set + array set [namespace current]::Entities$name [array get ::sgml::EntityPredef] + + return {} +} diff --git a/tclxml/tclxml-tcl/xml-8.0.tcl b/tclxml/tclxml-tcl/xml-8.0.tcl new file mode 100755 index 0000000..db28423 --- /dev/null +++ b/tclxml/tclxml-tcl/xml-8.0.tcl @@ -0,0 +1,92 @@ +# xml-8.0.tcl -- +# +# This file provides generic XML services for all implementations. +# This file supports Tcl 8.0 regular expressions. +# +# See xmlparse.tcl for the Tcl implementation of a XML parser. +# +# Copyright (c) 2005 by Explain. +# http://www.explain.com.au/ +# Copyright (c) 1998-2004 Zveno Pty Ltd +# http://www.zveno.com/ +# +# See the file "LICENSE" in this distribution for information on usage and +# redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES. +# +# $Id: xml-8.0.tcl,v 1.1.1.1 2009/01/16 22:11:49 joye Exp $ + +package require -exact Tcl 8.0 + +package require sgml 1.8 + +package provide xmldefs 3.2 + +namespace eval xml { + + # Convenience routine + proc cl x { + return "\[$x\]" + } + + # Define various regular expressions + + # Characters + variable Char $::sgml::Char + + # white space + variable Wsp " \t\r\n" + variable noWsp [cl ^$Wsp] + + # Various XML names and tokens + + variable NameChar $::sgml::NameChar + variable Name $::sgml::Name + variable Names $::sgml::Names + variable Nmtoken $::sgml::Nmtoken + variable Nmtokens $::sgml::Nmtokens + + # The definition of the Namespace URI for XML Namespaces themselves. + # The prefix 'xml' is automatically bound to this URI. + variable xmlnsNS http://www.w3.org/XML/1998/namespace + + # Tokenising expressions + + variable tokExpr <(/?)([cl ^$Wsp>/]+)([cl $Wsp]*[cl ^>]*)> + variable substExpr "\}\n{\\2} {\\1} {\\3} \{" + + # table of predefined entities + + variable EntityPredef + array set EntityPredef { + lt < gt > amp & quot \" apos ' + } + +} + +### +### General utility procedures +### + +# xml::noop -- +# +# A do-nothing proc + +proc xml::noop args {} + +### Following procedures are based on html_library + +# xml::zapWhite -- +# +# Convert multiple white space into a single space. +# +# Arguments: +# data plain text +# +# Results: +# As above + +proc xml::zapWhite data { + regsub -all "\[ \t\r\n\]+" $data { } data + return $data +} + diff --git a/tclxml/tclxml-tcl/xml-8.1.tcl b/tclxml/tclxml-tcl/xml-8.1.tcl new file mode 100755 index 0000000..5ec410b --- /dev/null +++ b/tclxml/tclxml-tcl/xml-8.1.tcl @@ -0,0 +1,135 @@ +# xml.tcl -- +# +# This file provides generic XML services for all implementations. +# This file supports Tcl 8.1 regular expressions. +# +# See tclparser.tcl for the Tcl implementation of a XML parser. +# +# Copyright (c) 2005 by Explain. +# http://www.explain.com.au/ +# Copyright (c) 1998-2004 Zveno Pty Ltd +# http://www.zveno.com/ +# +# See the file "LICENSE" in this distribution for information on usage and +# redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES. +# +# $Id: xml-8.1.tcl,v 1.1.1.1 2009/01/16 22:11:49 joye Exp $ + +package require Tcl 8.1 + +package provide xmldefs 3.2 + +package require sgml 1.8 + +namespace eval xml { + + namespace export qnamesplit + + # Convenience routine + proc cl x { + return "\[$x\]" + } + + # Define various regular expressions + + # Characters + variable Char $::sgml::Char + + # white space + variable Wsp " \t\r\n" + variable allWsp [cl $Wsp]* + variable noWsp [cl ^$Wsp] + + # Various XML names and tokens + + variable NameChar $::sgml::NameChar + variable Name $::sgml::Name + variable Names $::sgml::Names + variable Nmtoken $::sgml::Nmtoken + variable Nmtokens $::sgml::Nmtokens + + # XML Namespaces names + + # NCName ::= Name - ':' + variable NCName $::sgml::Name + regsub -all : $NCName {} NCName + variable QName (${NCName}:)?$NCName ;# (Prefix ':')? LocalPart + + # The definition of the Namespace URI for XML Namespaces themselves. + # The prefix 'xml' is automatically bound to this URI. + variable xmlnsNS http://www.w3.org/XML/1998/namespace + + # table of predefined entities + + variable EntityPredef + array set EntityPredef { + lt < gt > amp & quot \" apos ' + } + + # Expressions for pulling things apart + variable tokExpr <(/?)([::xml::cl ^$::xml::Wsp>/]+)([::xml::cl $::xml::Wsp]*[::xml::cl ^>]*)> + variable substExpr "\}\n{\\2} {\\1} {\\3} \{" + +} + +### +### Exported procedures +### + +# xml::qnamesplit -- +# +# Split a QName into its constituent parts: +# the XML Namespace prefix and the Local-name +# +# Arguments: +# qname XML Qualified Name (see XML Namespaces [6]) +# +# Results: +# Returns prefix and local-name as a Tcl list. +# Error condition returned if the prefix or local-name +# are not valid NCNames (XML Name) + +proc xml::qnamesplit qname { + variable NCName + variable Name + + set prefix {} + set localname $qname + if {[regexp : $qname]} { + if {![regexp ^($NCName)?:($NCName)\$ $qname discard prefix localname]} { + return -code error "name \"$qname\" is not a valid QName" + } + } elseif {![regexp ^$Name\$ $qname]} { + return -code error "name \"$qname\" is not a valid Name" + } + + return [list $prefix $localname] +} + +### +### General utility procedures +### + +# xml::noop -- +# +# A do-nothing proc + +proc xml::noop args {} + +### Following procedures are based on html_library + +# xml::zapWhite -- +# +# Convert multiple white space into a single space. +# +# Arguments: +# data plain text +# +# Results: +# As above + +proc xml::zapWhite data { + regsub -all "\[ \t\r\n\]+" $data { } data + return $data +} + diff --git a/tclxml/tclxml-tcl/xml__tcl.tcl b/tclxml/tclxml-tcl/xml__tcl.tcl new file mode 100644 index 0000000..bdb7bd9 --- /dev/null +++ b/tclxml/tclxml-tcl/xml__tcl.tcl @@ -0,0 +1,272 @@ +# xml__tcl.tcl -- +# +# This file provides a Tcl implementation of the parser +# class support found in ../tclxml.c. It is only used +# when the C implementation is not installed (for some reason). +# +# Copyright (c) 2005 by Explain. +# http://www.explain.com.au/ +# Copyright (c) 2000-2004 Zveno Pty Ltd +# http://www.zveno.com/ +# +# See the file "LICENSE" in this distribution for information on usage and +# redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES. +# +# $Id: xml__tcl.tcl,v 1.1.1.1 2009/01/16 22:11:49 joye Exp $ + +package provide xml::tcl 3.2 + +namespace eval xml { + namespace export configure parser parserclass + + # Parser implementation classes + variable classes + array set classes {} + + # Default parser class + variable default {} + + # Counter for generating unique names + variable counter 0 +} + +# xml::configure -- +# +# Configure the xml package +# +# Arguments: +# None +# +# Results: +# None (not yet implemented) + +proc xml::configure args {} + +# xml::parserclass -- +# +# Implements the xml::parserclass command for managing +# parser implementations. +# +# Arguments: +# method subcommand +# args method arguments +# +# Results: +# Depends on method + +proc xml::parserclass {method args} { + variable classes + variable default + + switch -- $method { + + create { + if {[llength $args] < 1} { + return -code error "wrong number of arguments, should be xml::parserclass create name ?args?" + } + + set name [lindex $args 0] + if {[llength [lrange $args 1 end]] % 2} { + return -code error "missing value for option \"[lindex $args end]\"" + } + array set classes [list $name [list \ + -createcommand [namespace current]::noop \ + -createentityparsercommand [namespace current]::noop \ + -parsecommand [namespace current]::noop \ + -configurecommand [namespace current]::noop \ + -getcommand [namespace current]::noop \ + -deletecommand [namespace current]::noop \ + ]] + # BUG: we're not checking that the arguments are kosher + set classes($name) [lrange $args 1 end] + set default $name + } + + destroy { + if {[llength $args] < 1} { + return -code error "wrong number of arguments, should be xml::parserclass destroy name" + } + + if {[info exists classes([lindex $args 0])]} { + unset classes([lindex $args 0]) + } else { + return -code error "no such parser class \"[lindex $args 0]\"" + } + } + + info { + if {[llength $args] < 1} { + return -code error "wrong number of arguments, should be xml::parserclass info method" + } + + switch -- [lindex $args 0] { + names { + return [array names classes] + } + default { + return $default + } + } + } + + default { + return -code error "unknown method \"$method\"" + } + } + + return {} +} + +# xml::parser -- +# +# Create a parser object instance +# +# Arguments: +# args optional name, configuration options +# +# Results: +# Returns object name. Parser instance created. + +proc xml::parser args { + variable classes + variable default + + if {[llength $args] < 1} { + # Create unique name, no options + set parserName [FindUniqueName] + } else { + if {[string index [lindex $args 0] 0] == "-"} { + # Create unique name, have options + set parserName [FindUniqueName] + } else { + # Given name, optional options + set parserName [lindex $args 0] + set args [lrange $args 1 end] + } + } + + array set options [list \ + -parser $default + ] + array set options $args + + if {![info exists classes($options(-parser))]} { + return -code error "no such parser class \"$options(-parser)\"" + } + + # Now create the parser instance command and data structure + # The command must be created in the caller's namespace + uplevel 1 [list proc $parserName {method args} "eval [namespace current]::ParserCmd [list $parserName] \[list \$method\] \$args"] + upvar #0 [namespace current]::$parserName data + array set data [list class $options(-parser)] + + array set classinfo $classes($options(-parser)) + if {[string compare $classinfo(-createcommand) ""]} { + eval $classinfo(-createcommand) [list $parserName] + } + if {[string compare $classinfo(-configurecommand) ""] && \ + [llength $args]} { + eval $classinfo(-configurecommand) [list $parserName] $args + } + + return $parserName +} + +# xml::FindUniqueName -- +# +# Generate unique object name +# +# Arguments: +# None +# +# Results: +# Returns string. + +proc xml::FindUniqueName {} { + variable counter + return xmlparser[incr counter] +} + +# xml::ParserCmd -- +# +# Implements parser object command +# +# Arguments: +# name object reference +# method subcommand +# args method arguments +# +# Results: +# Depends on method + +proc xml::ParserCmd {name method args} { + variable classes + upvar #0 [namespace current]::$name data + + array set classinfo $classes($data(class)) + + switch -- $method { + + configure { + # BUG: We're not checking for legal options + array set data $args + eval $classinfo(-configurecommand) [list $name] $args + return {} + } + + cget { + return $data([lindex $args 0]) + } + + entityparser { + set new [FindUniqueName] + + upvar #0 [namespace current]::$name parent + upvar #0 [namespace current]::$new data + array set data [array get parent] + + uplevel 1 [list proc $new {method args} "eval [namespace current]::ParserCmd [list $new] \[list \$method\] \$args"] + + return [eval $classinfo(-createentityparsercommand) [list $name $new] $args] + } + + free { + eval $classinfo(-deletecommand) [list $name] + unset data + uplevel 1 [list rename $name {}] + } + + get { + eval $classinfo(-getcommand) [list $name] $args + } + + parse { + if {[llength $args] < 1} { + return -code error "wrong number of arguments, should be $name parse xml ?options?" + } + eval $classinfo(-parsecommand) [list $name] $args + } + + reset { + eval $classinfo(-resetcommand) [list $name] + } + + default { + return -code error "unknown method" + } + } + + return {} +} + +# xml::noop -- +# +# Do nothing utility proc +# +# Arguments: +# args whatever +# +# Results: +# Nothing happens + +proc xml::noop args {} diff --git a/tclxml/tclxml-tcl/xmldep.tcl b/tclxml/tclxml-tcl/xmldep.tcl new file mode 100644 index 0000000..bbb2613 --- /dev/null +++ b/tclxml/tclxml-tcl/xmldep.tcl @@ -0,0 +1,179 @@ +# xmldep.tcl -- +# +# Find the dependencies in an XML document. +# Supports external entities and XSL include/import. +# +# TODO: +# XInclude +# +# Copyright (c) 2001-2003 Zveno Pty Ltd +# http://www.zveno.com/ +# +# See the file "LICENSE" in this distribution for information on usage and +# redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES. +# +# $Id: xmldep.tcl,v 1.1.1.1 2009/01/16 22:11:49 joye Exp $ + +package require xml + +package provide xml::dep 1.0 + +namespace eval xml::dep { + namespace export depend + + variable extEntities + array set extEntities {} + + variable XSLTNS http://www.w3.org/1999/XSL/Transform +} + +# xml::dep::depend -- +# +# Find the resources which an XML document +# depends on. The document is parsed +# sequentially, rather than using DOM, for efficiency. +# +# TODO: +# Asynchronous parsing. +# +# Arguments: +# xml XML document entity +# args configuration options +# +# Results: +# Returns list of resource (system) identifiers + +proc xml::dep::depend {xml args} { + variable resources + variable entities + + set resources {} + catch {unset entities} + array set entities {} + + set p [xml::parser \ + -elementstartcommand [namespace code ElStart] \ + -doctypecommand [namespace code DocTypeDecl] \ + -entitydeclcommand [namespace code EntityDecl] \ + -entityreferencecommand [namespace code EntityReference] \ + -validate 1 \ + ] + if {[llength $args]} { + eval [list $p] configure $args + } + $p parse $xml + + return $resources +} + +# xml::dep::ElStart -- +# +# Process start element +# +# Arguments: +# name tag name +# atlist attribute list +# args options +# +# Results: +# May add to resources list + +proc xml::dep::ElStart {name atlist args} { + variable XSLTNS + variable resources + + array set opts { + -namespace {} + } + array set opts $args + + switch -- $opts(-namespace) \ + $XSLTNS { + switch $name { + import - + include { + array set attr { + href {} + } + array set attr $atlist + + if {[string length $attr(href)]} { + if {[lsearch $resources $attr(href)] < 0} { + lappend resources $attr(href) + } + } + + } + } + } +} + +# xml::dep::DocTypeDecl -- +# +# Process Document Type Declaration +# +# Arguments: +# name Document element +# pubid Public identifier +# sysid System identifier +# dtd Internal DTD Subset +# +# Results: +# Resource added to list + +proc xml::dep::DocTypeDecl {name pubid sysid dtd} { + variable resources + + puts stderr [list DocTypeDecl $name $pubid $sysid dtd] + + if {[string length $sysid] && \ + [lsearch $resources $sysid] < 0} { + lappend resources $sysid + } + + return {} +} + +# xml::dep::EntityDecl -- +# +# Process entity declaration, looking for external entity +# +# Arguments: +# name entity name +# sysid system identifier +# pubid public identifier or repl. text +# +# Results: +# Store external entity info for later reference + +proc xml::dep::EntityDecl {name sysid pubid} { + variable extEntities + + puts stderr [list EntityDecl $name $sysid $pubid] + + set extEntities($name) $sysid +} + +# xml::dep::EntityReference -- +# +# Process entity reference +# +# Arguments: +# name entity name +# +# Results: +# May add to resources list + +proc xml::dep::EntityReference name { + variable extEntities + variable resources + + puts stderr [list EntityReference $name] + + if {[info exists extEntities($name)] && \ + [lsearch $resources $extEntities($name)] < 0} { + lappend resources $extEntities($name) + } + +} + diff --git a/tclxml/tclxml-tcl/xpath.tcl b/tclxml/tclxml-tcl/xpath.tcl new file mode 100644 index 0000000..e772e67 --- /dev/null +++ b/tclxml/tclxml-tcl/xpath.tcl @@ -0,0 +1,362 @@ +# xpath.tcl -- +# +# Provides an XPath parser for Tcl, +# plus various support procedures +# +# Copyright (c) 2000-2003 Zveno Pty Ltd +# +# See the file "LICENSE" in this distribution for information on usage and +# redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES. +# +# $Id: xpath.tcl,v 1.1.1.1 2009/01/16 22:11:49 joye Exp $ + +package provide xpath 1.0 + +# We need the XML package for definition of Names +package require xml + +namespace eval xpath { + namespace export split join createnode + + variable axes { + ancestor + ancestor-or-self + attribute + child + descendant + descendant-or-self + following + following-sibling + namespace + parent + preceding + preceding-sibling + self + } + + variable nodeTypes { + comment + text + processing-instruction + node + } + + # NB. QName has parens for prefix + + variable nodetestExpr ^(${::xml::QName})${::xml::allWsp}(\\(${::xml::allWsp}(("|')(.*?)\\5)?${::xml::allWsp}\\))?${::xml::allWsp}(.*) + + variable nodetestExpr2 ((($::xml::QName)${::xml::allWsp}(\\(${::xml::allWsp}(("|')(.*?)\\7)?${::xml::allWsp}\\))?)|${::xml::allWsp}(\\*))${::xml::allWsp}(.*) +} + +# xpath::split -- +# +# Parse an XPath location path +# +# Arguments: +# locpath location path +# +# Results: +# A Tcl list representing the location path. +# The list has the form: {{axis node-test {predicate predicate ...}} ...} +# Where each list item is a location step. + +proc xpath::split locpath { + set leftover {} + + set result [InnerSplit $locpath leftover] + + if {[string length [string trim $leftover]]} { + return -code error "unexpected text \"$leftover\"" + } + + return $result +} + +proc xpath::InnerSplit {locpath leftoverVar} { + upvar $leftoverVar leftover + + variable axes + variable nodetestExpr + variable nodetestExpr2 + + # First determine whether we have an absolute location path + if {[regexp {^/(.*)} $locpath discard locpath]} { + set path {{}} + } else { + set path {} + } + + while {[string length [string trimleft $locpath]]} { + if {[regexp {^\.\.(.*)} $locpath discard locpath]} { + # .. abbreviation + set axis parent + set nodetest * + } elseif {[regexp {^/(.*)} $locpath discard locpath]} { + # // abbreviation + set axis descendant-or-self + if {[regexp ^$nodetestExpr2 [string trimleft $locpath] discard discard discard nodetest discard typetest discard discard literal wildcard locpath]} { + set nodetest [ResolveWildcard $nodetest $typetest $wildcard $literal] + } else { + set leftover $locpath + return $path + } + } elseif {[regexp ^\\.${::xml::allWsp}(.*) $locpath discard locpath]} { + # . abbreviation + set axis self + set nodetest * + } elseif {[regexp ^@($::xml::QName)${::xml::allWsp}=${::xml::allWsp}"(\[^"\])"(.*) $locpath discard attrName discard attrValue locpath]} { + # @ abbreviation + set axis attribute + set nodetest $attrName + } elseif {[regexp ^@($::xml::QName)${::xml::allWsp}=${::xml::allWsp}'(\[^'\])'(.*) $locpath discard attrName discard attrValue locpath]} { + # @ abbreviation + set axis attribute + set nodetest $attrName + } elseif {[regexp ^@($::xml::QName)(.*) $locpath discard attrName discard2 locpath]} { + # @ abbreviation + set axis attribute + set nodetest $attrName + } elseif {[regexp ^((${::xml::QName})${::xml::allWsp}::${::xml::allWsp})?\\*(.*) $locpath discard discard axis discard locpath]} { + # wildcard specified + set nodetest * + if {![string length $axis]} { + set axis child + } + } elseif {[regexp ^((${::xml::QName})${::xml::allWsp}::${::xml::allWsp})?$nodetestExpr2 $locpath discard discard axis discard discard discard nodetest discard typetest discard discard literal wildcard locpath]} { + # nodetest, with or without axis + if {![string length $axis]} { + set axis child + } + set nodetest [ResolveWildcard $nodetest $typetest $wildcard $literal] + } else { + set leftover $locpath + return $path + } + + # ParsePredicates + set predicates {} + set locpath [string trimleft $locpath] + while {[regexp {^\[(.*)} $locpath discard locpath]} { + if {[regexp {^([0-9]+)(\].*)} [string trim $locpath] discard posn locpath]} { + set predicate [list = {function position {}} [list number $posn]] + } else { + set leftover2 {} + set predicate [ParseExpr $locpath leftover2] + set locpath $leftover2 + unset leftover2 + } + + if {[regexp {^\](.*)} [string trimleft $locpath] discard locpath]} { + lappend predicates $predicate + } else { + return -code error "unexpected text in predicate \"$locpath\"" + } + } + + set axis [string trim $axis] + set nodetest [string trim $nodetest] + + # This step completed + if {[lsearch $axes $axis] < 0} { + return -code error "invalid axis \"$axis\"" + } + lappend path [list $axis $nodetest $predicates] + + # Move to next step + + if {[string length $locpath] && ![regexp ^/(.*) $locpath discard locpath]} { + set leftover $locpath + return $path + } + + } + + return $path +} + +# xpath::ParseExpr -- +# +# Parse one expression in a predicate +# +# Arguments: +# locpath location path to parse +# leftoverVar Name of variable in which to store remaining path +# +# Results: +# Returns parsed expression as a Tcl list + +proc xpath::ParseExpr {locpath leftoverVar} { + upvar $leftoverVar leftover + variable nodeTypes + + set expr {} + set mode expr + set stack {} + + while {[string index [string trimleft $locpath] 0] != "\]"} { + set locpath [string trimleft $locpath] + switch $mode { + expr { + # We're looking for a term + if {[regexp ^-(.*) $locpath discard locpath]} { + # UnaryExpr + lappend stack "-" + } elseif {[regexp ^\\\$({$::xml::QName})(.*) $locpath discard varname discard locpath]} { + # VariableReference + lappend stack [list varRef $varname] + set mode term + } elseif {[regexp {^\((.*)} $locpath discard locpath]} { + # Start grouping + set leftover2 {} + lappend stack [list group [ParseExpr $locpath leftover2]] + set locpath $leftover2 + unset leftover2 + + if {[regexp {^\)(.*)} [string trimleft $locpath] discard locpath]} { + set mode term + } else { + return -code error "unexpected text \"$locpath\", expected \")\"" + } + + } elseif {[regexp {^"([^"]*)"(.*)} $locpath discard literal locpath]} { + # Literal (" delimited) + lappend stack [list literal $literal] + set mode term + } elseif {[regexp {^'([^']*)'(.*)} $locpath discard literal locpath]} { + # Literal (' delimited) + lappend stack [list literal $literal] + set mode term + } elseif {[regexp {^([0-9]+(\.[0-9]+)?)(.*)} $locpath discard number discard locpath]} { + # Number + lappend stack [list number $number] + set mode term + } elseif {[regexp {^(\.[0-9]+)(.*)} $locpath discard number locpath]} { + # Number + lappend stack [list number $number] + set mode term + } elseif {[regexp ^(${::xml::QName})\\(${::xml::allWsp}(.*) $locpath discard functionName discard locpath]} { + # Function call start or abbreviated node-type test + + if {[lsearch $nodeTypes $functionName] >= 0} { + # Looking like a node-type test + if {[regexp ^\\)${::xml::allWsp}(.*) $locpath discard locpath]} { + lappend stack [list path [list child [list $functionName ()] {}]] + set mode term + } else { + return -code error "invalid node-type test \"$functionName\"" + } + } else { + if {[regexp ^\\)${::xml::allWsp}(.*) $locpath discard locpath]} { + set parameters {} + } else { + set leftover2 {} + set parameters [ParseExpr $locpath leftover2] + set locpath $leftover2 + unset leftover2 + while {[regexp {^,(.*)} $locpath discard locpath]} { + set leftover2 {} + lappend parameters [ParseExpr $locpath leftover2] + set locpath $leftover2 + unset leftover2 + } + + if {![regexp ^\\)${::xml::allWsp}(.*) [string trimleft $locpath] discard locpath]} { + return -code error "unexpected text \"locpath\" - expected \")\"" + } + } + + lappend stack [list function $functionName $parameters] + set mode term + } + + } else { + # LocationPath + set leftover2 {} + lappend stack [list path [InnerSplit $locpath leftover2]] + set locpath $leftover2 + unset leftover2 + set mode term + } + } + term { + # We're looking for an expression operator + if {[regexp ^-(.*) $locpath discard locpath]} { + # UnaryExpr + set stack [linsert $stack 0 expr "-"] + set mode expr + } elseif {[regexp ^(and|or|\\=|!\\=|<|>|<\\=|>\\=|\\||\\+|\\-|\\*|div|mod)(.*) $locpath discard exprtype locpath]} { + # AndExpr, OrExpr, EqualityExpr, RelationalExpr or UnionExpr + set stack [linsert $stack 0 $exprtype] + set mode expr + } else { + return -code error "unexpected text \"$locpath\", expecting operator" + } + } + default { + # Should never be here! + return -code error "internal error" + } + } + } + + set leftover $locpath + return $stack +} + +# xpath::ResolveWildcard -- + +proc xpath::ResolveWildcard {nodetest typetest wildcard literal} { + variable nodeTypes + + switch -glob -- [string length $nodetest],[string length $typetest],[string length $wildcard],[string length $literal] { + 0,0,0,* { + return -code error "bad location step (nothing parsed)" + } + 0,0,* { + # Name wildcard specified + return * + } + *,0,0,* { + # Element type test - nothing to do + return $nodetest + } + *,0,*,* { + # Internal error? + return -code error "bad location step (found both nodetest and wildcard)" + } + *,*,0,0 { + # Node type test + if {[lsearch $nodeTypes $nodetest] < 0} { + return -code error "unknown node type \"$typetest\"" + } + return [list $nodetest $typetest] + } + *,*,0,* { + # Node type test + if {[lsearch $nodeTypes $nodetest] < 0} { + return -code error "unknown node type \"$typetest\"" + } + return [list $nodetest $literal] + } + default { + # Internal error? + return -code error "bad location step" + } + } +} + +# xpath::join -- +# +# Reconstitute an XPath location path from a +# Tcl list representation. +# +# Arguments: +# spath split path +# +# Results: +# Returns an Xpath location path + +proc xpath::join spath { + return -code error "not yet implemented" +} + |