diff options
Diffstat (limited to 'tclxml/tclxml-tcl/tclparser-8.1.tcl')
-rwxr-xr-x | tclxml/tclxml-tcl/tclparser-8.1.tcl | 614 |
1 files changed, 614 insertions, 0 deletions
diff --git a/tclxml/tclxml-tcl/tclparser-8.1.tcl b/tclxml/tclxml-tcl/tclparser-8.1.tcl new file mode 100755 index 0000000..40a0af9 --- /dev/null +++ b/tclxml/tclxml-tcl/tclparser-8.1.tcl @@ -0,0 +1,614 @@ +# tclparser-8.1.tcl -- +# +# This file provides a Tcl implementation of a XML parser. +# This file supports Tcl 8.1. +# +# See xml-8.[01].tcl for definitions of character sets and +# regular expressions. +# +# Copyright (c) 2005-2008 by Explain. +# http://www.explain.com.au/ +# Copyright (c) 1998-2003 Zveno Pty Ltd +# http://www.zveno.com/ +# +# See the file "LICENSE" in this distribution for information on usage and +# redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES. +# +# $Id: tclparser-8.1.tcl,v 1.1.1.1 2009/01/16 22:11:49 joye Exp $ + +package require Tcl 8.1 + +package provide xml::tclparser 3.2 + +package require xmldefs 3.2 + +package require sgmlparser 1.0 + +namespace eval xml::tclparser { + + namespace export create createexternal externalentity parse configure get delete + + # Tokenising expressions + + variable tokExpr $::xml::tokExpr + variable substExpr $::xml::substExpr + + # Register this parser class + + ::xml::parserclass create tcl \ + -createcommand [namespace code create] \ + -createentityparsercommand [namespace code createentityparser] \ + -parsecommand [namespace code parse] \ + -configurecommand [namespace code configure] \ + -deletecommand [namespace code delete] \ + -resetcommand [namespace code reset] +} + +# xml::tclparser::create -- +# +# Creates XML parser object. +# +# Arguments: +# name unique identifier for this instance +# +# Results: +# The state variable is initialised. + +proc xml::tclparser::create name { + + # Initialise state variable + upvar \#0 [namespace current]::$name parser + array set parser [list -name $name \ + -cmd [uplevel 3 namespace current]::$name \ + -final 1 \ + -validate 0 \ + -statevariable [namespace current]::$name \ + -baseuri {} \ + internaldtd {} \ + entities [namespace current]::Entities$name \ + extentities [namespace current]::ExtEntities$name \ + parameterentities [namespace current]::PEntities$name \ + externalparameterentities [namespace current]::ExtPEntities$name \ + elementdecls [namespace current]::ElDecls$name \ + attlistdecls [namespace current]::AttlistDecls$name \ + notationdecls [namespace current]::NotDecls$name \ + depth 0 \ + leftover {} \ + ] + + # Initialise entities with predefined set + array set [namespace current]::Entities$name [array get ::sgml::EntityPredef] + + return $parser(-cmd) +} + +# xml::tclparser::createentityparser -- +# +# Creates XML parser object for an entity. +# +# Arguments: +# name name for the new parser +# parent name of parent parser +# +# Results: +# The state variable is initialised. + +proc xml::tclparser::createentityparser {parent name} { + upvar #0 [namespace current]::$parent p + + # Initialise state variable + upvar \#0 [namespace current]::$name external + array set external [array get p] + + regsub $parent $p(-cmd) {} parentns + + array set external [list -name $name \ + -cmd $parentns$name \ + -statevariable [namespace current]::$name \ + internaldtd {} \ + line 0 \ + ] + incr external(depth) + + return $external(-cmd) +} + +# xml::tclparser::configure -- +# +# Configures a XML parser object. +# +# Arguments: +# name unique identifier for this instance +# args option name/value pairs +# +# Results: +# May change values of config options + +proc xml::tclparser::configure {name args} { + upvar \#0 [namespace current]::$name parser + + # BUG: very crude, no checks for illegal args + # Mats: Should be synced with sgmlparser.tcl + set options {-elementstartcommand -elementendcommand \ + -characterdatacommand -processinginstructioncommand \ + -externalentitycommand -xmldeclcommand \ + -doctypecommand -commentcommand \ + -entitydeclcommand -unparsedentitydeclcommand \ + -parameterentitydeclcommand -notationdeclcommand \ + -elementdeclcommand -attlistdeclcommand \ + -paramentityparsing -defaultexpandinternalentities \ + -startdoctypedeclcommand -enddoctypedeclcommand \ + -entityreferencecommand -warningcommand \ + -defaultcommand -unknownencodingcommand -notstandalonecommand \ + -startcdatasectioncommand -endcdatasectioncommand \ + -errorcommand -final \ + -validate -baseuri -baseurl \ + -name -cmd -emptyelement \ + -parseattributelistcommand -parseentitydeclcommand \ + -normalize -internaldtd -dtdsubset \ + -reportempty -ignorewhitespace \ + -reportempty \ + } + set usage [join $options ", "] + regsub -all -- - $options {} options + set pat ^-([join $options |])$ + foreach {flag value} $args { + if {[regexp $pat $flag]} { + # Validate numbers + if {[info exists parser($flag)] && \ + [string is integer -strict $parser($flag)] && \ + ![string is integer -strict $value]} { + return -code error "Bad value for $flag ($value), must be integer" + } + set parser($flag) $value + } else { + return -code error "Unknown option $flag, can be: $usage" + } + } + + # Backward-compatibility: -baseuri is a synonym for -baseurl + catch {set parser(-baseuri) $parser(-baseurl)} + + return {} +} + +# xml::tclparser::parse -- +# +# Parses document instance data +# +# Arguments: +# name parser object +# xml data +# args configuration options +# +# Results: +# Callbacks are invoked + +proc xml::tclparser::parse {name xml args} { + + array set options $args + upvar \#0 [namespace current]::$name parser + variable tokExpr + variable substExpr + + # Mats: + if {[llength $args]} { + eval {configure $name} $args + } + + set parseOptions [list \ + -emptyelement [namespace code ParseEmpty] \ + -parseattributelistcommand [namespace code ParseAttrs] \ + -parseentitydeclcommand [namespace code ParseEntity] \ + -normalize 0] + eval lappend parseOptions \ + [array get parser -*command] \ + [array get parser -reportempty] \ + [array get parser -ignorewhitespace] \ + [array get parser -name] \ + [array get parser -cmd] \ + [array get parser -baseuri] \ + [array get parser -validate] \ + [array get parser -final] \ + [array get parser -defaultexpandinternalentities] \ + [array get parser entities] \ + [array get parser extentities] \ + [array get parser parameterentities] \ + [array get parser externalparameterentities] \ + [array get parser elementdecls] \ + [array get parser attlistdecls] \ + [array get parser notationdecls] + + # Mats: + # If -final 0 we also need to maintain the state with a -statevariable ! + if {!$parser(-final)} { + eval lappend parseOptions [array get parser -statevariable] + } + + set dtdsubset no + catch {set dtdsubset $options(-dtdsubset)} + switch -- $dtdsubset { + internal { + # Bypass normal parsing + lappend parseOptions -statevariable $parser(-statevariable) + array set intOptions [array get ::sgml::StdOptions] + array set intOptions $parseOptions + ::sgml::ParseDTD:Internal [array get intOptions] $xml + return {} + } + external { + # Bypass normal parsing + lappend parseOptions -statevariable $parser(-statevariable) + array set intOptions [array get ::sgml::StdOptions] + array set intOptions $parseOptions + ::sgml::ParseDTD:External [array get intOptions] $xml + return {} + } + default { + # Pass through to normal processing + } + } + + lappend tokenOptions \ + -internaldtdvariable [namespace current]::${name}(internaldtd) + + # Mats: If -final 0 we also need to maintain the state with a -statevariable ! + if {!$parser(-final)} { + eval lappend tokenOptions [array get parser -statevariable] \ + [array get parser -final] + } + + # Mats: + # Why not the first four? Just padding? Lrange undos \n interp. + # It is necessary to have the first four as well if chopped off in + # middle of pcdata. + set tokenised [lrange \ + [eval {::sgml::tokenise $xml $tokExpr $substExpr} $tokenOptions] \ + 0 end] + + lappend parseOptions -internaldtd [list $parser(internaldtd)] + eval ::sgml::parseEvent [list $tokenised] $parseOptions + + return {} +} + +# xml::tclparser::ParseEmpty -- Tcl 8.1+ version +# +# Used by parser to determine whether an element is empty. +# This is usually dead easy in XML, but as always not quite. +# Have to watch out for empty element syntax +# +# Arguments: +# tag element name +# attr attribute list (raw) +# e End tag delimiter. +# +# Results: +# Return value of e + +proc xml::tclparser::ParseEmpty {tag attr e} { + switch -glob [string length $e],[regexp "/[::xml::cl $::xml::Wsp]*$" $attr] { + 0,0 { + return {} + } + 0,* { + return / + } + default { + return $e + } + } +} + +# xml::tclparser::ParseAttrs -- Tcl 8.1+ version +# +# Parse element attributes. +# +# There are two forms for name-value pairs: +# +# name="value" +# name='value' +# +# Arguments: +# opts parser options +# attrs attribute string given in a tag +# +# Results: +# Returns a Tcl list representing the name-value pairs in the +# attribute string +# +# A ">" occurring in the attribute list causes problems when parsing +# the XML. This manifests itself by an unterminated attribute value +# and a ">" appearing the element text. +# In this case return a three element list; +# the message "unterminated attribute value", the attribute list it +# did manage to parse and the remainder of the attribute list. + +proc xml::tclparser::ParseAttrs {opts attrs} { + + set result {} + + while {[string length [string trim $attrs]]} { + if {[regexp [::sgml::cl $::xml::Wsp]*($::xml::Name)[::sgml::cl $::xml::Wsp]*=[::sgml::cl $::xml::Wsp]*("|')([::sgml::cl ^<]*?)\\2(.*) $attrs discard attrName delimiter value attrs]} { + lappend result $attrName [NormalizeAttValue $opts $value] + } elseif {[regexp [::sgml::cl $::xml::Wsp]*$::xml::Name[::sgml::cl $::xml::Wsp]*=[::sgml::cl $::xml::Wsp]*("|')[::sgml::cl ^<]*\$ $attrs]} { + return -code error [list {unterminated attribute value} $result $attrs] + } else { + return -code error "invalid attribute list" + } + } + + return $result +} + +# xml::tclparser::NormalizeAttValue -- +# +# Perform attribute value normalisation. This involves: +# . character references are appended to the value +# . entity references are recursively processed and replacement value appended +# . whitespace characters cause a space to be appended +# . other characters appended as-is +# +# Arguments: +# opts parser options +# value unparsed attribute value +# +# Results: +# Normalised value returned. + +proc xml::tclparser::NormalizeAttValue {opts value} { + + # sgmlparser already has backslashes protected + # Protect Tcl specials + regsub -all {([][$])} $value {\\\1} value + + # Deal with white space + regsub -all "\[$::xml::Wsp\]" $value { } value + + # Find entity refs + regsub -all {&([^;]+);} $value {[NormalizeAttValue:DeRef $opts {\1}]} value + + return [subst $value] +} + +# xml::tclparser::NormalizeAttValue:DeRef -- +# +# Handler to normalize attribute values +# +# Arguments: +# opts parser options +# ref entity reference +# +# Results: +# Returns character + +proc xml::tclparser::NormalizeAttValue:DeRef {opts ref} { + # SRB: Bug fix 2008-11-18 #812051: surround case labels in braces for compatibility with Freewrap + switch -glob -- $ref { + {#x*} { + scan [string range $ref 2 end] %x value + set char [format %c $value] + # Check that the char is legal for XML + if {[regexp [format {^[%s]$} $::xml::Char] $char]} { + return $char + } else { + return -code error "illegal character" + } + } + {#*} { + scan [string range $ref 1 end] %d value + set char [format %c $value] + # Check that the char is legal for XML + if {[regexp [format {^[%s]$} $::xml::Char] $char]} { + return $char + } else { + return -code error "illegal character" + } + } + lt - + gt - + amp - + quot - + apos { + array set map {lt < gt > amp & quot \" apos '} + return $map($ref) + } + default { + # A general entity. Must resolve to a text value - no element structure. + + array set options $opts + upvar #0 $options(entities) map + + if {[info exists map($ref)]} { + + if {[regexp < $map($ref)]} { + return -code error "illegal character \"<\" in attribute value" + } + + if {![regexp & $map($ref)]} { + # Simple text replacement + return $map($ref) + } + + # There are entity references in the replacement text. + # Can't use child entity parser since must catch element structures + + return [NormalizeAttValue $opts $map($ref)] + + } elseif {[string compare $options(-entityreferencecommand) "::sgml::noop"]} { + + set result [uplevel #0 $options(-entityreferencecommand) [list $ref]] + + return $result + + } else { + return -code error "unable to resolve entity reference \"$ref\"" + } + } + } +} + +# xml::tclparser::ParseEntity -- +# +# Parse general entity declaration +# +# Arguments: +# data text to parse +# +# Results: +# Tcl list containing entity declaration + +proc xml::tclparser::ParseEntity data { + set data [string trim $data] + if {[regexp $::sgml::ExternalEntityExpr $data discard type delimiter1 id1 discard delimiter2 id2 optNDATA ndata]} { + switch $type { + PUBLIC { + return [list external $id2 $id1 $ndata] + } + SYSTEM { + return [list external $id1 {} $ndata] + } + } + } elseif {[regexp {^("|')(.*?)\1$} $data discard delimiter value]} { + return [list internal $value] + } else { + return -code error "badly formed entity declaration" + } +} + +# xml::tclparser::delete -- +# +# Destroy parser data +# +# Arguments: +# name parser object +# +# Results: +# Parser data structure destroyed + +proc xml::tclparser::delete name { + upvar \#0 [namespace current]::$name parser + catch {::sgml::ParserDelete $parser(-statevariable)} + catch {unset parser} + return {} +} + +# xml::tclparser::get -- +# +# Retrieve additional information from the parser +# +# Arguments: +# name parser object +# method info to retrieve +# args additional arguments for method +# +# Results: +# Depends on method + +proc xml::tclparser::get {name method args} { + upvar #0 [namespace current]::$name parser + + switch -- $method { + + elementdecl { + switch [llength $args] { + + 0 { + # Return all element declarations + upvar #0 $parser(elementdecls) elements + return [array get elements] + } + + 1 { + # Return specific element declaration + upvar #0 $parser(elementdecls) elements + if {[info exists elements([lindex $args 0])]} { + return [array get elements [lindex $args 0]] + } else { + return -code error "element \"[lindex $args 0]\" not declared" + } + } + + default { + return -code error "wrong number of arguments: should be \"elementdecl ?element?\"" + } + } + } + + attlist { + if {[llength $args] != 1} { + return -code error "wrong number of arguments: should be \"get attlist element\"" + } + + upvar #0 $parser(attlistdecls) + + return {} + } + + entitydecl { + } + + parameterentitydecl { + } + + notationdecl { + } + + default { + return -code error "unknown method \"$method\"" + } + } + + return {} +} + +# xml::tclparser::ExternalEntity -- +# +# Resolve and parse external entity +# +# Arguments: +# name parser object +# base base URL +# sys system identifier +# pub public identifier +# +# Results: +# External entity is fetched and parsed + +proc xml::tclparser::ExternalEntity {name base sys pub} { +} + +# xml::tclparser:: -- +# +# Reset a parser instance, ready to parse another document +# +# Arguments: +# name parser object +# +# Results: +# Variables unset + +proc xml::tclparser::reset {name} { + upvar \#0 [namespace current]::$name parser + + # Has this parser object been properly initialised? + if {![info exists parser] || \ + ![info exists parser(-name)]} { + return [create $name] + } + + array set parser { + -final 1 + depth 0 + leftover {} + } + + foreach var {Entities ExtEntities PEntities ExtPEntities ElDecls AttlistDecls NotDecls} { + catch {unset [namespace current]::${var}$name} + } + + # Initialise entities with predefined set + array set [namespace current]::Entities$name [array get ::sgml::EntityPredef] + + return {} +} |