summaryrefslogtreecommitdiffstats
path: root/tclxml-tcl/tclparser-8.1.tcl
blob: 40a0af9829c0b4a120bd6b43f2fbe5ede0334aeb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
# tclparser-8.1.tcl --
#
#	This file provides a Tcl implementation of a XML parser.
#	This file supports Tcl 8.1.
#
#	See xml-8.[01].tcl for definitions of character sets and
#	regular expressions.
#
# Copyright (c) 2005-2008 by Explain.
# http://www.explain.com.au/
# Copyright (c) 1998-2003 Zveno Pty Ltd
# http://www.zveno.com/
# 
# See the file "LICENSE" in this distribution for information on usage and
# redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES.
#
# $Id: tclparser-8.1.tcl,v 1.1.1.1 2009/01/16 22:11:49 joye Exp $

package require Tcl 8.1

package provide xml::tclparser 3.2

package require xmldefs 3.2

package require sgmlparser 1.0

namespace eval xml::tclparser {

    namespace export create createexternal externalentity parse configure get delete

    # Tokenising expressions

    variable tokExpr $::xml::tokExpr
    variable substExpr $::xml::substExpr

    # Register this parser class

    ::xml::parserclass create tcl \
	    -createcommand [namespace code create] \
	    -createentityparsercommand [namespace code createentityparser] \
	    -parsecommand [namespace code parse] \
	    -configurecommand [namespace code configure] \
	    -deletecommand [namespace code delete] \
	    -resetcommand [namespace code reset]
}

# xml::tclparser::create --
#
#	Creates XML parser object.
#
# Arguments:
#	name	unique identifier for this instance
#
# Results:
#	The state variable is initialised.

proc xml::tclparser::create name {

    # Initialise state variable
    upvar \#0 [namespace current]::$name parser
    array set parser [list -name $name			\
	-cmd [uplevel 3 namespace current]::$name	\
	-final 1					\
	-validate 0					\
	-statevariable [namespace current]::$name	\
	-baseuri {}					\
	internaldtd {}					\
	entities [namespace current]::Entities$name	\
	extentities [namespace current]::ExtEntities$name	\
	parameterentities [namespace current]::PEntities$name	\
	externalparameterentities [namespace current]::ExtPEntities$name	\
	elementdecls [namespace current]::ElDecls$name	\
	attlistdecls [namespace current]::AttlistDecls$name	\
	notationdecls [namespace current]::NotDecls$name	\
	depth 0						\
	leftover {}                                     \
    ]

    # Initialise entities with predefined set
    array set [namespace current]::Entities$name [array get ::sgml::EntityPredef]

    return $parser(-cmd)
}

# xml::tclparser::createentityparser --
#
#	Creates XML parser object for an entity.
#
# Arguments:
#	name	name for the new parser
#	parent	name of parent parser
#
# Results:
#	The state variable is initialised.

proc xml::tclparser::createentityparser {parent name} {
    upvar #0 [namespace current]::$parent p

    # Initialise state variable
    upvar \#0 [namespace current]::$name external
    array set external [array get p]

    regsub $parent $p(-cmd) {} parentns

    array set external [list -name $name		\
	-cmd $parentns$name				\
	-statevariable [namespace current]::$name	\
	internaldtd {}					\
	line 0						\
    ]
    incr external(depth)

    return $external(-cmd)
}

# xml::tclparser::configure --
#
#	Configures a XML parser object.
#
# Arguments:
#	name	unique identifier for this instance
#	args	option name/value pairs
#
# Results:
#	May change values of config options

proc xml::tclparser::configure {name args} {
    upvar \#0 [namespace current]::$name parser

    # BUG: very crude, no checks for illegal args
    # Mats: Should be synced with sgmlparser.tcl
    set options {-elementstartcommand -elementendcommand \
      -characterdatacommand -processinginstructioncommand \
      -externalentitycommand -xmldeclcommand \
      -doctypecommand -commentcommand \
      -entitydeclcommand -unparsedentitydeclcommand \
      -parameterentitydeclcommand -notationdeclcommand \
      -elementdeclcommand -attlistdeclcommand \
      -paramentityparsing -defaultexpandinternalentities \
      -startdoctypedeclcommand -enddoctypedeclcommand \
      -entityreferencecommand -warningcommand \
      -defaultcommand -unknownencodingcommand -notstandalonecommand \
      -startcdatasectioncommand -endcdatasectioncommand \
      -errorcommand -final \
      -validate -baseuri -baseurl \
      -name -cmd -emptyelement \
      -parseattributelistcommand -parseentitydeclcommand \
      -normalize -internaldtd -dtdsubset \
      -reportempty -ignorewhitespace \
      -reportempty \
    }
    set usage [join $options ", "]
    regsub -all -- - $options {} options
    set pat ^-([join $options |])$
    foreach {flag value} $args {
	if {[regexp $pat $flag]} {
	    # Validate numbers
	    if {[info exists parser($flag)] && \
		    [string is integer -strict $parser($flag)] && \
		    ![string is integer -strict $value]} {
		return -code error "Bad value for $flag ($value), must be integer"
	    }
	    set parser($flag) $value
	} else {
	    return -code error "Unknown option $flag, can be: $usage"
	}
    }

    # Backward-compatibility: -baseuri is a synonym for -baseurl
    catch {set parser(-baseuri) $parser(-baseurl)}

    return {}
}

# xml::tclparser::parse --
#
#	Parses document instance data
#
# Arguments:
#	name	parser object
#	xml	data
#	args	configuration options
#
# Results:
#	Callbacks are invoked

proc xml::tclparser::parse {name xml args} {

    array set options $args
    upvar \#0 [namespace current]::$name parser
    variable tokExpr
    variable substExpr

    # Mats:
    if {[llength $args]} {
	eval {configure $name} $args
    }

    set parseOptions [list \
	    -emptyelement [namespace code ParseEmpty] \
	    -parseattributelistcommand [namespace code ParseAttrs] \
	    -parseentitydeclcommand [namespace code ParseEntity] \
	    -normalize 0]
    eval lappend parseOptions \
	    [array get parser -*command] \
	    [array get parser -reportempty] \
	    [array get parser -ignorewhitespace] \
	    [array get parser -name] \
	    [array get parser -cmd] \
	    [array get parser -baseuri] \
	    [array get parser -validate] \
	    [array get parser -final] \
	    [array get parser -defaultexpandinternalentities] \
	    [array get parser entities] \
	    [array get parser extentities] \
	    [array get parser parameterentities] \
	    [array get parser externalparameterentities] \
	    [array get parser elementdecls] \
	    [array get parser attlistdecls] \
	    [array get parser notationdecls]

    # Mats:
    # If -final 0 we also need to maintain the state with a -statevariable !
    if {!$parser(-final)} {
	eval lappend parseOptions [array get parser -statevariable]
    }

    set dtdsubset no
    catch {set dtdsubset $options(-dtdsubset)}
    switch -- $dtdsubset {
	internal {
	    # Bypass normal parsing
	    lappend parseOptions -statevariable $parser(-statevariable)
	    array set intOptions [array get ::sgml::StdOptions]
	    array set intOptions $parseOptions
	    ::sgml::ParseDTD:Internal [array get intOptions] $xml
	    return {}
	}
	external {
	    # Bypass normal parsing
	    lappend parseOptions -statevariable $parser(-statevariable)
	    array set intOptions [array get ::sgml::StdOptions]
	    array set intOptions $parseOptions
	    ::sgml::ParseDTD:External [array get intOptions] $xml
	    return {}
	}
	default {
	    # Pass through to normal processing
	}
    }

    lappend tokenOptions  \
      -internaldtdvariable [namespace current]::${name}(internaldtd)
    
    # Mats: If -final 0 we also need to maintain the state with a -statevariable !
    if {!$parser(-final)} {
	eval lappend tokenOptions [array get parser -statevariable] \
	  [array get parser -final]
    }
    
    # Mats:
    # Why not the first four? Just padding? Lrange undos \n interp.
    # It is necessary to have the first four as well if chopped off in
    # middle of pcdata.
    set tokenised [lrange \
	    [eval {::sgml::tokenise $xml $tokExpr $substExpr} $tokenOptions] \
	0 end]

    lappend parseOptions -internaldtd [list $parser(internaldtd)]
    eval ::sgml::parseEvent [list $tokenised] $parseOptions

    return {}
}

# xml::tclparser::ParseEmpty --  Tcl 8.1+ version
#
#	Used by parser to determine whether an element is empty.
#	This is usually dead easy in XML, but as always not quite.
#	Have to watch out for empty element syntax
#
# Arguments:
#	tag	element name
#	attr	attribute list (raw)
#	e	End tag delimiter.
#
# Results:
#	Return value of e

proc xml::tclparser::ParseEmpty {tag attr e} {
    switch -glob [string length $e],[regexp "/[::xml::cl $::xml::Wsp]*$" $attr] {
	0,0 {
	    return {}
	}
	0,* {
	    return /
	}
	default {
	    return $e
	}
    }
}

# xml::tclparser::ParseAttrs -- Tcl 8.1+ version
#
#	Parse element attributes.
#
# There are two forms for name-value pairs:
#
#	name="value"
#	name='value'
#
# Arguments:
#	opts	parser options
#	attrs	attribute string given in a tag
#
# Results:
#	Returns a Tcl list representing the name-value pairs in the 
#	attribute string
#
#	A ">" occurring in the attribute list causes problems when parsing
#	the XML.  This manifests itself by an unterminated attribute value
#	and a ">" appearing the element text.
#	In this case return a three element list;
#	the message "unterminated attribute value", the attribute list it
#	did manage to parse and the remainder of the attribute list.

proc xml::tclparser::ParseAttrs {opts attrs} {

    set result {}

    while {[string length [string trim $attrs]]} {
	if {[regexp [::sgml::cl $::xml::Wsp]*($::xml::Name)[::sgml::cl $::xml::Wsp]*=[::sgml::cl $::xml::Wsp]*("|')([::sgml::cl ^<]*?)\\2(.*) $attrs discard attrName delimiter value attrs]} {
	    lappend result $attrName [NormalizeAttValue $opts $value]
	} elseif {[regexp [::sgml::cl $::xml::Wsp]*$::xml::Name[::sgml::cl $::xml::Wsp]*=[::sgml::cl $::xml::Wsp]*("|')[::sgml::cl ^<]*\$ $attrs]} {
	    return -code error [list {unterminated attribute value} $result $attrs]
	} else {
	    return -code error "invalid attribute list"
	}
    }

    return $result
}

# xml::tclparser::NormalizeAttValue --
#
#	Perform attribute value normalisation.  This involves:
#	. character references are appended to the value
#	. entity references are recursively processed and replacement value appended
#	. whitespace characters cause a space to be appended
#	. other characters appended as-is
#
# Arguments:
#	opts	parser options
#	value	unparsed attribute value
#
# Results:
#	Normalised value returned.

proc xml::tclparser::NormalizeAttValue {opts value} {

    # sgmlparser already has backslashes protected
    # Protect Tcl specials
    regsub -all {([][$])} $value {\\\1} value

    # Deal with white space
    regsub -all "\[$::xml::Wsp\]" $value { } value

    # Find entity refs
    regsub -all {&([^;]+);} $value {[NormalizeAttValue:DeRef $opts {\1}]} value

    return [subst $value]
}

# xml::tclparser::NormalizeAttValue:DeRef --
#
#	Handler to normalize attribute values
#
# Arguments:
#	opts	parser options
#	ref	entity reference
#
# Results:
#	Returns character

proc xml::tclparser::NormalizeAttValue:DeRef {opts ref} {
    # SRB: Bug fix 2008-11-18 #812051: surround case labels in braces for compatibility with Freewrap
    switch -glob -- $ref {
	{#x*} {
	    scan [string range $ref 2 end] %x value
	    set char [format %c $value]
	    # Check that the char is legal for XML
	    if {[regexp [format {^[%s]$} $::xml::Char] $char]} {
		return $char
	    } else {
		return -code error "illegal character"
	    }
	}
	{#*} {
	    scan [string range $ref 1 end] %d value
	    set char [format %c $value]
	    # Check that the char is legal for XML
	    if {[regexp [format {^[%s]$} $::xml::Char] $char]} {
		return $char
	    } else {
		return -code error "illegal character"
	    }
	}
	lt -
	gt -
	amp -
	quot -
	apos {
	    array set map {lt < gt > amp & quot \" apos '}
	    return $map($ref)
	}
	default {
	    # A general entity.  Must resolve to a text value - no element structure.

	    array set options $opts
	    upvar #0 $options(entities) map

	    if {[info exists map($ref)]} {

		if {[regexp < $map($ref)]} {
		    return -code error "illegal character \"<\" in attribute value"
		}

		if {![regexp & $map($ref)]} {
		    # Simple text replacement
		    return $map($ref)
		}

		# There are entity references in the replacement text.
		# Can't use child entity parser since must catch element structures

		return [NormalizeAttValue $opts $map($ref)]

	    } elseif {[string compare $options(-entityreferencecommand) "::sgml::noop"]} {

		set result [uplevel #0 $options(-entityreferencecommand) [list $ref]]

		return $result

	    } else {
		return -code error "unable to resolve entity reference \"$ref\""
	    }
	}
    }
}

# xml::tclparser::ParseEntity --
#
#	Parse general entity declaration
#
# Arguments:
#	data	text to parse
#
# Results:
#	Tcl list containing entity declaration

proc xml::tclparser::ParseEntity data {
    set data [string trim $data]
    if {[regexp $::sgml::ExternalEntityExpr $data discard type delimiter1 id1 discard delimiter2 id2 optNDATA ndata]} {
	switch $type {
	    PUBLIC {
		return [list external $id2 $id1 $ndata]
	    }
	    SYSTEM {
		return [list external $id1 {} $ndata]
	    }
	}
    } elseif {[regexp {^("|')(.*?)\1$} $data discard delimiter value]} {
	return [list internal $value]
    } else {
	return -code error "badly formed entity declaration"
    }
}

# xml::tclparser::delete --
#
#	Destroy parser data
#
# Arguments:
#	name	parser object
#
# Results:
#	Parser data structure destroyed

proc xml::tclparser::delete name {
    upvar \#0 [namespace current]::$name parser
    catch {::sgml::ParserDelete $parser(-statevariable)}
    catch {unset parser}
    return {}
}

# xml::tclparser::get --
#
#	Retrieve additional information from the parser
#
# Arguments:
#	name	parser object
#	method	info to retrieve
#	args	additional arguments for method
#
# Results:
#	Depends on method

proc xml::tclparser::get {name method args} {
    upvar #0 [namespace current]::$name parser

    switch -- $method {

	elementdecl {
	    switch [llength $args] {

		0 {
		    # Return all element declarations
		    upvar #0 $parser(elementdecls) elements
		    return [array get elements]
		}

		1 {
		    # Return specific element declaration
		    upvar #0 $parser(elementdecls) elements
		    if {[info exists elements([lindex $args 0])]} {
			return [array get elements [lindex $args 0]]
		    } else {
			return -code error "element \"[lindex $args 0]\" not declared"
		    }
		}

		default {
		    return -code error "wrong number of arguments: should be \"elementdecl ?element?\""
		}
	    }
	}

	attlist {
	    if {[llength $args] != 1} {
		return -code error "wrong number of arguments: should be \"get attlist element\""
	    }

	    upvar #0 $parser(attlistdecls)

	    return {}
	}

	entitydecl {
	}

	parameterentitydecl {
	}

	notationdecl {
	}

	default {
	    return -code error "unknown method \"$method\""
	}
    }

    return {}
}

# xml::tclparser::ExternalEntity --
#
#	Resolve and parse external entity
#
# Arguments:
#	name	parser object
#	base	base URL
#	sys	system identifier
#	pub	public identifier
#
# Results:
#	External entity is fetched and parsed

proc xml::tclparser::ExternalEntity {name base sys pub} {
}

# xml::tclparser:: --
#
#	Reset a parser instance, ready to parse another document
#
# Arguments:
#	name	parser object
#
# Results:
#	Variables unset

proc xml::tclparser::reset {name} {
    upvar \#0 [namespace current]::$name parser

    # Has this parser object been properly initialised?
    if {![info exists parser] || \
	    ![info exists parser(-name)]} {
	return [create $name]
    }

    array set parser {
	-final 1
	depth 0
	leftover {}
    }

    foreach var {Entities ExtEntities PEntities ExtPEntities ElDecls AttlistDecls NotDecls} {
	catch {unset [namespace current]::${var}$name}
    }

    # Initialise entities with predefined set
    array set [namespace current]::Entities$name [array get ::sgml::EntityPredef]

    return {}
}