diff options
Diffstat (limited to 'tcllib/modules/doctools2base/tcl_parse.tcl')
-rw-r--r-- | tcllib/modules/doctools2base/tcl_parse.tcl | 800 |
1 files changed, 800 insertions, 0 deletions
diff --git a/tcllib/modules/doctools2base/tcl_parse.tcl b/tcllib/modules/doctools2base/tcl_parse.tcl new file mode 100644 index 0000000..59cfd00 --- /dev/null +++ b/tcllib/modules/doctools2base/tcl_parse.tcl @@ -0,0 +1,800 @@ +# -*- tcl -*- +# Copyright (c) 2009 Andreas Kupries <andreas_kupries@sourceforge.net> + +# Recursive descent parser for Tcl commands embedded in a string. (=> +# subst -novariables, without actual evaluation of the embedded +# commands). Useful for processing templates, etc. The result is an +# abstract syntax tree of strings and commands, which in turn have +# strings and commands as arguments. + +# The tree can be processed further. The nodes of the tree are +# annotated with line/column/offset information to allow later stages +# the reporting of higher-level syntax and semantic errors with exact +# locations in the input. + +# TODO :: Add ability to report progress through the +# TODO :: input. Callback. Invoked in 'Initialize', 'Step', and +# TODO :: 'Finalize'. + +# TODO :: Investigate possibility of using tclparser package +# TODO :: ('parser') to handle the command pieces embedded in the +# TODO :: text. + +# # ## ### ##### ######## ############# ##################### +## Requirements + +package require Tcl 8.4 ; # Required runtime. +package require snit ; # OO system. +package require fileutil ; # File utilities. +package require logger ; # User feedback. +package require struct::list ; # Higher-order list operations. +package require struct::stack ; # Stacks +package require struct::set ; # Finite sets +package require treeql ; # Tree queries and transformation. + +# # ## ### ##### ######## ############# ##################### +## + +logger::initNamespace ::doctools::tcl::parse +snit::type ::doctools::tcl::parse { + # # ## ### ##### ######## ############# + ## Public API + + typemethod file {t path {root {}}} { + $type text $t [fileutil::cat -translation binary -encoding binary $path] $root + } + + typemethod text {t text {root {}}} { + # --- --- --- --------- --------- --------- + # Phase 1. Lexical processing. + # The resulting tree contains the raw tokens. See + # below for the specification of the resulting tree + # structure. + # + # This part is a recursive descent parser using Tcl's 12 rules + # for processing the input. Note: Variable references are not + # recognized, they are processed like regular text. + + Initialize $t $text $root + String + Finalize + + # Tree structure + # - All nodes but the root have the attributes 'type', 'range', 'line', and 'col'. + # + # * 'type' in { Command, Text, Backslash, Word, Quote, Continuation, QBrace } + # * 'range' is 2-element list (offset start, offset end) + # * 'line' is integer number > 0 + # * 'col' is integer number >= 0 + # + # 'type' specifies what sort of token the node contains. + # + # 'range' is the location of the token as offsets in + # characters from the beginning of the string, for + # first and last character in the token. EOL markers + # count as one character. This can be empty. + # + # 'line', 'col' are the location of the first character + # AFTER the token, as the line and column the character is + # on and at. + # + # Meaning of the various node types + # + # Command .... : A command begins here, the text in the range + # .............. is the opening bracket. + # Text ....... : A text segment in a word, anything up to the + # .............. beginning of a backslash sequence or of an + # .............. embedded command. + # Backslash .. : A backslash sequence. The text under the + # .............. range is the whole sequence. + # Word ....... : The beginning of an unquoted, quoted or + # .............. braced word. The text under the range is the + # .............. opening quote or brace, if any. The range is + # .............. empty for an unquoted word. + # Quote ...... : An embedded double-quote character which is + # .............. not the end of a quoted string (a special + # .............. type of backslash sequence). The range is the + # .............. whole sequence. + # Continuation : A continuation line in an unquoted, quoted, + # .............. or braced string. The range covers the whole + # .............. sequence, including the whitespace trailing + # .............. it. + # QBrace ..... : A quoted brace in a braced string. A special + # .............. kind of backslash sequence. The range covers + # .............. the whole sequence. + + # --- --- --- --------- --------- --------- + # Phase 2. Convert the token tree into a syntax tree. + # This phase simplifies the tree by converting and + # eliminating special tokens, and further decouples + # it from the input by storing the relevant string + # ranges of the input in the tree. For the the + # specification of the resulting structure see method + # 'Verify'. + # + # The sub-phases are and do + # + # (a) Extract the string information from the input and store + # them in their Text tokens. + # (b) Convert the special tokens (QBrace, Backslash, Quote, + # Continuation) into equivalent 'Text' tokens, with proper + # string information. + # (c) Merge adjacent 'Text' tokens. + # (d) Remove irrelevant 'Word' tokens. These are tokens with a + # single Text token as child. Word tokens without children + # however represent empty strings. They are converted into + # an equivalent Text node instead. + # (e) Pull the first word of commands into the command token, + # and ensure that it is not dynamic, i.e not an embedded + # command. + + ShowTree $t "Raw tree" + + set q [treeql %AUTO% -tree $t] + + # (a) + foreach n [$q query tree withatt type Text] { + struct::list assign [$t get $n range] a e + #$t unset $n range + $t set $n text [string range $mydata $a $e] + } + ShowTree $t "Text annotation" + + # (b1) + foreach n [$q query tree withatt type QBrace] { + struct::list assign [$t get $n range] a e + incr a ; # Skip backslash + #$t unset $n range + $t set $n text [string range $mydata $a $e] + $t set $n type Text + } + ShowTree $t "Special conversion 1, quoted braces" + + # (b2) + foreach n [$q query tree withatt type Backslash] { + struct::list assign [$t get $n range] a e + #$t unset $n range + $t set $n text [subst -nocommands -novariables [string range $mydata $a $e]] + + #puts <'[string range $mydata $a $e]'> + #puts _'[subst -nocommands -novariables [string range $mydata $a $e]]'_ + + $t set $n type Text + } + ShowTree $t "Special conversion 2, backslash sequences" + + # (b3) + foreach n [$q query tree withatt type Quote] { + #$t unset $n range + $t set $n text "\"" + $t set $n type Text + } + ShowTree $t "Special conversion 3, quoted double quotes" + + # (b4) + foreach n [$q query tree withatt type Continuation] { + #$t unset $n range + $t set $n text { } + $t set $n type Text + } + ShowTree $t "Special conversion 4, continuation lines" + + # (c) + foreach n [$q query tree withatt type Text right withatt type Text] { + set left [$t previous $n] + $t append $left text [$t get $n text] + + # Extend covered range. Copy location. + struct::list assign [$t get $left range] a _ + struct::list assign [$t get $n range] _ e + $t set $left range [list $a $e] + $t set $left line [$t get $n line] + $t set $left col [$t get $n col] + + $t delete $n + } + ShowTree $t "Merged adjacent texts" + + # (d) + foreach n [$q query tree withatt type Word] { + if {![$t numchildren $n]} { + $t set $n type Text + $t set $n text {} + } elseif {[$t numchildren $n] == 1} { + $t cut $n + } + } + ShowTree $t "Dropped simple words" + + # (e) + foreach n [$q query tree withatt type Command] { + set first [lindex [$t children $n] 0] + if {[$t get $first type] eq "Word"} { + error {Dynamic command name} + } + $t set $n text [$t get $first text] + $t set $n range [$t get $first range] + $t set $n line [$t get $first line] + $t set $n col [$t get $first col] + $t delete $first + } + ShowTree $t "Command lifting" + + $q destroy + + Verify $t + return + } + + proc Verify {t} { + # Tree structure ... + # Attributes Values + # - type string in {'Command','Text','Word'} (phase 2) + # - range 2-tuple (integer, integer), can be empty. start and end offset of the word in the input string. + # - line integer, line the node starts on. First line is 1 + # - col integer, column the node starts on (#char since start of line, first char is 0) + # Constraints + # .(i) The root node has no attributes at all. + # .(ii) The children of the root are Command and Text nodes in semi-alternation. + # I.e.: After a Text node a Command has to follow. + # After a Command node either Text or Command can follow. + # .(iii) The children of a Command node are Text, Word, and Command nodes, the command arguments. If any. + # .(iv) The children of a Word node are Command and Text nodes in semi-alternation. + # .(v) All Text nodes are leafs. + # .(vi) Any Command node can be a leaf. + # .(vii) Word nodes cannot be leafs. + # .(viii) All non-root nodes have the attributes 'type', 'range', 'col', and 'line'. + + foreach n [$t nodes] { + if {[$t parent $n] eq ""} { + # (ii) + set last {} + foreach c [$t children $n] { + set type [$t get $c type] + if {![struct::set contains {Command Text} $type]} { + return -code error "$c :: Bad node type $type in child of root node" + } elseif {($type eq $last) && ($last eq "Text")} { + return -code error "$c :: Bad node $type, not semi-alternating" + } + set last $type + } + # (i) + if {[llength [$t getall $n]]} { + return -code error "$n :: Bad root node, has attributes, should not" + } + continue + } else { + # (viii) + foreach k {range line col} { + if {![$t keyexists $n $k]} { + return -code error "$n :: Bad node, attribute '$k' missing" + } + } + } + set type [$t get $n type] + switch -exact -- $type { + Command { + # (vi) + # No need to check children. May have some or not, + # and no specific sequence is required. + } + Word { + # (vii) + if {![llength [$t children $n]]} { + return -code error "$n :: Bad word node is leaf" + } + # (iv) + set last {} + foreach c [$t children $n] { + set type [$t get $c type] + if {![struct::set contains {Command Text} $type]} { + return -code error "$n :: Bad node type $type in word node" + } elseif {($type eq $last) && ($last eq "Text")} { + return -code error "$c :: Bad node $type, not semi-alternating" + } + set last $type + } + } + Text { + # (v) + if {[llength [$t children $n]]} { + return -code error "$n :: Bad text node is not leaf" + } + } + default { + # (iii) + return -code error "$n :: Bad node type $type" + } + } + } + return + } + + # # ## ### ##### ######## ############# + ## Internal methods, lexical processing + + proc String {} { + while 1 { + Note @String + if {[EOF]} break + if {[Command]} continue + if {[TextSegment]} continue + if {[Backslash]} continue + + Stop ;# Unexpected character + } + Note @EOF + return + } + + proc Command {} { + # A command starts with an opening bracket. + Note ?Command + if {![Match "\\A(\\\[)" range]} { + Note \t%No-Command + return 0 + } + Note !Command + + PushRoot [Node Command $range] + while {[Word]} { + # Step over any whitespace after the last word + Whitespace + # Command ends at the closing bracket + if {[Match "\\A(\\])" range]} break + if {![EOF]} continue + + Stop ;# Unexpected end of input + } + + Note !CommandStop + PopRoot + return 1 + } + + proc TextSegment {} { + # A text segment is anything up to a command start or start of + # a back slash sequence. + Note ?TextSegment + if {![Match "\\A(\[^\\\[\]+)" range]} { + Note \t%No-TextSegment + return 0 + } + Note !TextSegment + Node Text $range + return 1 + } + + proc TextSegmentWithoutQuote {} { + Note ?TextSegmentWithoutQuote + # A text segment without quote is anything up to a command + # start or start of a back slash sequence, or a double-quote + # character. + if {![Match "\\A(\[^\"\\\\\[\]+)" range]} { + Note \t%No-TextSegmentWithoutQuote + return 0 + } + Note !TextSegment + Node Text $range + return 1 + } + + proc Backslash {} { + Note ?Backslash + if { + ![Match "\\A(\\\\x\[a-fA-F0-9\]+)" range] && + ![Match "\\A(\\\\u\[a-fA-F0-9\]{1,4})" range] && + ![Match "\\A(\\\\\[0-2\]\[0-7\]{2})" range] && + ![Match "\\A(\\\\\[0-7\]{1,2})" range] && + ![Match {\A(\\[abfnrtv])} range] + } { + Note \t%No-Backslash + return 0 + } + Note !Backslash + Node Backslash $range + return 1 + } + + proc Word {} { + Note ?Word + if {[QuotedWord]} {return 1} + if {[BracedWord 0]} {return 1} + return [UnquotedWord] + } + + proc Whitespace {} { + Note ?Whitespace + if {![Match {\A([ \t]|(\\\n[ \t]*))+} range]} { + Note \t%No-Whitespace + return 0 + } + Note !Whitespace + return 1 + } + + proc QuotedWord {} { + # A quoted word starts with a double quote. + Note ?QuotedWord + if {![Match "\\A(\")" range]} { + Note \t%No-QuotedWord + return 0 + } + Note !QuotedWord + PushRoot [Node Word $range] + QuotedString + PopRoot + return 1 + } + + proc BracedWord {keepclose} { + # A braced word starts with an opening brace. + Note ?BracedWord/$keepclose + if {![Match "\\A(\{)" range]} { + Note \t%No-BracedWord/$keepclose + return 0 + } + Note !BracedWord/$keepclose + PushRoot [Node Word $range] + BracedString $keepclose + PopRoot + return 1 + } + + proc UnquotedWord {} { + Note !UnquotedWord + PushRoot [Node Word {}] + UnquotedString + PopRoot + return 1 + } + + proc QuotedString {} { + Note !QuotedString + while 1 { + Note !QuotedStringPart + # A quoted word (and thus the embedded string) ends with + # double quote. + if {[Match "\\A(\")" range]} { + return + } + # Now try to match possible pieces of the string. This is + # a repetition of the code in 'String', except for the + # different end condition above, and the possible embedded + # double quotes and continuation lines the outer string + # can ignore. + if {[Command]} continue + if {[Quote]} continue + if {[QuotedBraces]} continue + if {[Continuation]} continue + if {[Backslash]} continue + # Check after backslash recognition and processing + if {[TextSegmentWithoutQuote]} continue + + Stop ;# Unexpected character or end of input + } + return + } + + proc BracedString {keepclose} { + while 1 { + Note !BracedStringPart + # Closing brace encountered. keepclose is set if we are in + # a nested braced string. Only then do we have to put the + # brace as a regular text piece into the string + if {[Match "\\A(\})" range]} { + if {$keepclose} { + Node Text $range + } + return + } + # Special sequences. + if {[QuotedBraces]} continue + if {[Continuation]} continue + if {[BracedWord 1]} continue + # A backslash without a brace coming after is regular a + # character. + if {[Match {\A(\\)} range]} { + Node Text $range + continue + } + # Gooble sequence of regular characters. Stops at + # backslash and braces. Backslash stop is needed to handle + # the case of them starting a quoted brace. + if {[Match {\A([^\\\{\}]*)} range]} { + Node Text $range + continue + } + Stop ;# Unexpected character or end of input. + } + } + + proc UnquotedString {} { + while 1 { + Note !UnquotedStringPart + # Stop conditions + # - end of string + # - whitespace + # - Closing bracket (end of command the word is in) + if {[EOF]} return + if {[Whitespace]} return + if {[Peek "\\A(\\\])" range]} return + + # Match each possible type of part + if {[Command]} continue + if {[Quote]} continue + if {[Continuation]} continue + if {[Backslash]} continue + # Last, capture backslash sequences first. + if {[UnquotedTextSegment]} continue + + Stop ;# Unexpected character or end of input. + } + return + } + + proc UnquotedTextSegment {} { + # All chars but whitespace and brackets (start or end of + # command). + Note ?UnquotedTextSegment + if {![Match {\A([^\]\[\t\n ]+)} range]} { + Note \t%No-UnquotedTextSegment + return 0 + } + Note !UnquotedTextSegment + Node Text $range + return 1 + } + + proc Quote {} { + Note ?EmdeddedQuote + if {![Match "\\A(\\\")" range]} { + Note \t%No-EmdeddedQuote + return 0 + } + # Embedded double quote, not the end of the quoted string. + Note !EmdeddedQuote + Node Quote $range + return 1 + } + + proc Continuation {} { + Note ?ContinuationLine + if {![Match "\\A(\\\\\n\[ \t\]*)" range]} { + Note \t%No-ContinuationLine + return 0 + } + Note !ContinuationLine + Node Continuation $range + return 1 + } + + proc QuotedBraces {} { + Note ?QuotedBrace + if { + ![Match "\\A(\\\\\{)" range] && + ![Match "\\A(\\\\\})" range] + } { + Note \t%No-QuotedBrace + return 0 + } + Note !QuotedBrace + Node QBrace $range + return 1 + } + + # # ## ### ##### ######## ############# + ## Tree construction helper commands. + + proc Node {what range} { + set n [lindex [$mytree insert $myroot end] 0] + + Note "+\tNode $n @ $myroot $what" + + $mytree set $n type $what + $mytree set $n range $range + $mytree set $n line $myline + $mytree set $n col $mycol + + return $n + } + + proc PushRoot {x} { + Note "Push Root = $x" + $myrootstack push $myroot + set myroot $x + return + } + + proc PopRoot {} { + set myroot [$myrootstack pop] + Note "Pop Root = $myroot" + return + } + + # # ## ### ##### ######## ############# + ## Error reporting + + proc Stop {} { + ::variable myerr + set ahead [string range $mydata $mypos [expr {$mypos + 30}]] + set err [expr {![string length $ahead] ? "eof" : "char"}] + set ahead [string map [list \n \\n \t \\t \r \\r] [string range $ahead 0 0]] + set caller [lindex [info level -1] 0] + set msg "[format $myerr($err) $ahead $caller] at line ${myline}.$mycol" + set err [list doctools::tcl::parse $err $mypos $myline $mycol] + + return -code error -errorcode $err $msg + } + + # # ## ### ##### ######## ############# + ## Input processing. Match/peek lexemes, update location after + ## stepping over a range. Match = Peek + Step. + + proc EOF {} { + Note "?EOF($mypos >= $mysize) = [expr {$mypos >= $mysize}]" + return [expr {$mypos >= $mysize}] + } + + proc Match {pattern rv} { + upvar 1 $rv range + set ok [Peek $pattern range] + if {$ok} {Step $range} + return $ok + } + + proc Peek {pattern rv} { + upvar 1 $rv range + + Note Peek($pattern)----|[string map [list "\n" "\\n" "\t" "\\t"] [string range $mydata $mypos [expr {$mypos + 30}]]]| + + if {[regexp -start $mypos -indices -- $pattern $mydata -> range]} { + Note \tOK + return 1 + } else { + Note \tFAIL + return 0 + } + } + + proc Step {range} { + struct::list assign $range a e + + set mylastpos $mypos + + set mypos $e + incr mypos + + set pieces [split [string range $mydata $a $e] \n] + set delta [string length [lindex $pieces end]] + set nlines [expr {[llength $pieces] - 1}] + + if {$nlines} { + incr myline $nlines + set mycol $delta + } else { + incr mycol $delta + } + return + } + + # # ## ### ##### ######## ############# + ## Setup / Shutdown of parser/lexer + + proc Initialize {t text root} { + set mytree $t + if {$root eq {}} { + set myroot [$t rootname] + } else { + set myroot $root + } + + if {$myrootstack ne {}} Finalize + set myrootstack [struct::stack %AUTO%] + $myrootstack clear + + set mydata $text + set mysize [string length $mydata] + + set mypos 0 + set myline 1 + set mycol 0 + return + } + + proc Finalize {} { + $myrootstack destroy + set myrootstack {} + return + } + + # # ## ### ##### ######## ############# + ## Debugging helper commands + ## Add ability to disable these. + ## For the tree maybe add ability to dump through a callback ? + + proc Note {text} { + upvar 1 range range + set m {} + append m "$text " + if {[info exists range]} { + append m "($range) " + if {$range != {}} { + foreach {a e} $range break + append m " = \"[string map [list "\n" "\\n" "\t" "\\t"] \ + [string range $mydata $a $e]]\"" + } + } else { + append m "@$mypos ($myline/$mycol)" + } + #log::debug $m + puts $m + return + } + + #proc ShowTreeX {args} {} + proc ShowTreeX {t x} { + puts "=== \[ $x \] [string repeat = [expr {72 - [string length $x] - 9}]]" + $t walk root -order pre -type dfs n { + set prefix [string repeat .... [$t depth $n]] + puts "$prefix$n <[DictSort [$t getall $n]]>" + } + return + } + + proc Note {args} {} + proc ShowTree {args} {} + + # # ## ### ##### ######## ############# + + proc DictSort {dict} { + array set tmp $dict + set res {} + foreach k [lsort -dict [array names tmp]] { + lappend res $k $tmp($k) + } + return $res + } + + # # ## ### ##### ######## ############# + ## Parser state + + typevariable mytree {} ; # Tree we are working on + typevariable myroot {} ; # Current root to add nodes to. + typevariable myrootstack {} + + typevariable mydata {} ; # String to parse. + typevariable mysize 0 ; # Length of string to parse, cache + + typevariable mylastpos ; # Last current position. + typevariable mypos 0 ; # Current parse location, offset from + typevariable myline 1 ; # the beginning of the string, line + typevariable mycol 0 ; # we are on, and the column within the + # line. + + typevariable myerr -array { + char {Unexpected character '%1$s' in %2$s} + eof {Unexpected end of input in %2$s} + } + + + # # ## ### ##### ######## ############# + ## Configuration + + pragma -hasinstances no ; # singleton + pragma -hastypeinfo no ; # no introspection + pragma -hastypedestroy no ; # immortal + + ## + # # ## ### ##### ######## ############# +} + +namespace eval ::doctools::tcl { + namespace export parse +} + +# # ## ### ##### ######## ############# ##################### +## Ready + +package provide doctools::tcl::parse 0.1 +return |