#!/usr/bin/tclsh # $Id: taccle.tcl,v 1.6 2005/03/17 20:42:21 tang Exp $ set TACCLE_VERSION 1.1 #//# # Taccle is another compiler compiler written in pure Tcl. reads a # taccle specification file to generate pure Tcl code that # implements an LALR(1) parser. See the {@link README} file for # complete instructions. Additional information may be found at # {@link http://mini.net/tcl/taccle}. # # @author Jason Tang (tang@jtang.org) #//# # Process a definition on a single line, either a literal block or a # % directive. # # @param line text of a definition proc handle_defs {line} { # trim whitespace and remove any comments set line [strip_comments [string trim $line]] if {$line == ""} { return } if {$line == "%\{"} { handle_literal_block } else { # extract the keyword to the left of the first space and the # arguments (if any) to the right if {[regexp -line {^(\S+)\s+(.*)} $line foo keyword args] == 0} { set keyword $line set args "" } switch -- $keyword { "%token" { foreach token_name [split $args] { if {$token_name != ""} { # add the terminal token to the table add_token $token_name $::TERMINAL 0 0 nonassoc } } } "%left" - "%right" - "%nonassoc" { handle_precedence $::next_precedence [string range $keyword 1 end] $args incr ::next_precedence } "%start" { if {$args == ""} { taccle_error "Must supply a token with %start" $::PARAM_ERROR } set ::start_symbol $args } default { taccle_error "Unknown declaration \"$keyword\"" $::SYNTAX_ERROR } } } } # Start reading from the source file and copy everything between ^%\{$ # to ^%\}$ to the destination file. proc handle_literal_block {} { set end_defs 0 set lines_in_block 0 while {$end_defs == 0} { if {[gets $::src line] < 0} { taccle_error "No terminator to verbatim section found " $::SYNTAX_ERROR } elseif {[string trim $line] == "%\}"} { set end_defs 1 } else { puts $::dest $line } incr lines_in_block } incr ::line_count $lines_in_block } # Assigns operator precedence to each token in $tokens. Adds the # token as a TERMINAL to the token table. # # @param level integer value for token precedence # @param direction direction of precedence, either left, # right, or nonassoc # @param tokens list of terminals to which assign precedence proc handle_precedence {level direction tokens} { foreach token $tokens { if {[regexp -- {\A\'(.)\'\Z} $token foo c]} { add_token $c $::TERMINAL 1 $level $direction } else { add_token $token $::TERMINAL 0 $level $direction } } } # The nine steps to actually building a parser, given a string buffer # containing all of the rules. # # @param rules_buf a very large string consisting of all of the # grammar's rules proc build_parser {rules_buf} { # setp 0: parse the entire rules buffer into separate productions handle_rules_buf $rules_buf # step 1: rewrite the grammar, then augment it rewrite_grammar # step 2: determine which non-terminals are nullable generate_nullable_table # step 3: generate FIRST table for each element in the token table generate_first_table # step 4: now generate FOLLOW table for each element generate_follow_table # step 5: build canonical LR(1) table generate_lr1 # step 6: combine cores into LALR(1) table generate_lalr1 # step 7: wherever there exists a shift/reduce conflict, choose to # reduce wherever the precedence table dictates such resolve_precedences # step 8: check for infinite recursions check_recursions # step 9: finally take LALR(1) table and generate a state # transition matrix generate_lalr1_parse_table } # Parses the rules buffer, extracting each rule and adding # pseudo-rules wherever embedded actions exist. # # @param rules_buf remaining rules to handle proc handle_rules_buf {rules_buf} { # counts number of rules in the grammar # rule number 0 is reserved for the special augmentation S' -> S set ::rule_count 1 set prev_lhs "" # keep track of pseudo-rules (used for embedded actions) set pseudo_count 1 # add the special end marker set ::token_table("\$",t) $::TERMINAL set ::token_table("\$") 0 set ::token_id_table(0) "\$" set ::token_id_table(0,t) $::TERMINAL set ::prec_table(0) 0 set ::prec_table(0,dir) nonassoc # add the special error token add_token error $::TERMINAL 1 0 nonassoc while {[string length $rules_buf] > 0} { # consume blank lines if {[regexp -line -- {\A([[:blank:]]*\n)} $rules_buf foo blanks]} { set rules_buf [string range $rules_buf [string length $blanks] end] incr ::line_count continue } # extract left hand side if {[regexp -line -- {\A\s*(\w+)\s*:} $rules_buf foo lhs]} { add_token $lhs $::NONTERMINAL 0 0 nonassoc set prev_lhs $lhs } elseif {[regexp -line -- {\A\s*\|} $rules_buf foo]} { if {$prev_lhs == ""} { taccle_error "No previously declared left hand side" $::SYNTAX_ERROR } set lhs $prev_lhs } elseif {[regexp -line -- {\A\s*\Z} $rules_buf]} { # only whitespace left break } else { taccle_error "No left hand side found" $::SYNTAX_ERROR } set rules_buf [string range $rules_buf [string length $foo] end] # read the rule derivation, which is everything up to a bar or # semicolon set rhs "" set action "" set done_deriv 0 set num_lines 0 while {$rules_buf != "" && $done_deriv != 1} { switch -- [string index $rules_buf 0] { | { set done_deriv 1 } ; { set done_deriv 1 set prev_lhs "" set rules_buf [string range $rules_buf 1 end] } "\n" { incr num_lines append rhs " " set rules_buf [string range $rules_buf 1 end] } ' { append rhs [string range $rules_buf 0 2] set rules_buf [string range $rules_buf 3 end] } \{ { # keep scanning until end of action found set a "" set rp 1 set found_end 0 while {!$found_end && $rp < [string length $rules_buf]} { set c [string index $rules_buf $rp] if {$c == "\}"} { if {[info complete $a]} { set found_end 1 } else { append a "\}" } } elseif {$c == "\n"} { append a $c incr num_lines } else { append a $c } incr rp } if {!$found_end} { taccle_error "Unmatched `\{'" $::SYNTAX_ERROR } set action $a set rules_buf [string range $rules_buf $rp end] } default { set c [string index $rules_buf 0] if {$action != "" && ![string is space $c]} { # embedded action found; add a special rule for it set pseudo_name "@PSEUDO$pseudo_count" add_token $pseudo_name $::NONTERMINAL 0 0 nonassoc set ::rule_table($::rule_count,l) $pseudo_name set ::rule_table($::rule_count,d) "" set ::rule_table($::rule_count,dc) 0 set ::rule_table($::rule_count,a) $action set ::rule_table($::rule_count,e) 0 set ::rule_table($::rule_count,line) $::line_count append rhs "$pseudo_name " set action "" incr pseudo_count incr ::rule_count } else { append rhs $c set rules_buf [string range $rules_buf 1 end] } } } } if {$rules_buf == "" && $done_deriv == 0} { taccle_error "Rule does not terminate" $::SYNTAX_ERROR } set derivation [string trim $rhs] set deriv_list "" set deriv_count 0 set prec_next 0 foreach token [split $derivation] { if {$prec_next} { # check that argument to %prec is a terminal symbol if {![info exists ::token_table($token)] || \ $::token_table($token,t) != $::TERMINAL} { taccle_error "Argument to %prec is not a terminal symbol" $::GRAMMAR_ERROR } set ::rule_table($::rule_count,prec) $::token_table($token) set prec_next 0 continue } if {$token == "%prec"} { set prec_next 1 continue } if {[regexp -- {\A\'(.)\'\Z} $token foo c]} { add_token $c $::TERMINAL 1 0 nonassoc set token $c } if {$token != ""} { if {[string range $token 0 6] == "@PSEUDO"} { set ::rule_table([expr {$::rule_count - 1}],e) $deriv_count } lappend deriv_list $token incr deriv_count } } if {$prec_next} { taccle_error "%prec modifier has no associated terminal symbol" $::PARAM_ERROR } incr ::line_count $num_lines set ::rule_table($::rule_count,l) $lhs set ::rule_table($::rule_count,d) $deriv_list set ::rule_table($::rule_count,dc) [llength $deriv_list] set ::rule_table($::rule_count,a) $action set ::rule_table($::rule_count,line) $::line_count incr ::rule_count } } # Post-process the grammar by augmenting it and and replacing all # tokens with their id values. proc rewrite_grammar {} { set ::rule_table(0,l) "start'" if {[info exists ::start_symbol]} { if {![info exists ::token_table($::start_symbol)]} { taccle_error "Token given by %start does not exist" $::PARAM_ERROR } if {$::token_table($::start_symbol,t) == $::TERMINAL} { taccle_error "Token given by %start is a terminal." $::PARAM_ERROR } set ::rule_table(0,d) $::start_symbol } else { set ::rule_table(0,d) $::rule_table(1,l) } set ::rule_table(0,dc) 1 set ::rule_table(0,prec) 0 set ::start_token_id [add_token "start'" $::NONTERMINAL 0 0 nonassoc] set ::token_list [lsort -command tokid_compare $::token_list] # now go through grammar and replace all token names with their id # number for {set i 0} {$i < $::rule_count} {incr i} { set ::rule_table($i,l) $::token_table($::rule_table($i,l)) set new_deriv_list "" foreach deriv $::rule_table($i,d) { if {![info exists ::token_table($deriv)]} { taccle_error "Symbol $deriv used, but is not defined as a token and has no rules." $::GRAMMAR_ERROR } lappend new_deriv_list $::token_table($deriv) } set ::rule_table($i,d) $new_deriv_list # set the rule's precedence only if it was not already specified if {![info exist ::rule_table($i,prec)]} { set ::rule_table($i,prec) [get_prec $new_deriv_list] } } # check for unused tokens set used_list [concat "error" [recurse_dfs $::start_token_id ""]] foreach tok_id $::token_list { if {[lsearch -exact $used_list $tok_id] == -1} { taccle_warn "Token $::token_id_table($tok_id) unused." } else { lappend ::used_token_list $tok_id } } # add to the used token list {$} but /not/ start' set ::used_token_list [concat [lrange $::used_token_list 0 end-1] \ $::token_table("\$")] } # Determine which non-terminals are nullable. Any terminal which can # be simplified to just an epsilon transition is nullable. proc generate_nullable_table {} { set nullable_found 1 while {$nullable_found} { set nullable_found 0 foreach tok_id $::token_list { if {[info exist ::nullable_table($tok_id)]} { continue } if {$::token_id_table($tok_id,t) == $::TERMINAL} { set ::nullable_table($tok_id) 0 continue } for {set i 0} {$i < $::rule_count} {incr i} { set lhs $::rule_table($i,l) if {$lhs != $tok_id} { continue } set rhs [lindex $::rule_table($i,d) 0] if {$rhs == ""} { set ::nullable_table($lhs) 1 set nullable_found 1 } else { set nullable 0 foreach r $rhs { if {[info exists ::nullable_table($r)]} { set nullable $::nullable_table($r) break } } if {$nullable} { set ::nullable_table($lhs) 1 set nullable_found 1 } } } } } foreach tok_id $::token_list { if {![info exist ::nullable_table($tok_id)]} { set ::nullable_table($tok_id) 0 } } } # Generate the table of FIRST symbols for the grammar. proc generate_first_table {} { foreach tok_id $::token_list { generate_first_recurse $tok_id "" } } # Recursively calculates the FIRST set for a given token, handling # nullable terminals as well. # # @param tok_id id of token to generate FIRST set # @param history list of tokens already examined # @return list of tokens (including -1 for epsilon) in tok_id's FIRST set proc generate_first_recurse {tok_id history} { if {[lsearch -exact $history $tok_id] >= 0} { return "" } if {[info exists ::first_table($tok_id)]} { return $::first_table($tok_id) } if {$::token_id_table($tok_id,t) == $::TERMINAL} { set ::first_table($tok_id) $tok_id return $tok_id } # FIRST = union of all first non-terminals on rhs. if a # non-terminal is nullable, then add FIRST of the following # terminal to the FIRST set. keep repeating while nullable. set first_union "" for {set i 0} {$i < $::rule_count} {incr i} { set lhs $::rule_table($i,l) if {$lhs != $tok_id} { continue } if {$::rule_table($i,dc) == 0} { # empty rule, so add the special epsilon marker -1 to the FIRST set lappend first_union -1 } else { foreach r $::rule_table($i,d) { lconcat first_union [generate_first_recurse $r [concat $history $tok_id]] if {$::nullable_table($r) == 0} { break } } } } set ::first_table($tok_id) [lsort -increasing -unique $first_union] return $first_union } # Generate the table of FOLLOW symbols for the grammar. proc generate_follow_table {} { set ::follow_table($::token_table(start')) $::token_table("\$") foreach tok_id $::token_list { generate_follow_recurse $tok_id "" } } # Recursively calculates the FOLLOW set for a given token, handling # nullable terminals as well. # # @param tok_id id of token to generate FOLLOW set # @param history list of tokens already examined # @return list of tokens in tok_id's FOLLOW set proc generate_follow_recurse {tok_id history} { if {[lsearch -exact $history $tok_id] >= 0} { return "" } if {[info exists ::follow_table($tok_id)]} { return $::follow_table($tok_id) } set follow_union "" for {set i 0} {$i < $::rule_count} {incr i} { # if the token is on the rhs of the rule then FOLLOW includes # the FIRST of the token following it; if at end of rule (or # can be derived to end of rule) then FOLLOW includes the # FOLLOW of the lhs set rhs $::rule_table($i,d) for {set j [expr {$::rule_table($i,dc) - 1}]} {$j >= 0} {incr j -1} { set r [lindex $rhs $j] if {$r != $tok_id} { continue } set k [expr {$j + 1}] set gamma [lindex $rhs $k] if {$gamma != ""} { lconcat follow_union [all_but_eps $::first_table($gamma)] } set at_end_of_list 1 while {$k < $::rule_table($i,dc)} { if {![has_eps $::first_table([lindex $rhs $k])]} { set at_end_of_list 0 break } incr k } if {$at_end_of_list} { set lhs $::rule_table($i,l) lconcat follow_union [generate_follow_recurse $lhs [concat $history $tok_id]] } } } set ::follow_table($tok_id) [lsort -increasing -unique $follow_union] return $follow_union } # Construct a canonical LR(1) by taking the start rule (rule 0) and # successively adding closures/states until no more new states. proc generate_lr1 {} { # first add start rule to the closure list set first_item [list [list 0 $::token_table("\$") 0]] set first_closure [add_closure $first_item 0 1] set ::lr1_table(0) [concat $first_item $first_closure] # used to keep count of total number of states produced by LR(1) set ::next_lr1_state 1 # keep generating items until none remain for {set state_pointer 0} {$state_pointer < $::next_lr1_state} {incr state_pointer} { # iterate through each token, adding transitions to new state(s) set trans_list "" set oldclosure_list $::lr1_table($state_pointer) foreach tok_id $::token_list { set todo_list "" set working_list "" foreach item $oldclosure_list { foreach {rule lookahead position} $item {} if {$position >= $::rule_table($rule,dc)} { # at end of rule; don't expand (and remove it # from the list) continue } set nexttoken [lindex $::rule_table($rule,d) $position] if {$nexttoken == $tok_id} { # item's next token matches the one currently # saught; add it to the working list lappend working_list $item } else { # item was not used yet -- add it back to the # todo list lappend todo_list $item } } set oldclosure_list $todo_list if {$working_list != ""} { set new_closure "" foreach item $working_list { # move pointer ahead to the next position foreach {rule lookahead position} $item {} incr position set newitem [list $rule $lookahead $position] lappend new_closure $newitem } set new_closure [concat $new_closure \ [add_closure $new_closure 0 [llength $working_list]]] # add a transition out of this state -- to a # previously examined state if possible, or else # create a new state with my new closure set next_state -1 for {set i 0} {$i < $::next_lr1_state} {incr i} { if {[lsort $::lr1_table($i)] == [lsort $new_closure]} { set next_state $i break } } if {$next_state == -1} { # create a new state set ::lr1_table($::next_lr1_state) $new_closure lappend trans_list [list $tok_id $::next_lr1_state] incr ::next_lr1_state } else { # reuse existing state lappend trans_list [list $tok_id $next_state] } } } set ::lr1_table($state_pointer,trans) [lsort -command tokid_compare -index 0 $trans_list] } } # Successively add closures from LR(1) table to LALR(1) table merging # kernels with similar cores. proc generate_lalr1 {} { for {set i 0} {$i < $::next_lr1_state} {incr i} { # as matching closures are found change their mapping here set state_mapping_table($i) $i } # go through all elements of LR(1) table and generate their cores. # this will make future comparisons easier. for {set i 0} {$i < $::next_lr1_state} {incr i} { set core "" foreach item $::lr1_table($i) { lappend core [list [lindex $item 0] [lindex $item 2]] } set core_table($i) [lsort $core] } lappend new_lalr_states(0) 0 for {set i 1} {$i < $::next_lr1_state} {incr i} { set found_matching 0 for {set j 0} {$j < $i} {incr j} { if {$core_table($i) == $core_table($j)} { # found a matching core -- change its mapping set state_mapping_table($i) $state_mapping_table($j) # because this state is being eliminated, shuffle all # future states down one for {set k [expr {$i + 1}]} {$k < $::next_lr1_state} {incr k} { incr state_mapping_table($k) -1 } # merge state $i into state $j lappend new_lalr_states($j) $i set found_matching 1 break } } if {!$found_matching} { lappend new_lalr_states($i) $i } } # now copy items from LR(1) table to LALR(1) table set ::next_lalr1_state 0 for {set i 0} {$i < $::next_lr1_state} {incr i} { if {![info exists new_lalr_states($i)]} { # state no longer exists (it got merged into another one) continue } # first merge together all lookaheads set ::lalr1_table($::next_lalr1_state) $::lr1_table([lindex $new_lalr_states($i) 0]) foreach state [lrange $new_lalr_states($i) 1 end] { set ::lalr1_table($::next_lalr1_state) \ [merge_closures $::lalr1_table($::next_lalr1_state) $::lr1_table($state)] } # now rewrite the transition table foreach trans $::lr1_table($i,trans) { foreach {symbol new_state} $trans {} lappend ::lalr1_table($::next_lalr1_state,trans) \ [list $symbol $state_mapping_table($new_state)] } incr ::next_lalr1_state } } # Takes the LALR(1) table and resolves precedence issues by removing # transitions whenever the precedence values indicate a reduce instead # of a shift. proc resolve_precedences {} { for {set i 0} {$i < $::next_lalr1_state} {incr i} { # scan through all kernel items that are at the end of their # rule. for those, use the precedence table to decide to keep # a transition (a shift) or not (a reduce) foreach item $::lalr1_table($i) { foreach {rule lookahead position} $item {} if {$position < $::rule_table($rule,dc) || \ ![info exist ::lalr1_table($i,trans)]} { continue } set rule_prec_tok $::rule_table($rule,prec) set rule_prec_level $::prec_table($rule_prec_tok) set rule_prec_dir $::prec_table($rule_prec_tok,dir) set new_trans "" foreach trans $::lalr1_table($i,trans) { set trans_tok [lindex $trans 0] if {[lsearch $lookahead $trans_tok] == -1} { lappend new_trans $trans continue } set trans_tok_level $::prec_table($trans_tok) set trans_tok_dir $::prec_table($trans_tok,dir) if {$rule_prec_dir == "nonassoc" || \ $trans_tok_dir == "nonassoc" || \ $rule_prec_level < $trans_tok_level || \ ($rule_prec_level == $trans_tok_level && $rule_prec_dir == "right")} { # precedence says to shift, so keep this transition lappend new_trans $trans } else { taccle_warn "Conflict in state $i between rule $rule and token \"$trans_tok\", resolved as reduce." } } set ::lalr1_table($i,trans) $new_trans } } } # Check if the grammar contains any infinite recursions. proc check_recursions {} { set cleared "" for {set i 0} {$i < $::next_lalr1_state} {incr i} { if {[lsearch -exact $cleared $i] >= 0} { continue } set cleared [get_cleared $i {} $cleared] } } # Recursively performs a DFS search through the LALR(1) table to check # for cycles. In each node check if the position is at the end of any # rule; this marks the node is "reducible" and it is added to the # 'cleared' list. Otherwise recurse on each terminal transitioning # out of this state. If a state and all of its transitions are not # reducible then abort with an error. # # @param state which state within the LALR(1) table to examine # @param history a list of states so far examined on this pass # @param cleared a list of states which have already been verified as reducible # # @return a new cleared list, or an empty list of this state is not reducible proc get_cleared {state history cleared} { if {[lsearch -exact $cleared $state] >= 0} { return $cleared } if {[lsearch -exact $history $state] >= 0} { return {} } # check if any items in this closure are reducible; if so then # this state passes set token -1 foreach item $::lalr1_table($state) { foreach {rule lookahead position} $item {} if {$position == $::rule_table($rule,dc)} { return [concat $cleared $state] } if {$position == 0} { set token $::rule_table($rule,l) } } # recursively check all terminals transitioning out of this state; # if none of the new states eventually reduce then report this as # a cycle foreach trans $::lalr1_table($state,trans) { foreach {tok_id nextstate} $trans {} if {$::token_id_table($tok_id,t) == $::TERMINAL} { set retval [get_cleared $nextstate [concat $history $state] $cleared] if {[llength $retval] > 0} { return [concat $retval $state] } } } if {$token == -1} { puts stderr "OOPS: should not have gotten here!" exit -1 } set ::line_count $::rule_table($rule,line) taccle_error "Token $::token_id_table($token) appears to recurse infinitely" $::GRAMMAR_ERROR } # Takes the LALR(1) table and generates the LALR(1) transition table. # For terminals do a shift to the new state. For non-terminals reduce # when the next token is a lookahead. Detect shift/reduce conflicts; # resolve by giving precedence to shifting. Detect reduce/reduce # conflicts and resolve by reducing to the first rule found. proc generate_lalr1_parse_table {} { for {set i 0} {$i < $::next_lalr1_state} {incr i} { foreach item $::lalr1_table($i) { foreach {rule lookahead position} $item {} if {$position >= $::rule_table($rule,dc)} { if {$rule == 0} { set command "accept" } else { set command "reduce" } set token_list $lookahead # target for a reduce/accept is which rule to use # while accepting set target $rule } else { set token [lindex $::rule_table($rule,d) $position] if {$::token_id_table($token,t) == $::TERMINAL} { set command "shift" } else { set command "goto" } set token_list [list $token] # target for a shift/goto is the new state to move to set target "" foreach trans $::lalr1_table($i,trans) { foreach {tok_id nextstate} $trans {} if {$tok_id == $token} { set target $nextstate break } } # this token must have been consumed by shift/reduce # conflict resolution through the precedence table # (above) if {$target == ""} { continue } } foreach token $token_list { # check for shift/reduce conflicts if {[info exists ::lalr1_parse($i:$token)] && \ $::lalr1_parse($i:$token) != $command} { # shifting takes precedence, so overwrite table # entry if needed if {$::lalr1_parse($i:$token) == "shift"} { taccle_warn "Shift/Reduce error in state $i, token \"$::token_id_table($token)\", resolved by keeping shift." break } taccle_warn "Shift/Reduce error in state $i between rule $::lalr1_parse($i:$token,target) and token \"$::token_id_table($token)\", resolved as shift." unset ::lalr1_parse($i:$token,target) } set ::lalr1_parse($i:$token) $command # check for reduce/reduce conflicts # (theoretically it is impossible to have a shift/shift error) if {[info exists ::lalr1_parse($i:$token,target)] && \ $::lalr1_parse($i:$token,target) != $target} { taccle_warn "Reduce/Reduce error in state $i, token \"$::token_id_table($token)\", resolved by reduce to rule $::lalr1_parse($i:$token,target)." break } set ::lalr1_parse($i:$token,target) $target } } } } ###################################################################### # utility routines that actually handle writing parser to output files # Writes to the destination file utility functions called by yyparse # as well as by user-supplied actions. proc write_parser_utils {} { puts $::dest " ###### # Begin autogenerated taccle (version $::TACCLE_VERSION) routines. # Although taccle itself is protected by the GNU Public License (GPL) # all user-supplied functions are protected by their respective # author's license. See http://mini.net/tcl/taccle for other details. ###### proc ${::P}ABORT \{\} \{ return -code return 1 \} proc ${::P}ACCEPT \{\} \{ return -code return 0 \} proc ${::p}clearin \{\} \{ upvar ${::p}token t set t \"\" \} proc ${::p}error \{s\} \{ puts stderr \$s \} proc ${::p}setupvalues \{stack pointer numsyms\} \{ upvar 1 1 y set y \{\} for \{set i 1\} \{\$i <= \$numsyms\} \{incr i\} \{ upvar 1 \$i y set y \[lindex \$stack \$pointer\] incr pointer \} \} proc ${::p}unsetupvalues \{numsyms\} \{ for \{set i 1\} \{\$i <= \$numsyms\} \{incr i\} \{ upvar 1 \$i y unset y \} \}" } # Writes to the destination file the actual parser including LALR(1) # table. proc write_parser {} { write_array $::dest ::${::p}table [array get ::lalr1_parse] write_array $::dest ::${::p}rules [array get ::rule_table *l] write_array $::dest ::${::p}rules [array get ::rule_table *dc] write_array $::dest ::${::p}rules [array get ::rule_table *e] puts $::dest "\nproc ${::p}parse {} { set ${::p}state_stack {0} set ${::p}value_stack {{}} set ${::p}token \"\" set ${::p}accepted 0 while {\$${::p}accepted == 0} { set ${::p}state \[lindex \$${::p}state_stack end\] if {\$${::p}token == \"\"} { set ::${::p}lval \"\" set ${::p}token \[${::p}lex\] set ${::p}buflval \$::${::p}lval } if {!\[info exists ::${::p}table(\$${::p}state:\$${::p}token)\]} { \# pop off states until error token accepted while {\[llength \$${::p}state_stack\] > 0 && \\ !\[info exists ::${::p}table(\$${::p}state:error)]} { set ${::p}state_stack \[lrange $${::p}state_stack 0 end-1\] set ${::p}value_stack \[lrange $${::p}value_stack 0 \\ \[expr {\[llength $${::p}state_stack\] - 1}\]\] set ${::p}state \[lindex $${::p}state_stack end\] } if {\[llength \$${::p}state_stack\] == 0} { ${::p}error \"parse error\" return 1 } lappend ${::p}state_stack \[set ${::p}state \$::${::p}table($${::p}state:error,target)\] lappend ${::p}value_stack {} \# consume tokens until it finds an acceptable one while {!\[info exists ::${::p}table(\$${::p}state:\$${::p}token)]} { if {\$${::p}token == 0} { ${::p}error \"end of file while recovering from error\" return 1 } set ::${::p}lval {} set ${::p}token \[${::p}lex\] set ${::p}buflval \$::${::p}lval } continue } switch -- \$::${::p}table(\$${::p}state:\$${::p}token) { shift { lappend ${::p}state_stack \$::${::p}table(\$${::p}state:\$${::p}token,target) lappend ${::p}value_stack \$${::p}buflval set ${::p}token \"\" } reduce { set ${::p}rule \$::${::p}table(\$${::p}state:\$${::p}token,target) set ${::p}l \$::${::p}rules(\$${::p}rule,l) if \{\[info exists ::${::p}rules(\$${::p}rule,e)\]\} \{ set ${::p}dc \$::${::p}rules(\$${::p}rule,e) \} else \{ set ${::p}dc \$::${::p}rules(\$${::p}rule,dc) \} set ${::p}stackpointer \[expr {\[llength \$${::p}state_stack\]-\$${::p}dc}\] ${::p}setupvalues \$${::p}value_stack \$${::p}stackpointer \$${::p}dc set _ \$1 set ::${::p}lval \[lindex \$${::p}value_stack end\] switch -- \$${::p}rule {" for {set i 0} {$i < $::rule_count} {incr i} { if {[info exists ::rule_table($i,a)] && [string trim $::rule_table($i,a)] != ""} { puts $::dest " $i { $::rule_table($i,a) }" } } puts $::dest " } ${::p}unsetupvalues \$${::p}dc # pop off tokens from the stack if normal rule if \{!\[info exists ::${::p}rules(\$${::p}rule,e)\]\} \{ incr ${::p}stackpointer -1 set ${::p}state_stack \[lrange \$${::p}state_stack 0 \$${::p}stackpointer\] set ${::p}value_stack \[lrange \$${::p}value_stack 0 \$${::p}stackpointer\] \} # now do the goto transition lappend ${::p}state_stack \$::${::p}table(\[lindex \$${::p}state_stack end\]:\$${::p}l,target) lappend ${::p}value_stack \$_ } accept { set ${::p}accepted 1 } goto - default { puts stderr \"Internal parser error: illegal command \$::${::p}table(\$${::p}state:\$${::p}token)\" return 2 } } } return 0 } ###### # end autogenerated taccle functions ###### " } # Pretty-prints an array to a file descriptor. Code contributed by # jcw. # # @param fd file descriptor to which write the array # @param name name of array to declare within the file # @param values list of 2-ple values proc write_array {fd name values} { puts $fd "\narray set $name {" foreach {x y} $values { puts $fd " [list $x $y]" } puts $fd "}" } # Writes a header file that should be [source]d by the lexer. proc write_header_file {} { # scan through token_table and write out all non-implicit terminals foreach tok_id $::token_list { if {$::token_id_table($tok_id,t) == $::TERMINAL && \ [string is integer $tok_id] && $tok_id >= 256} { set token $::token_id_table($tok_id) puts $::header "set ::${token} $tok_id" } } puts $::header "set ::${::p}lval \{\}" } ###################################################################### # utility functions # Adds a token to the token table, checking that it does not already # exist. Returns the ID for the token (either old one if token # already exists or the newly assigned id value). # # @param token_name name of token to add # @param type type of token, either $::TERMINAL or $::NON_TERMINAL # @param implicit for $::TERMINAL tokens, 1 if the token is implicitly # declared # @param prec_level precedence level for token # @param prec_dir direction of precedence, either left, # right, or nonassoc # @return id value for this token proc add_token {token_name type implicit prec_level prec_dir} { if {$token_name == "\$"} { taccle_error "The token '$' is reserved and may not be used in productions." $::SYNTAX_ERROR } if {$token_name == "\{" || $token_name == 0} { taccle_error "Literal value $token_name not allowed; define a %token instead" $::SYNTAX_ERROR } if [info exists ::token_table($token_name)] { set id $::token_table($token_name) if {$::token_table($token_name,t) == $type} { # token already exists; modify its precedence level if necessary if {$::prec_table($id) < $prec_level} { taccle_warn "Redefining precedence of $token_name" set ::prec_table($id) $prec_level set ::prec_table($id,dir) $prec_dir } set ::token_id_table($id,line) $::line_count return $id } set old_type [expr {$::token_table($token_name,t) == 1 ? "non-" : ""}]terminal taccle_error "Token $token_name already declared as a $old_type" $::GRAMMAR_ERROR } if $implicit { set ::token_table($token_name) $token_name set id $token_name } else { set ::token_table($token_name) $::next_token_id set id $::next_token_id incr ::next_token_id } set ::token_table($token_name,t) $type set ::token_id_table($id) $token_name set ::token_id_table($id,t) $type set ::token_id_table($id,line) $::line_count lappend ::token_list $id set ::prec_table($id) $prec_level set ::prec_table($id,dir) $prec_dir return $id } # Adds closures to each item on $closure_list, starting from the index # $closure_pointer. Keeps adding closures until no more are added. # # @param closure_list list of closures to process # @param closure_pointer index into $closure_list to which start # @param original_length original size of $closure_list # @return list of closures added proc add_closure {closure_list closure_pointer original_length} { set orig_closure_pointer [expr {$closure_pointer + $original_length}] # keep adding items to the closure list until no more while {$closure_pointer < [llength $closure_list]} { set item [lindex $closure_list $closure_pointer] incr closure_pointer foreach {rule lookahead position} $item {} set mylength $::rule_table($rule,dc) if {$position < $mylength} { set nexttoken [lindex $::rule_table($rule,d) $position] if {$::token_id_table($nexttoken,t) == $::TERMINAL} { continue } # the lookahead is the FIRST of the rule /after/ # nexttoken, or the current lookahead if at the end of # rule. if the next token is NULLABLE then the lookahead # includes that which FOLLOWS it set beta_pos [expr {$position + 1}] if {$beta_pos >= $mylength} { set nextfirst $lookahead } else { set n [lindex $::rule_table($rule,d) $beta_pos] set nextfirst [all_but_eps $::first_table($n)] if {$::nullable_table($n)} { set nextfirst [lsort -unique [concat $nextfirst $::follow_table($n)]] } } for {set rule_num 0} {$rule_num < $::rule_count} {incr rule_num} { if {$::rule_table($rule_num,l) != $nexttoken} { continue } set newitem [list $rule_num $nextfirst 0] set closure_list [merge_closures $closure_list [list $newitem]] } } } return [lrange $closure_list $orig_closure_pointer end] } # Recurses through all productions, recording which tokens are # actually used by the grammar. Tokens used to indicate a rule's # precedence are also added. Returns a list of tokens used; note that # this list can (and probably will) include duplicates. # # @param tok_id id of token to start # @param history list of tok_id's already examined # @return list of tokens used proc recurse_dfs {tok_id history} { if {[lsearch -exact $history $tok_id] >= 0} { return $history } if {$::token_id_table($tok_id,t) == $::TERMINAL} { return [concat $history $tok_id] } lappend history $tok_id for {set i 0} {$i < $::rule_count} {incr i} { set lhs $::rule_table($i,l) if {$lhs == $tok_id} { foreach deriv $::rule_table($i,d) { set history [recurse_dfs $deriv $history] } lconcat history $::rule_table($i,prec) } } return $history } # Given a line, returns a new line with any comments removed. # # @param line string with a possible comment # @return line with any commens removed proc strip_comments {line} { regexp -- {\A([^\#]*)} $line foo line return $line } # Combines unique elements of the two closures, also merging lookahead # symbols, and returns the new closure. # # @param closure1 first closure to merge # @param closure2 second closure to merge # @return $closure1 and $closure2 merged together, with duplicated removed proc merge_closures {closure1 closure2} { foreach item2 $closure2 { foreach {rule2 lookahead2 pos2} $item2 {} set found_match 0 for {set i 0} {$i < [llength $closure1]} {incr i} { foreach {rule1 lookahead1 pos1} [lindex $closure1 $i] {} if {$rule2 == $rule1 && $pos2 == $pos1} { set lookahead1 [lsort -uniq [concat $lookahead1 $lookahead2]] lset closure1 $i [list $rule1 $lookahead1 $pos1] set found_match 1 break } } if {!$found_match} { lappend closure1 $item2 } } return $closure1 } # Compares two token id values. If the two are integers then uses # their values for comparison; otherwise performs a string comparison. # Integer values are always "greater than" strings. # # @param a first token id # @param b second token id # @return -1 if a is less than b, 1 if # a is greater, otherwise 0 proc tokid_compare {a b} { if {[string is integer $a] && [string is integer $b]} { if {$a < $b} { return -1 } else { return 1 } } if [string is integer $a] { return 1 } if [string is integer $b] { return -1 } return [string compare $a $b] } # Given a list, returns all everything in it except for any elements # of value "-1", which corresponds with the epsilon symbol. # # @param first_list list of tokens (presumably a FIRST set) # @return new list with all -1 values removed proc all_but_eps {first_list} { set new_list "" foreach tok $first_list { if {$tok != -1} { lappend new_list $tok } } return $new_list } # Returns truth if the element value "-1", corresponding with the # epsilon symbol, resides within the first list $first_list. # # @param first_list list of tokens (presumably a FIRST set) # @return 1 if $first_list has the element -1, 0 otherwise proc has_eps {first_list} { foreach tok $first_list { if {$tok == -1} { return 1 } } return 0 } # Given a list of tokens, returns the token with highest precedence # level. # # @param tok_list list of token ids # @return token with highest precedence; in case of tie returns first # one found proc get_prec {tok_list} { set prec_token 0 foreach tok $tok_list { if {$::prec_table($tok) > $::prec_table($prec_token)} { set prec_token $tok } } return $prec_token } # Appends the first list a flattened version of the second, but only # if the second is non-empty. # # @param list first list # @param lists list of lists to append # @return new list proc lconcat {list lists} { upvar $list l if {$lists != ""} { set l [concat $l $lists] } else { return $l } } # Retrives a parameter from the options list. If no parameter exists # then abort with an error very reminisicent of C's # getopt function; otherwise increment # param_num by one. # # @param param_list list of parameters from the command line # @param param_num index into param_list to retrieve # @param param_name name of the parameter, used when reporting an error # @return the $param_num'th element into $param_list proc get_param {param_list param_num param_name} { upvar $param_num pn incr pn if {$pn >= [llength $param_list]} { puts stderr "taccle: option requires an argument -- $param_name" exit $::PARAM_ERROR } return [lindex $param_list $pn] } # Display to standard error a message, then abort the program. proc taccle_error {message returnvalue} { if {$::verbose != ""} { puts $::verbose "$message (line $::line_count)" } puts stderr "$message (line $::line_count)" exit $returnvalue } # Display a message to standard error if warnings enabled. Write to # the verbose output file if verbose is enabled. proc taccle_warn {message} { if {$::show_warnings} { puts stderr $message } if {$::verbose != ""} { puts $::verbose "$message" } } # Print to a particular channel a brief summary of taccle command line # options. proc print_taccle_help {chan} { puts $chan "taccle: a Tcl compiler compiler Usage: taccle \[options\] file file a taccle grammar specification file Options: -h print this help message and quit -d write extra output file containing Tcl code to be \[source\]d by yylex -o FILE specify name to write parser -v write extra output file containing descriptions of all parser states and extended information about conflicts -w display all warnings to standard error -p PREFIX change default yy prefix to PREFIX --version print taccle version and quit For more information see http://mini.net/tcl/taccle" } # Displays to standard out the taccle version, then exits program. proc print_taccle_version {} { puts "taccle version $::TACCLE_VERSION" exit 0 } ###################################################################### # internal debugging routines proc print_symbol_table {} { puts $::verbose "token table:" puts $::verbose [format "%-5s %-10s %s" "id" "token" "type"] foreach tok_id $::token_list { set token $::token_id_table($tok_id) if {$::token_id_table($tok_id,t) == $::TERMINAL} { set type "terminal" } else { set type "non-terminal" } puts $::verbose [format "%-5s %-10s %s" $tok_id $token $type] } } proc print_rule_table {} { puts $::verbose "rule table:" for {set i 0} {$i < $::rule_count} {incr i} { set lhs $::token_id_table($::rule_table($i,l)) set deriv_list "" foreach deriv $::rule_table($i,d) { lappend deriv_list $::token_id_table($deriv) } if {$deriv_list == ""} { set deriv_list "\#\# empty \#\#" } puts $::verbose [format "%3d: %-10s -> %s" $i $lhs $deriv_list] } } proc print_first_table {} { puts $::verbose "first table:" foreach tok_id $::token_list { if {$tok_id == -1} { continue } set token $::token_id_table($tok_id) set first_list "" foreach first $::first_table($tok_id) { if {$first >= 0} { lappend first_list $::token_id_table($first) } } puts $::verbose [format "%-10s => %s" $token $first_list] } } proc print_closure {closure_list indent dest} { foreach item $closure_list { foreach {rule lookahead position} $item {} set lhs $::token_id_table($::rule_table($rule,l)) set deriv_list "" set i 0 foreach deriv $::rule_table($rule,d) { if {$i == $position} { lappend deriv_list "." } lappend deriv_list $::token_id_table($deriv) incr i } if {$position == $::rule_table($rule,dc)} { lappend deriv_list "." } set lookahead_list "" foreach la $lookahead { lappend lookahead_list $::token_id_table($la) } puts $dest \ [format "%*s %-10s -> %s, %s" $indent "" $lhs $deriv_list $lookahead_list] } } proc print_lr_table {table_name num_entries} { upvar $table_name table for {set i 0} {$i < $num_entries} {incr i} { puts $::verbose "state $i:" print_closure $table($i) 2 $::verbose if {[info exists table($i,trans)] && [llength $table($i,trans)] >= 1} { puts -nonewline $::verbose [format "%*s transitions:" 2 ""] foreach trans $table($i,trans) { foreach {tok_id nextstate} $trans {} puts -nonewline $::verbose " $::token_id_table($tok_id) => s$nextstate" } puts $::verbose "" } puts $::verbose "" } } proc print_lr1_table {} { puts $::verbose "lr(1) table:" print_lr_table ::lr1_table $::next_lr1_state } proc print_lalr1_table {} { puts $::verbose "lalr(1) table:" print_lr_table ::lalr1_table $::next_lalr1_state } proc print_lalr1_parse {} { puts $::verbose "generated lalr(1) parse table:" puts -nonewline $::verbose "state " foreach tok_id $::used_token_list { set token [string range $::token_id_table($tok_id) 0 4] puts -nonewline $::verbose [format " %-5s" $token] } puts $::verbose "" for {set i 0} {$i < $::next_lalr1_state} {incr i} { puts -nonewline $::verbose [format "%4s " $i] foreach tok_id $::used_token_list { if [info exists ::lalr1_parse($i:$tok_id)] { switch -- $::lalr1_parse($i:$tok_id) { shift { set s "sh" } goto { set s "go" } reduce { set s "re" } accept { set s "accept" } } if {$s != "accept"} { append s $::lalr1_parse($i:$tok_id,target) } puts -nonewline $::verbose [format " %-5s" $s] } else { puts -nonewline $::verbose " " } } puts $::verbose "" } } ###################################################################### # other taccle functions # Parse the taccle command line. proc taccle_args {argv} { set argvp 0 set write_defs_file 0 set write_verbose_file 0 set out_filename "" set ::p "yy" set ::P "YY" set ::show_warnings 0 while {$argvp < [llength $argv]} { set arg [lindex $argv $argvp] switch -- $arg { "-d" { set write_defs_file 1 } "-h" - "--help" { print_taccle_help stdout; exit 0 } "-o" { set out_filename [get_param $argv argvp "o"] } "-v" - "--verbose" { set write_verbose_file 1 } "-w" { set ::show_warnings 1 } "-p" { set prefix [get_param $argv argvp "p"] set ::p [string tolower $prefix] set ::P [string toupper $prefix] } "--version" { print_taccle_version } default { if {[string index $arg 0] != "-"} { break } else { puts stderr "taccle: unknown option $arg" print_taccle_help stderr exit $::PARAM_ERROR } } } incr argvp } if {$argvp >= [llength $argv]} { puts stderr "taccle: no grammar file given" print_taccle_help stderr exit $::IO_ERROR } set in_filename [lindex $argv $argvp] if {$out_filename == ""} { set out_filename [file rootname $in_filename] append out_filename ".tcl" } if [catch {open $in_filename r} ::src] { puts stderr "Could not open grammar file '$in_filename'." exit $::IO_ERROR } if [catch {open $out_filename w} ::dest] { puts stderr "Could not open output file '$out_filename'." exit $::IO_ERROR } if $write_defs_file { set header_filename "[file rootname $out_filename].tab.tcl" if [catch {open $header_filename w} ::header] { puts stderr "Could not open header file '$header_filename'." exit $::IO_ERROR } } else { set ::header "" } if $write_verbose_file { set verbose_filename "[file rootname $out_filename].output" if [catch {open $verbose_filename w} ::verbose] { puts stderr "Could not open verbose file '$verbose_filename'." exit $::IO_ERROR } } else { set ::verbose "" } } # Actually do the parser generation. proc taccle_main {} { set ::line_count 0 # counts number of rules in the grammar # rule number 0 is reserved for the special augmentation S' -> S set ::rule_count 1 # used to keep track of token IDs: # 0 is reserved for the special token '$' # 256 for the error token set ::next_token_id 257 # used to keep track of operator precedence level # level 0 is reserved for terminals without any precedence set ::next_precedence 1 # keep track of where within the file I am: # definitions, rules, or subroutines set file_state definitions while {[gets $::src line] >= 0} { incr ::line_count if {$line == "%%"} { if {$file_state == "definitions"} { set file_state "rules" } elseif {$file_state == "rules"} { set file_state "subroutines" } else { taccle_error "Syntax error." $::SYNTAX_ERROR } } else { if {$file_state == "definitions"} { handle_defs $line } elseif {$file_state == "rules"} { # keep reading the rest of the file until EOF or # another '%%' appears set rules_buf [strip_comments $line] while {[gets $::src line] >= 0 && $file_state == "rules"} { if {$line == "%%"} { set file_state "subroutines" } else { append rules_buf "\n" [strip_comments $line] } } build_parser $rules_buf set file_state "subroutines" write_parser_utils write_parser } else { # file_state is subroutines -- copy verbatim to output file puts $::dest $line } } } if {$::header != ""} { write_header_file } if {$::verbose != ""} { print_symbol_table puts $::verbose "" print_rule_table puts $::verbose "" #print_first_table #puts $::verbose "" #print_lr1_table print_lalr1_table print_lalr1_parse } } ###################################################################### # start of actual script set IO_ERROR 1 set SYNTAX_ERROR 2 set PARAM_ERROR 3 set GRAMMAR_ERROR 4 set TERMINAL 0 set NONTERMINAL 1 taccle_args $argv taccle_main