diff options
Diffstat (limited to 'tcllib/modules/grammar_fa/faop.tcl')
-rw-r--r-- | tcllib/modules/grammar_fa/faop.tcl | 1618 |
1 files changed, 1618 insertions, 0 deletions
diff --git a/tcllib/modules/grammar_fa/faop.tcl b/tcllib/modules/grammar_fa/faop.tcl new file mode 100644 index 0000000..5c0804e --- /dev/null +++ b/tcllib/modules/grammar_fa/faop.tcl @@ -0,0 +1,1618 @@ +# -*- tcl -*- +# Grammar / FA / Operations + +# ### ### ### ######### ######### ######### +## Package description + +# ### ### ### ######### ######### ######### +## Requisites + +package require struct::list ; # Extended list operations. +package require struct::set ; # Extended set operations. + +# ### ### ### ######### ######### ######### +## Implementation + +namespace eval ::grammar::fa::op { + + # ### ### ### ######### ######### ######### + ## API. Structure / Language / Compilation + + proc reverse {fa} {} + proc complete {fa {sink {}}} {} + proc remove_eps {fa} {} + proc trim {fa {what !reachable|!useful}} {} + proc determinize {fa {mapvar {}} {idstart 0}} {} + proc minimize {fa {mapvar {}}} {} + + proc complement {fa} {} + proc kleene {fa} {} + proc optional {fa} {} + proc union {fa fb {mapvar {}}} {} + proc intersect {fa fb {mapvar {}} {idstart 0}} {} + proc difference {fa fb {mapvar {}}} {} + proc concatenate {fa fb {mapvar {}}} {} + + proc fromRegex {fa regex {over {}}} {} + + proc toRegexp {fa} {} + proc toRegexp2 {fa} {} + + proc simplifyRegexp {rex} {} + proc toTclRegexp {rex symdict} {} + + # ### ### ### ######### ######### ######### + + namespace export reverse complete remove_eps trim \ + determinize minimize complement kleene \ + optional union intersect difference \ + concatenate fromRegex toRegexp toRegexp2 \ + simplifyRegexp toTclRegexp + + # ### ### ### ######### ######### ######### + ## Internal data structures. + + variable cons {} + + # ### ### ### ######### ######### ######### +} + +# ### ### ### ######### ######### ######### +## API implementation. Structure + +proc ::grammar::fa::op::reverse {fa} { + # Reversal means that all transitions change their direction + # and start and final states are swapped. + + # Note that reversed FA might not be deterministic, even if the FA + # itself was. + + # One loop is not enough for this. If we reverse the + # transitions for a state immediately we may modify a state + # which has not been processed yet. And when we come to this + # state we reverse already reversed transitions, creating a + # complete mess. Thus two loops, one to collect the current + # transitions (and also remove them), and a second to insert + # the reversed transitions. + + set tmp [$fa finalstates] + $fa final set [$fa startstates] + $fa start set $tmp + + # FUTURE : Method to retrieve all transitions + # FUTURE : Method to delete all transitions + + set trans {} + foreach s [$fa states] { + foreach sym [$fa symbols@ $s] { + lappend trans $s $sym [$fa next $s $sym] + $fa !next $s $sym + } + } + foreach {s sym destinations} $trans { + foreach d $destinations { + $fa next $d $sym --> $s + } + } + return +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::complete {fa {sink {}}} { + if {[$fa is complete]} return + + # We have an incomplete FA. + + if {$sink eq ""} { + set sink [FindNewState $fa sink] + } elseif {[$fa state exists $sink]} { + return -code error "The chosen sink state exists already" + } + $fa state add $sink + + # Add transitions to it from all states which are not + # complete. The sink state itself loops on all inputs. IOW it is a + # non-useful state. + + set symbols [$fa symbols] + foreach sym $symbols { + $fa next $sink $sym --> $sink + } + + if {[$fa is epsilon-free]} { + foreach s [$fa states] { + foreach missing [struct::set difference \ + $symbols \ + [$fa symbols@ $s]] { + $fa next $s $missing --> $sink + } + } + } else { + # For an FA with epsilon-transitions we cannot simply look at + # the direct transitions to find the used symbols. We have to + # determine this for the epsilon-closure of the state in + # question. Oh, and we have to defer actually adding the + # transitions after we have picked them all, or otherwise the + # newly added transitions throw the symbol calculations for + # epsilon closures off. + + set new {} + foreach s [$fa states] { + foreach missing [struct::set difference \ + $symbols \ + [$fa symbols@set [$fa epsilon_closure $s]]] { + lappend new $s $missing + } + } + + foreach {s missing} $new { + $fa next $s $missing --> $sink + } + } + return +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::remove_eps {fa} { + # We eliminate all epsilon transitions by duplicating a number + # of regular transitions, which we get through the epsilon + # closure of the states having epsilon transitions. We do + # nothing if the FA is epsilon free to begin with. + + if {[$fa is epsilon-free]} return + + # Note: Epsilon transitions touching start and final states + # propagate the start markers forward and final markers + # backward. We do this first by propagating start markers twice, + # once with a reversed FA. This also gives us some + # epsilon-closures as well. + + foreach n {1 2} { + foreach s [$fa startstates] { + foreach e [$fa epsilon_closure $s] { + $fa start add $e + } + } + reverse $fa + } + + # Now duplicate all transitions which are followed or preceeded by + # epsilon transitions of any number greater than zero. + + # Note: The closure computations done by the FA are cached in the + # FA, so doing it multiple times is no big penalty. + + # FUTURE : Retrieve all transitions on one command. + + # FUTURE : Different algorithm ... + # Retrieve non-eps transitions for all states ... + # Iterate this list. Compute e-closures for endpoints, cache + # them. Duplicate the transition if needed, in that case add it to + # the end of the list, for possible more duplication (may touch + # different e-closures). Stop when the list is empty again. + + set changed 1 + while {$changed} { + set changed 0 + foreach s [$fa states] { + foreach sym [$fa symbols@ $s] { + set dest [$fa next $s $sym] + if {$sym eq ""} { + # Epsilon transitions. + + # Get the closure, and duplicate all transitions for all + # non-empty symbols as transitions of the original state. + # This may lead to parallel transitions between states, hence + # the catch. It prevents the generated error from stopping the + # action, and no actual parallel transitions are created. + + set clos [$fa epsilon_closure $s] + foreach csym [$fa symbols@set $clos] { + if {$csym eq ""} continue + foreach d [$fa nextset $clos $csym] { + if {![catch {$fa next $s $csym --> $d} msg]} { + set changed 1 + } + } + } + } else { + # Regular transition. Go through all destination + # states, compute their closures and replicate the + # transition if the closure contains more than the + # destination itself, to all states in the closure. + + foreach d $dest { + set clos [$fa epsilon_closure $d] + if {[llength $clos] > 1} { + foreach e $clos { + if {![catch {$fa next $s $sym --> $e}]} { + set changed 1 + } + } + } + } + } + } + } + } + + # At last, drop the epsilons for all states. Only now is this + # possible because otherwise we might compute bad epsilon + # closures in the previous loop. + + foreach s [$fa states] { + $fa !next $s "" + } + return +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::trim {fa {what !reachable|!useful}} { + # Remove various unwanted pices from the FA. + + switch -exact -- $what { + !reachable { + set remove [$fa unreachable_states] + } + !useful { + set remove [$fa unuseful_states] + } + !reachable&!useful - + !(reachable|useful) { + set remove [struct::set intersect [$fa unreachable_states] [$fa unuseful_states]] + } + !reachable|!useful - + !(reachable&useful) { + set remove [struct::set union [$fa unreachable_states] [$fa unuseful_states]] + } + default { + return -code error "Expected !reachable, !useful, !reachable&!useful, !(reachable|useful), !reachable|!useful, or !(reachable&useful), got \"$what\"" + } + } + + foreach s $remove { + $fa state delete $s + } + return +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::determinize {fa {mapvar {}} {idstart 0}} { + # We do the operation in several stages instead of jumping + # directly in the subset construction. Basically we try the less + # expensive operations first to see if they are enough. It does + # help that they will us also bring nearer to the ultimate goal + # even if they are not enough. + + set hasmap 0 + if {$mapvar ne ""} { + upvar 1 $mapvar map ; set hasmap 1 + } + + # First, is the input already deterministic ? + # There is nothing to do in that case. + + if {[$fa is deterministic]} { + if {$hasmap} {set map {}} + return + } + + # Second, trim unreachable and unuseables. We are done if only + # they carried the non-determinism. Otherwise we might have made + # the FA smaller and was less time consuming to convert. + + if {[llength [$fa startstates]]} {trim $fa !reachable} + if {[llength [$fa finalstates]]} {trim $fa !useful} + if {[$fa is deterministic]} { + if {$hasmap} {set map {}} + return + } + + # Third, remove any epsilon transitions, and stop if that was + # enough. Of course, weed out again states which have become + # irrelevant. The removal of the epsilons will at least ensure + # that the subset construction won't have to deal with + # closures. I.e. simpler. + + remove_eps $fa + if {[llength [$fa startstates]]} {trim $fa !reachable} + if {[llength [$fa finalstates]]} {trim $fa !useful} + if {[$fa is deterministic]} { + if {$hasmap} {set map {}} + return + } + + # Fourth. There is no way to avoid the subset construction. + # Dive in. This is the only part of the algorithm which requires + # us to keep a map. We construct the dfa in a transient container + # and copy the result back to fa when completed. + + array set subsets {} + set id $idstart + set pending {} + set dfa [[cons] %AUTO%] + # FUTURE : $dfa symbol set [$fa symbols] + foreach sym [$fa symbols] {$dfa symbol add $sym} + + # If we have start states we can initialize the algorithm with + # their set. Otherwise we have to the single-element sets of all + # states as the beginning. + + set starts [$fa startstates] + if {[llength $starts] > 0} { + # Make the set of start states the initial stae of the result. + + set starts [lsort $starts] ; # Sort to get canonical form. + $dfa state add $id + $dfa start add $id + + # The start may also be a final state + if {[$fa final?set $starts]} { + $dfa final add $id + } + + set subsets(dfa,$starts) $id + set subsets(nfa,$id) $starts + + lappend pending $id + incr id + } else { + # Convert all states of the input into sets (of one element) + # in the output. Do not forget to mark all final states we + # come by. No start states, otherwise we wouldn't be here. + + foreach s [$fa states] { + set nfaset [list $s] + + $dfa state add $id + if {[$fa final? $s]} { + $dfa final add $id + } + + set subsets(dfa,$nfaset) $id + set subsets(nfa,$id) $nfaset + lappend pending $id + incr id + } + } + + while {[llength $pending]} { + set dfastate [struct::list shift pending] + + # We have to compute the transition function for this dfa state. + + set nfaset $subsets(nfa,$dfastate) + + foreach sym [$fa symbols@set $nfaset] { + set nfanext [lsort [$fa nextset $nfaset $sym]] + + if {![info exists subsets(dfa,$nfanext)]} { + # Unknown destination. Add it as a new state. + + $dfa state add $id + if {[$fa final?set $nfanext]} { + $dfa final add $id + } + + set subsets(dfa,$nfanext) $id + set subsets(nfa,$id) $nfanext + + # Schedule the calculation of the transition function + # of the new state. + + lappend pending $id + incr id + } + + # Add the transition + $dfa next $dfastate $sym --> $subsets(dfa,$nfanext) + } + } + + if {[llength [$fa startstates]]} {trim $fa !reachable} + if {[llength [$fa finalstates]]} {trim $fa !useful} + + if {$hasmap} { + # The map is from new dfa states to the sets of nfa states. + + set map {} + foreach s [$dfa states] { + lappend map $s $subsets(nfa,$s) + } + } + + $fa = $dfa + $dfa destroy + + # ASSERT : $fa is deterministic + return +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::minimize {fa {mapvar {}}} { + # Brzozowski's method: + # Reverse, determinize, reverse again, determinize again. + + reverse $fa + determinize $fa mapa + reverse $fa + determinize $fa mapb + + if {$mapvar ne ""} { + upvar 1 $mapvar map + + if {![llength $mapa] && ![llength $mapb]} { + # No state reorganizations, signal up + set map {} + } elseif {[llength $mapa] && ![llength $mapb]} { + # Only one reorg, this is the combined reorg as well. + set map $mapa + } elseif {![llength $mapa] && [llength $mapb]} { + # Only one reorg, this is the combined reorg as well. + set map $mapb + } else { + # Two reorgs. Compose the maps into the final map signaled + # up. + + # mapb : final state -> set of states in mapa -> sets of original states. + + set map {} + array set tmp $mapa + foreach {b aset} $mapb { + set compose {} + foreach a $aset {foreach o $tmp($a) {lappend compose $o}} + lappend map $b [lsort -uniq $compose] + } + } + } + + # The FA is implicitly trimmed by the determinize's. + return +} + +# ### ### ### ######### ######### ######### +## API implementation. Language. + +proc ::grammar::fa::op::complement {fa} { + # Complementing is possible if and only if the FA is complete, + # and accomplished by swapping the final and non-final states. + + if {![$fa is complete]} { + return -code error "Unable to complement incomplete FA" + } + if {![$fa is deterministic]} { + return -code error "Unable to complement non-deterministic FA" + } + + set newfinal [struct::set difference [$fa states] [$fa finalstates]] + $fa final set $newfinal + return +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::kleene {fa} { + # The Kleene Closure of the FA makes no sense if we don't have + # start and final states we can work from. + + set start [$fa startstates] + set final [$fa finalstates] + + if {![llength $start] || ![llength $final]} { + return -code error "Unable to add Kleene's closure to a FA without start/final states" + } + + # FUTURE :: If final states have no outgoing transitions, and start + # FUTURE :: states have no input transitions, then place the new + # FUTURE :: transitions directly between start and final + # FUTURE :: states. In that case we don't need new states. + + # We need new start/final states, like for optional (see below) + + set ns [NewState $fa s] + set nf [NewState $fa f] + + foreach s $start {$fa next $ns "" --> $s} + foreach f $final {$fa next $f "" --> $nf} + + $fa start clear ; $fa start add $ns + $fa final clear ; $fa final add $nf + + $fa next $ns "" --> $nf ; # Optionality + $fa next $nf "" --> $ns ; # Loop for closure + return +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::optional {fa} { + # The Optionality of the FA makes no sense if we don't have + # start and final states we can work from. + + set start [$fa startstates] + set final [$fa finalstates] + + if {![llength $start] || ![llength $final]} { + return -code error "Unable to make a FA without start/final states optional" + } + + # We have to introduce new start and final states to ensure + # that we do not get additional recognized words from the FA + # due to epsilon transitions. IOW just placing epsilons from + # all start to all final states is wrong. Consider unreachable + # final states, they become reachable. Or final states able to + # reach final states from. Again the epsilons would extend the + # language. We have to detach our optional epsilon from anything + # in the existing start/final states. Hence the new start/final. + + # FUTURE : Recognize if there are no problems with placing direct + # FUTURE : epsilons from start to final. + + set ns [NewState $fa s] + set nf [NewState $fa f] + + foreach s $start {$fa next $ns "" --> $s} + foreach f $final {$fa next $f "" --> $nf} + + $fa start clear ; $fa start add $ns + $fa final clear ; $fa final add $nf + + $fa next $ns "" --> $nf ; # This is the transition which creates the optionality. + return +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::union {fa fb {mapvar {}}} { + # We union the input symbols, then add the states and + # transitions of the second FA to the first, adding in + # epsilons for the start and final states as well. When + # adding states we make sure that the new states do not + # intersect with the existing states. + + struct::list assign \ + [MergePrepare $fa $fb union smap] \ + astart afinal bstart bfinal + + if {$mapvar ne ""} { + upvar 1 $mapvar map + set map $smap + } + + # And now the new start & final states + + set ns [NewState $fa s] + set nf [NewState $fa f] + + eLink1N $fa $ns $astart + eLink1N $fa $ns $bstart + + eLinkN1 $fa $afinal $nf + eLinkN1 $fa $bfinal $nf + + $fa start clear ; $fa start add $ns + $fa final clear ; $fa final add $nf + return +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::intersect {fa fb {mapvar {}} {idstart 0}} { + # Intersection has to run the two automata in parallel, using + # paired states. If we have start states we begin the + # construction with them. This leads to a smaller result as we + # do not have create a full cross-crossproduct. The latter is + # unfortunately required if there are no start states. + + struct::list assign [CrossPrepare $fa $fb intersection] tmp res + + # The start states of the new FA consist of the cross-product of + # the start states of fa with fb. These are also the states used + # to seed DoCross. + + set id $idstart + set smap {} + set bstart [$tmp startstates] + foreach a [$fa startstates] { + foreach b $bstart { + set pair [list $a $b] + lappend smap $id $pair + lappend pending $pair $id + $res state add $id + $res start add $id + incr id + } + } + + set cp [DoCross $fa $tmp $res $id $pending smap] + + foreach {id pair} $smap { + struct::list assign $pair a b + if {[$fa final? $a] && [$tmp final? $b]} { + $res final add $id + } + } + + # Remove excess states (generated because of the sinks). + trim $res + if {$mapvar ne ""} { + upvar 1 $mapvar map + # The loop is required to filter out the mappings for all + # states which were trimmed off. + set map {} + foreach {id pair} $smap { + if {![$res state exists $id]} continue + lappend map $id $pair + } + } + + # Copy result into permanent storage and delete all intermediaries + $fa = $res + $res destroy + if {$tmp ne $fb} {$tmp destroy} + return +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::difference {fa fb {mapvar {}}} { + # Difference has to run the two automata in parallel, using + # paired states. Only the final states are defined differently + # than for intersection. It has to be final in fa and _not_ final + # in fb to be a final state of the result. <=> Accepted by A, but + # not B, to be in the difference. + + struct::list assign [CrossPrepare $fa $fb difference] tmp res + + # The start states of the new FA consist of the cross-product of + # the start states of fa with fb. These are also the states used + # to seed DoCross. + + set id 0 + set smap {} + set bstart [$tmp startstates] + foreach a [$fa startstates] { + foreach b $bstart { + set pair [list $a $b] + lappend smap $id $pair + lappend pending $pair $id + $res state add $id + $res start add $id + incr id + } + } + + set cp [DoCross $fa $tmp $res $id $pending smap] + + foreach {id pair} $smap { + struct::list assign $pair a b + if {[$fa final? $a] && ![$tmp final? $b]} { + $res final add $id + } + } + + # Remove excess states (generated because of the sinks). + trim $res + if {$mapvar ne ""} { + upvar 1 $mapvar map + # The loop is required to filter out the mappings for all + # states which were trimmed off. + set map {} + foreach {id pair} $smap { + if {![$res state exists $id]} continue + lappend map $id $pair + } + } + + # Copy result into permanent storage and delete all intermediaries + $fa = $res + $res destroy + if {$tmp ne $fb} {$tmp destroy} + return +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::concatenate {fa fb {mapvar {}}} { + # Like union, only the interconnect between existing and new FA is different. + + struct::list assign \ + [MergePrepare $fa $fb concatenate smap] \ + astart afinal bstart bfinal + + if {$mapvar ne ""} { + upvar 1 $mapvar map + set map $smap + } + + set ns [NewState $fa s] + set nm [NewState $fa m] ;# Midpoint. + set nf [NewState $fa f] + + eLink1N $fa $ns $astart + eLinkN1 $fa $afinal $nm + + eLink1N $fa $nm $bstart + eLinkN1 $fa $bfinal $nf + + $fa start clear ; $fa start add $ns + $fa final clear ; $fa final add $nf + return +} + +# ### ### ### ######### ######### ######### +## API implementation. Compilation (regexp -> FA). + +proc ::grammar::fa::op::fromRegex {fa regex {over {}}} { + # Convert a regular expression into a FA. The regex is given as + # parse tree in the form of a nested list. + + # {. A B ...} ... Concatenation (accepts zero|one arguments). + # {| A B ...} ... Alternatives (accepts zero|one arguments). + # {? A} ... Optional. + # {* A} ... Kleene. + # {+ A} ... Pos.Kleene. + # {! A} ... Complement/Negation. + # {S Symbol} ... Atom, Symbol + # + # Recursive descent with a helper ... + + if {![llength $regex]} { + $fa clear + return + } + + set tmp [[cons] %AUTO%] + + if {![llength $over]} { + set over [lsort -uniq [RESymbols $regex]] + } + foreach sym $over { + $tmp symbol add $sym + } + + set id 0 + struct::list assign [Regex $tmp $regex id] s f + $tmp start set [list $s] + $tmp final set [list $f] + + $fa = $tmp + $tmp destroy + return +} + +# ### ### ### ######### ######### ######### +## Internal helpers. + +proc ::grammar::fa::op::RESymbols {regex} { + set cmd [lindex $regex 0] + switch -exact -- $cmd { + ? - * - ! - + { + return [RESymbols [lindex $regex 1]] + } + . - | - & { + set res {} + foreach sub [lrange $regex 1 end] { + foreach sym [RESymbols $sub] {lappend res $sym} + } + return $res + } + S { + return [list [lindex $regex 1]] + } + default { + return -code error "Expected . ! ? * | &, or S, got \"$cmd\"" + } + } +} + +proc ::grammar::fa::op::Regex {fa regex idvar} { + upvar 1 $idvar id + set cmd [lindex $regex 0] + switch -exact -- $cmd { + ? { + # Optional + set a $id ; incr id ; $fa state add $a + set b $id ; incr id ; $fa state add $b + + struct::list assign [Regex $fa [lindex $regex 1] id] s f + $fa next $a "" --> $s + $fa next $f "" --> $b + $fa next $a "" --> $b + } + * { + # Kleene + set a $id ; incr id ; $fa state add $a + set b $a + + struct::list assign [Regex $fa [lindex $regex 1] id] s f + $fa next $a "" --> $s + $fa next $f "" --> $a ;# == b + } + + { + # Pos. Kleene + set a $id ; incr id ; $fa state add $a + set b $id ; incr id ; $fa state add $b + + struct::list assign [Regex $fa [lindex $regex 1] id] s f + $fa next $a "" --> $s + $fa next $f "" --> $b + $fa next $b "" --> $a + } + ! { + # Complement. + # Build up in a temp FA, complement, and + # merge nack into the current + + set a $id ; incr id ; $fa state add $a + set b $id ; incr id ; $fa state add $b + + set tmp [[cons] %AUTO%] + foreach sym [$fa symbols] {$tmp symbol add $sym} + struct::list assign [Regex $tmp [lindex $regex 1] id] s f + $tmp start add $s + $tmp final add $f + + determinize $tmp {} $id + incr id [llength [$tmp states]] + if {![$tmp is complete]} { + complete $tmp $id + incr id + } + complement $tmp + + # Merge and link. + $fa deserialize_merge [$tmp serialize] + + eLink1N $fa $a [$tmp startstates] + eLinkN1 $fa [$tmp finalstates] $b + $tmp destroy + } + & { + # Intersection ... /And + + if {[llength $regex] < 3} { + # Optimized path. Intersection of one sub-expression + # is the sub-expression itself. + + struct::list assign [Regex $fa [lindex $regex 1] id] a b + } else { + set a $id ; incr id ; $fa state add $a + set b $id ; incr id ; $fa state add $b + + set tmp [[cons] %AUTO%] + foreach sym [$fa symbols] {$tmp symbol add $sym} + set idsub 0 + struct::list assign [Regex $tmp [lindex $regex 1] idsub] s f + $tmp start add $s + $tmp final add $f + + set beta [[cons] %AUTO%] + foreach sub [lrange $regex 2 end] { + foreach sym [$fa symbols] {$beta symbol add $sym} + struct::list assign [Regex $beta $sub idsub] s f + $beta start add $s + $beta final add $f + intersect $tmp $beta {} $id + } + $beta destroy + determinize $tmp {} $id + incr id [llength [$tmp states]] + + # Merge and link. + $fa deserialize_merge [$tmp serialize] + + eLink1N $fa $a [$tmp startstates] + eLinkN1 $fa [$tmp finalstates] $b + $tmp destroy + } + } + . { + # Concatenation ... + + if {[llength $regex] == 1} { + # Optimized path. No sub-expressions. This represents + # language containing only the empty string, aka + # epsilon. + + set a $id ; incr id ; $fa state add $a + set b $id ; incr id ; $fa state add $b + $fa next $a "" --> $b + + } elseif {[llength $regex] == 2} { + # Optimized path. Concatenation of one sub-expression + # is the sub-expression itself. + + struct::list assign [Regex $fa [lindex $regex 1] id] a b + } else { + set first 1 + set last {} + foreach sub [lrange $regex 1 end] { + struct::list assign [Regex $fa $sub id] s f + if {$first} {set first 0 ; set a $s} + if {$last != {}} { + $fa next $last "" --> $s + } + set last $f + } + set b $f + } + } + | { + # Alternatives ... (Union) + + if {[llength $regex] == 1} { + # Optimized path. No sub-expressions. This represents + # the empty language, i.e. the language without words. + + set a $id ; incr id ; $fa state add $a + set b $id ; incr id ; $fa state add $b + + } elseif {[llength $regex] == 2} { + # Optimized path. Choice/Union of one sub-expression + # is the sub-expression itself. + + struct::list assign [Regex $fa [lindex $regex 1] id] a b + } else { + set a $id ; incr id ; $fa state add $a + set b $id ; incr id ; $fa state add $b + foreach sub [lrange $regex 1 end] { + struct::list assign [Regex $fa $sub id] s f + $fa next $a "" --> $s + $fa next $f "" --> $b + } + } + } + S { + # Atom, base transition. + set sym [lindex $regex 1] + set a $id ; incr id ; $fa state add $a + set b $id ; incr id ; $fa state add $b + $fa next $a $sym --> $b + } + default { + return -code error "Expected . ! ? * | &, or S, got \"$cmd\"" + } + } + return [list $a $b] +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::CrossPrepare {fa fb label} { + set starta [$fa startstates] + set finala [$fa finalstates] + set startb [$fb startstates] + set finalb [$fb finalstates] + if { + ![llength $starta] || ![llength $finala] || + ![llength $startb] || ![llength $finalb] + } { + return -code error "Unable to perform the $label of two FAs without start/final states" + } + + # The inputs are made complete over the union of their symbol + # sets. A temp. container is used for the second input if necessary. + + set totals [struct::set union [$fa symbols] [$fb symbols]] + foreach sym [struct::set difference $totals [$fa symbols]] { + $fa symbol add $sym + } + if {![$fa is epsilon-free]} { + remove_eps $fa + trim $fa + } + if {![$fa is complete]} { + complete $fa + } + set tmp $fb + set bnew [struct::set difference $totals [$fb symbols]] + if {[llength $bnew]} { + set tmp [[cons] %AUTO% = $fb] + foreach sym $bnew { + $tmp symbol add $sym + } + } + if {![$fb is epsilon-free]} { + if {$tmp eq $fb} {set tmp [[cons] %AUTO% = $fb]} + remove_eps $tmp + trim $tmp + } + if {![$fb is complete]} { + if {$tmp eq $fb} {set tmp [[cons] %AUTO% = $fb]} + complete $tmp + } + + set res [[cons] %AUTO%] + foreach sym $totals { + $res symbol add $sym + } + + return [list $tmp $res] +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::DoCross {fa fb res id seed smapvar} { + upvar 1 $smapvar smap + + set symbols [$fa symbols] + array set tmp $seed + + set pending $seed + while {[llength $pending]} { + set cpair [struct::list shift pending] + set cid [struct::list shift pending] + + struct::list assign $cpair a b + + # ASSERT: /res state exists /cid + + # Generate the transitions for the pair, add the resulting + # destinations to the FA, and schedule them for a visit if + # they are new. + + foreach sym $symbols { + set adestinations [$fa next $a $sym] + set bdestinations [$fb next $b $sym] + + foreach ad $adestinations { + foreach bd $bdestinations { + set dest [list $ad $bd] + + if {![info exists tmp($dest)]} { + $res state add $id + lappend smap $id $dest + lappend pending $dest $id + set tmp($dest) $id + incr id + } + $res next $cid $sym --> $tmp($dest) + } + } + } + } + return +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::MergePrepare {fa fb label mapvar} { + upvar 1 $mapvar map + + set starta [$fa startstates] + set finala [$fa finalstates] + set startb [$fb startstates] + set finalb [$fb finalstates] + if { + ![llength $starta] || ![llength $finala] || + ![llength $startb] || ![llength $finalb] + } { + return -code error "Unable to $label FAs without start/final states" + } + + # FUTURE: add {*}[symbols], ignore dup's + foreach sym [$fb symbols] {catch {$fa symbol add $sym}} + + set dup [struct::set intersect [$fa states] [$fb states]] + if {![llength $dup]} { + # The states do not overlap. A plain merge of fb is enough to + # copy the information. + + $fa deserialize_merge [$fb serialize] + set map {} + } else { + # We have duplicate states, therefore we have to remap fb to + # prevent interference between the two. + + set map {} + set tmp [[cons] %AUTO% = $fb] + set id 0 + foreach s $dup { + # The renaming process has to ensure that the new name is + # in neither fa, nor already in fb as well. + while { + [$fa state exists $id] || + [$tmp state exists $id] + } {incr id} + $tmp state rename $s $id + lappend map $id $s + incr id + } + + set startb [$tmp startstates] + set finalb [$tmp finalstates] + + $fa deserialize_merge [$tmp serialize] + $tmp destroy + } + + return [list $starta $finala $startb $finalb] +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::eLink1N {fa from states} { + foreach s $states { + $fa next $from "" --> $s + } + return +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::eLinkN1 {fa states to} { + foreach s $states { + $fa next $s "" --> $to + } + return +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::NewState {fa prefix} { + set newstate [FindNewState $fa $prefix] + $fa state add $newstate + return $newstate +} + +# --- --- --- --------- --------- --------- + +proc ::grammar::fa::op::FindNewState {fa prefix} { + #if {![$fa state exists $prefix]} {return $prefix} + set n 0 + while {[$fa state exists ${prefix}.$n]} {incr n} + return ${prefix}.$n +} + +# ### ### ### ######### ######### ######### +## API implementation. Decompilation (FA -> regexp). + +proc ::grammar::fa::op::toRegexp {fa} { + # NOTE: FUTURE - Do not go through the serialization, nor through + # a matrix. The algorithm can be expressed more directly as + # operations on the automaton (states and transitions). + + set ET [ser_to_ematrix [$fa serialize]] + while {[llength $ET] > 2} { + set ET [matrix_drop_state $ET] + } + return [lindex $ET 0 1] +} + +proc ::grammar::fa::op::toRegexp2 {fa} { + # NOTE: FUTURE - See above. + set ET [ser_to_ematrix [$fa serialize]] + while {[llength $ET] > 2} { + set ET [matrix_drop_state $ET re2] + } + return [lindex $ET 0 1] +} + +# ### ### ### ######### ######### ######### +## Internal helpers. + +proc ::grammar::fa::op::ser_to_ematrix {ser} { + if {[lindex $ser 0] ne "grammar::fa"} then { + error "Expected grammar::fa automaton serialisation" + } + set stateL {} + set n 2; foreach {state des} [lindex $ser 2] { + lappend stateL $state + set N($state) $n + incr n + } + set row0 {} + for {set k 0} {$k<$n} {incr k} {lappend row0 [list |]} + set res [list $row0 $row0] + foreach {from des} [lindex $ser 2] { + set row [lrange $row0 0 1] + if {[lindex $des 0]} then {lset res 0 $N($from) [list .]} + if {[lindex $des 1]} then {lset row 1 [list .]} + foreach to $stateL {set S($to) [list |]} + foreach {symbol targetL} [lindex $des 2] { + if {$symbol eq ""} then { + set atom [list .] + } else { + set atom [list S $symbol] + } + foreach to $targetL {lappend S($to) $atom} + } + foreach to $stateL { + if {[llength $S($to)] == 2} then { + lappend row [lindex $S($to) 1] + } else { + lappend row $S($to) + } + } + lappend res $row + } + return $res +} + +proc ::grammar::fa::op::matrix_drop_state {T_in {ns re1}} { + set sumcmd ${ns}::| + set prodcmd ${ns}::. + set T1 {} + set lastcol {} + foreach row $T_in { + lappend T1 [lreplace $row end end] + lappend lastcol [lindex $row end] + } + set lastrow [lindex $T1 end] + set T1 [lreplace $T1 end end] + set b [${ns}::* [lindex $lastcol end]] + set lastcol [lreplace $lastcol end end] + set res {} + foreach row $T1 a $lastcol { + set newrow {} + foreach pos $row c $lastrow { + lappend newrow [$sumcmd $pos [$prodcmd $a $b $c]] + } + lappend res $newrow + } + return $res +} + +# ### ### ### ######### ######### ######### +## Internal helpers. Regexp simplification I. + +namespace eval ::grammar::fa::op::re1 { + namespace export | . {\*} +} + +proc ::grammar::fa::op::re1::| {args} { + set L {} + + # | = Choices. + # Sub-choices are lifted into the top expression (foreach). + # Identical choices are reduced to a single term (lsort -uniq). + + foreach re $args { + switch -- [lindex $re 0] "|" { + foreach term [lrange $re 1 end] {lappend L $term} + } default { + lappend L $re + } + } + set L [lsort -unique $L] + if {[llength $L] == 1} then { + return [lindex $L 0] + } else { + return [linsert $L 0 |] + } +} + +proc ::grammar::fa::op::re1::. {args} { + set L {} + + # . = Sequence. + # One element sub-choices are lifted into the top expression. + # Sub-sequences are lifted into the top expression. + + foreach re $args { + switch -- [lindex $re 0] "." { + foreach term [lrange $re 1 end] {lappend L $term} + } "|" { + if {[llength $re] == 1} then {return $re} + lappend L $re + } default { + lappend L $re + } + } + if {[llength $L] == 1} then { + return [lindex $L 0] + } else { + return [linsert $L 0 .] + } +} + +proc ::grammar::fa::op::re1::* {re} { + # * = Kleene closure. + # Sub-closures are lifted into the top expression. + # One-element sub-(choices,sequences) are lifted into the top expression. + + switch -- [lindex $re 0] "|" - "." { + if {[llength $re] == 1} then { + return [list .] + } else { + return [list * $re] + } + } "*" { + return $re + } default { + return [list * $re] + } +} + +# ### ### ### ######### ######### ######### +## Internal helpers. Regexp simplification II. + +namespace eval ::grammar::fa::op::re2 { + # Inherit choices and kleene-closure from the basic simplifier. + + namespace import [namespace parent]::re1::| + namespace import [namespace parent]::re1::\\* +} + +proc ::grammar::fa::op::re2::. {args} { + + # . = Sequences + # Sub-sequences are lifted into the top expression. + # Sub-choices are multiplied out. + # <Example a(b|c) => ab|ac > + + set L {} + set n -1 + foreach re $args { + incr n + switch -- [lindex $re 0] "." { + foreach term [lrange $re 1 end] {lappend L $term} + } "|" { + set res [list |] + set L2 [lreplace $args 0 $n] + foreach term [lrange $re 1 end] { + lappend res [eval [list .] $L [list $term] $L2] + } + return [eval $res] + } default { + lappend L $re + } + } + if {[llength $L] == 1} then { + return [lindex $L 0] + } else { + return [linsert $L 0 .] + } +} + +# ### ### ### ######### ######### ######### +## API. Simplification of regular expressions. + +proc ::grammar::fa::op::simplifyRegexp {RE0} { + set RE1 [namespace inscope nonnull $RE0] + if {[lindex $RE1 0] eq "S" || $RE1 eq "." || $RE1 eq "|"} then { + return $RE1 + } + set tmp [grammar::fa %AUTO% fromRegex $RE1] + $tmp minimize + set RE1 [toRegexp $tmp] + $tmp destroy + if {[string length $RE1] < [string length $RE0]} then { + set RE0 $RE1 + } + if {[lindex $RE0 0] eq "S"} then {return $RE0} + set res [lrange $RE0 0 0] + foreach branch [lrange $RE0 1 end] { + lappend res [simplifyRegexp $branch] + } + return $res +} + +# ### ### ### ######### ######### ######### +## Internal helpers. + +namespace eval ::grammar::fa::op::nonnull {} + +proc ::grammar::fa::op::nonnull::| {args} { + set also_empty false + set res [list |] + foreach branch $args { + set RE [eval $branch] + if {[lindex $RE 0] eq "?"} then { + set also_empty true + set RE [lindex $RE 1] + } + switch -- [lindex $RE 0] "|" { + eval [lreplace $RE 0 0 lappend res] + } "." { + if {[llength $RE] == 1} then { + set also_empty true + } else { + lappend res $RE + } + } default { + lappend res $RE + } + } + if {!$also_empty} then {return $res} + foreach branch [lrange $res 1 end] { + if {[lindex $branch 0] eq "*"} then {return $res} + } + if {[llength $res] == 1} then { + return [list .] + } elseif {[llength $res] == 2} then { + return [lreplace $res 0 0 ?] + } else { + return [list ? $res] + } +} + +proc ::grammar::fa::op::nonnull::. {args} { + set res [list .] + foreach branch $args { + set RE [eval $branch] + switch -- [lindex $RE 0] "|" { + if {[llength $RE] == 1} then {return $RE} + lappend res $RE + } "." { + eval [lreplace $RE 0 0 lappend res] + } default { + lappend res $RE + } + } + return $res +} + +proc ::grammar::fa::op::nonnull::* {sub} { + set RE [eval $sub] + switch -- [lindex $RE 0] "*" - "?" - "+" { + return [lreplace $RE 0 0 *] + } default { + return [list * $RE] + } +} + +proc ::grammar::fa::op::nonnull::+ {sub} { + set RE [eval $sub] + switch -- [lindex $RE 0] "+" { + return $RE + } "*" - "?" { + return [lreplace $RE 0 0 *] + } default { + return [list * $RE] + } +} + +proc ::grammar::fa::op::nonnull::? {sub} { + set RE [eval $sub] + switch -- [lindex $RE 0] "?" - "*" { + return $RE + } "+" { + return [lreplace $RE 0 0 *] + } default { + return [list ? $RE] + } +} + +proc ::grammar::fa::op::nonnull::S {name} { + return [list S $name] +} + +# ### ### ### ######### ######### ######### +## API. Translate RE of this package to Tcl REs + +proc ::grammar::fa::op::toTclRegexp {re symdict} { + return [lindex [namespace inscope tclre $re $symdict] 1] +} + +# ### ### ### ######### ######### ######### +## Internal helpers. + +namespace eval ::grammar::fa::op::tclre {} + +proc ::grammar::fa::op::tclre::S {name dict} { + array set A $dict + if {[info exists A($name)]} then { + return $A($name) + } elseif {[string length $name] == 1} then { + if {[regexp {[\\\[\]{}.()*+?^$]} $name]} then { + return [list char \\$name] + } else { + return [list char $name] + } + } else { + return [list class "\[\[:${name}:\]\]"] + } +} + +proc ::grammar::fa::op::tclre::. {args} { + set suffix [lrange $args end end] + set L {} + foreach factor [lrange $args 0 end-1] { + set pair [eval $factor $suffix] + switch -- [lindex $pair 0] "sum" { + lappend L ([lindex $pair 1]) + } default { + lappend L [lindex $pair 1] + } + } + return [list prod [join $L ""]] +} + +proc ::grammar::fa::op::tclre::* {re dict} { + set pair [eval $re [list $dict]] + switch -- [lindex $pair 0] "sum" - "prod" { + return [list prod "([lindex $pair 1])*"] + } default { + return [list prod "[lindex $pair 1]*"] + } +} + +proc ::grammar::fa::op::tclre::+ {re dict} { + set pair [eval $re [list $dict]] + switch -- [lindex $pair 0] "sum" - "prod" { + return [list prod "([lindex $pair 1])+"] + } default { + return [list prod "[lindex $pair 1]+"] + } +} + +proc ::grammar::fa::op::tclre::? {re dict} { + set pair [eval $re [list $dict]] + switch -- [lindex $pair 0] "sum" - "prod" { + return [list prod "([lindex $pair 1])?"] + } default { + return [list prod "[lindex $pair 1]?"] + } +} + +proc ::grammar::fa::op::tclre::| {args} { + set suffix [lrange $args end end] + set charL {} + set classL {} + set prodL {} + foreach factor [lrange $args 0 end-1] { + set pair [eval $factor $suffix] + switch -- [lindex $pair 0] "char" { + lappend charL [lindex $pair 1] + } "class" { + lappend classL [string range [lindex $pair 1] 1 end-1] + } default { + lappend prodL [lindex $pair 1] + } + } + if {[llength $charL]>1 || [llength $classL]>0} then { + while {[set n [lsearch $charL -]] >= 0} { + lset charL $n {\-} + } + set bracket "\[[join $charL ""][join $classL ""]\]" + if {![llength $prodL]} then { + return [list atom $bracket] + } + lappend prodL $bracket + } else { + eval [list lappend prodL] $charL + } + return [list sum [join $prodL |]] +} + +proc ::grammar::fa::op::tclre::& {args} { + error "Cannot express language intersection in Tcl-RE's" + + # Note: This can be translated by constructing an automaton for + # the intersection, and then translating its conversion to a + # regular expression. +} + +proc ::grammar::fa::op::tclre::! {args} { + error "Cannot express language complementation in Tcl-RE's" + + # Note: This can be translated by constructing an automaton for + # the complement, and then translating its conversion to a regular + # expression. This however requires knowledge regarding the set of + # symbols. Large (utf-8) for Tcl regexes. +} + +# ### ### ### ######### ######### ######### + +proc ::grammar::fa::op::constructor {cmd} { + variable cons $cmd + return +} + +proc ::grammar::fa::op::cons {} { + variable cons + if {$cons ne ""} {return $cons} + return -code error "No constructor for FA container was established." +} + +# ### ### ### ######### ######### ######### +## Package Management + +package provide grammar::fa::op 0.4.1 |