# A parser for SGML, using the derived class as static DTD.

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).


import regex
import string


# Regular expressions used for parsing.
#
# These are patterns for the old `regex' module, so grouping is written
# \( ... \) and alternation is written \| ; a bare ( ) or | is a literal
# character.  Literals that contain backslashes are raw strings so that
# the backslash pairs reach the regex compiler untouched (the resulting
# string values are the same as with ordinary quotes).

# Either character that can begin markup: '&' or '<'.
interesting = regex.compile('[&<]')

# A '&' or '<' construct that may still be unfinished:
#   '&' optionally followed by a name or by '#' and digits, or
#   '<' optionally followed by one of
#         a letter and any run of non-bracket characters,
#         '/' optionally followed by a letter and non-bracket characters,
#         '!' and any run of non-bracket characters.
incomplete = regex.compile(r'&\([a-zA-Z][a-zA-Z0-9]*\|#[0-9]*\)?\|'
                           r'<\([a-zA-Z][^<>]*\|'
                           r'/\([a-zA-Z][^<>]*\)?\|'
                           r'![^<>]*\)?')

# '&name' plus the one non-alphanumeric character that ends it;
# group 1 is the name.
entityref = regex.compile(r'&\([a-zA-Z][a-zA-Z0-9]*\)[^a-zA-Z0-9]')

# '&#digits' plus the one non-digit character that ends it;
# group 1 is the digit string.
charref = regex.compile(r'&#\([0-9]+\)[^0-9]')

# '<' followed by '>' or a letter.
starttagopen = regex.compile('<[>a-zA-Z]')

# '<name/' -- the opening of a short tag.
shorttagopen = regex.compile('<[a-zA-Z][a-zA-Z0-9]*/')

# '<name/data/' -- group 1 is the name, group 2 the slash-free data.
shorttag = regex.compile(r'<\([a-zA-Z][a-zA-Z0-9]*\)/\([^/]*\)/')

# NOTE(review): the next three patterns (endtagopen, special and
# commentopen) were damaged in the copy this was restored from: the
# text running from '<' up to the following '>' had been removed from
# each literal, and the last literal was cut off entirely.  They have
# been reconstructed to parallel starttagopen and the '/' and '!'
# branches of `incomplete' above -- confirm against a pristine copy.

# '</' followed by '<', '>' or a letter.
endtagopen = regex.compile('</[<>a-zA-Z]')

# Either angle bracket.
endbracket = regex.compile('[<>]')

# '<!', any run of non-bracket characters, then '>'.
special = regex.compile('<![^<>]*>')

# '<!--' -- the opening of a comment.
commentopen = regex.compile('<!--')