diff options
Diffstat (limited to 'Lib/sgmllib.py')
-rw-r--r-- | Lib/sgmllib.py | 99 |
1 files changed, 55 insertions, 44 deletions
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index 3e85a91..3020d11 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -29,11 +29,16 @@ starttagopen = re.compile('<[>a-zA-Z]') shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') piclose = re.compile('>') -endbracket = re.compile('[<>]') +starttag = re.compile(r'<[a-zA-Z][-_.:a-zA-Z0-9]*\s*(' + r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' + r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]' + r'[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?' + r')*\s*/?\s*(?=[<>])') +endtag = re.compile(r'</?[a-zA-Z][-_.:a-zA-Z0-9]*\s*/?\s*(?=[<>])') tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') attrfind = re.compile( r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?') + r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?') class SGMLParseError(RuntimeError): @@ -53,6 +58,10 @@ class SGMLParseError(RuntimeError): # self.handle_entityref() with the entity reference as argument. class SGMLParser(markupbase.ParserBase): + # Definition of entities -- derived classes may override + entity_or_charref = re.compile('&(?:' + '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)' + ')(;?)') def __init__(self, verbose=0): """Initialize and reset this instance.""" @@ -245,11 +254,10 @@ class SGMLParser(markupbase.ParserBase): self.finish_shorttag(tag, data) self.__starttag_text = rawdata[start_pos:match.end(1) + 1] return k - # XXX The following should skip matching quotes (' or ") - match = endbracket.search(rawdata, i+1) + match = starttag.match(rawdata, i) if not match: return -1 - j = match.start(0) + j = match.end(0) # Now parse the data between i+1 and j into a tag and attrs attrs = [] if rawdata[i:i+2] == '<>': @@ -274,32 +282,8 @@ class SGMLParser(markupbase.ParserBase): attrvalue[:1] == '"' == attrvalue[-1:]): # strip quotes attrvalue = attrvalue[1:-1] - l = 0 - new_attrvalue = '' - while l < len(attrvalue): - av_match = entityref.match(attrvalue, l) - if (av_match and av_match.group(1) in self.entitydefs and - attrvalue[av_match.end(1)] == ';'): - # only substitute entityrefs ending in ';' since - # otherwise we may break <a href='?p=x&q=y'> - # which is very common - new_attrvalue += self.entitydefs[av_match.group(1)] - l = av_match.end(0) - continue - ch_match = charref.match(attrvalue, l) - if ch_match: - try: - char = chr(int(ch_match.group(1))) - new_attrvalue += char - l = ch_match.end(0) - continue - except ValueError: - # invalid character reference, don't substitute - pass - # all other cases - new_attrvalue += attrvalue[l] - l += 1 - attrvalue = new_attrvalue + attrvalue = self.entity_or_charref.sub( + self._convert_ref, attrvalue) attrs.append((attrname.lower(), attrvalue)) k = match.end(0) if rawdata[j] == '>': @@ -308,13 +292,24 @@ class SGMLParser(markupbase.ParserBase): self.finish_starttag(tag, attrs) return j + # Internal -- convert entity or character reference + def _convert_ref(self, match): + if match.group(2): + return self.convert_charref(match.group(2)) or \ + '&#%s%s' % match.groups()[1:] + elif match.group(3): + return self.convert_entityref(match.group(1)) or \ + '&%s;' % match.group(1) + else: + return '&%s' % match.group(1) + # Internal -- parse endtag def parse_endtag(self, i): rawdata = self.rawdata - match = endbracket.search(rawdata, i+1) + match = endtag.match(rawdata, i) if not match: return -1 - j = match.start(0) + j = match.end(0) tag = rawdata[i+2:j].strip().lower() if rawdata[j] == '>': j = j+1 @@ -391,35 +386,51 @@ class SGMLParser(markupbase.ParserBase): print '*** Unbalanced </' + tag + '>' print '*** Stack:', self.stack - def handle_charref(self, name): - """Handle character reference, no need to override.""" + def convert_charref(self, name): + """Convert character reference, may be overridden.""" try: n = int(name) except ValueError: - self.unknown_charref(name) return if not 0 <= n <= 255: - self.unknown_charref(name) return - self.handle_data(chr(n)) + return self.convert_codepoint(n) + + def convert_codepoint(self, codepoint): + return chr(codepoint) + + def handle_charref(self, name): + """Handle character reference, no need to override.""" + replacement = self.convert_charref(name) + if replacement is None: + self.unknown_charref(name) + else: + self.handle_data(replacement) # Definition of entities -- derived classes may override entitydefs = \ {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''} - def handle_entityref(self, name): - """Handle entity references. + def convert_entityref(self, name): + """Convert entity references. - There should be no need to override this method; it can be - tailored by setting up the self.entitydefs mapping appropriately. + As an alternative to overriding this method; one can tailor the + results by setting up the self.entitydefs mapping appropriately. """ table = self.entitydefs if name in table: - self.handle_data(table[name]) + return table[name] else: - self.unknown_entityref(name) return + def handle_entityref(self, name): + """Handle entity references, no need to override.""" + replacement = self.convert_entityref(name) + if replacement is None: + self.unknown_entityref(name) + else: + self.handle_data(self.convert_entityref(name)) + # Example -- handle data, should be overridden def handle_data(self, data): pass |