summary | refs | log | tree | commit | diff | stats
path: root/Lib/sgmllib.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/sgmllib.py')
-rw-r--r-- Lib/sgmllib.py 99
1 files changed, 55 insertions, 44 deletions
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py
index 3e85a91..3020d11 100644
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@@ -29,11 +29,16 @@ starttagopen = re.compile('<[>a-zA-Z]')
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
piclose = re.compile('>')
-endbracket = re.compile('[<>]')
+starttag = re.compile(r'<[a-zA-Z][-_.:a-zA-Z0-9]*\s*('
+ r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
+ r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]'
+ r'[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?'
+ r')*\s*/?\s*(?=[<>])')
+endtag = re.compile(r'</?[a-zA-Z][-_.:a-zA-Z0-9]*\s*/?\s*(?=[<>])')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
- r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
+ r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
class SGMLParseError(RuntimeError):
@@ -53,6 +58,10 @@ class SGMLParseError(RuntimeError):
# self.handle_entityref() with the entity reference as argument.
class SGMLParser(markupbase.ParserBase):
+ # Pattern matching an entity or character reference -- derived classes may override
+ entity_or_charref = re.compile('&(?:'
+ '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
+ ')(;?)')
def __init__(self, verbose=0):
"""Initialize and reset this instance."""
@@ -245,11 +254,10 @@ class SGMLParser(markupbase.ParserBase):
self.finish_shorttag(tag, data)
self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
return k
- # XXX The following should skip matching quotes (' or ")
- match = endbracket.search(rawdata, i+1)
+ match = starttag.match(rawdata, i)
if not match:
return -1
- j = match.start(0)
+ j = match.end(0)
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
if rawdata[i:i+2] == '<>':
@@ -274,32 +282,8 @@ class SGMLParser(markupbase.ParserBase):
attrvalue[:1] == '"' == attrvalue[-1:]):
# strip quotes
attrvalue = attrvalue[1:-1]
- l = 0
- new_attrvalue = ''
- while l < len(attrvalue):
- av_match = entityref.match(attrvalue, l)
- if (av_match and av_match.group(1) in self.entitydefs and
- attrvalue[av_match.end(1)] == ';'):
- # only substitute entityrefs ending in ';' since
- # otherwise we may break <a href='?p=x&q=y'>
- # which is very common
- new_attrvalue += self.entitydefs[av_match.group(1)]
- l = av_match.end(0)
- continue
- ch_match = charref.match(attrvalue, l)
- if ch_match:
- try:
- char = chr(int(ch_match.group(1)))
- new_attrvalue += char
- l = ch_match.end(0)
- continue
- except ValueError:
- # invalid character reference, don't substitute
- pass
- # all other cases
- new_attrvalue += attrvalue[l]
- l += 1
- attrvalue = new_attrvalue
+ attrvalue = self.entity_or_charref.sub(
+ self._convert_ref, attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = match.end(0)
if rawdata[j] == '>':
@@ -308,13 +292,24 @@ class SGMLParser(markupbase.ParserBase):
self.finish_starttag(tag, attrs)
return j
+ # Internal -- convert entity or character reference
+ def _convert_ref(self, match):
+ if match.group(2):
+ return self.convert_charref(match.group(2)) or \
+ '&#%s%s' % match.groups()[1:]
+ elif match.group(3):
+ return self.convert_entityref(match.group(1)) or \
+ '&%s;' % match.group(1)
+ else:
+ return '&%s' % match.group(1)
+
# Internal -- parse endtag
def parse_endtag(self, i):
rawdata = self.rawdata
- match = endbracket.search(rawdata, i+1)
+ match = endtag.match(rawdata, i)
if not match:
return -1
- j = match.start(0)
+ j = match.end(0)
tag = rawdata[i+2:j].strip().lower()
if rawdata[j] == '>':
j = j+1
@@ -391,35 +386,51 @@ class SGMLParser(markupbase.ParserBase):
print '*** Unbalanced </' + tag + '>'
print '*** Stack:', self.stack
- def handle_charref(self, name):
- """Handle character reference, no need to override."""
+ def convert_charref(self, name):
+ """Convert character reference, may be overridden."""
try:
n = int(name)
except ValueError:
- self.unknown_charref(name)
return
if not 0 <= n <= 255:
- self.unknown_charref(name)
return
- self.handle_data(chr(n))
+ return self.convert_codepoint(n)
+
+ def convert_codepoint(self, codepoint):
+ return chr(codepoint)
+
+ def handle_charref(self, name):
+ """Handle character reference, no need to override."""
+ replacement = self.convert_charref(name)
+ if replacement is None:
+ self.unknown_charref(name)
+ else:
+ self.handle_data(replacement)
# Definition of entities -- derived classes may override
entitydefs = \
{'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
- def handle_entityref(self, name):
- """Handle entity references.
+ def convert_entityref(self, name):
+ """Convert entity references.
- There should be no need to override this method; it can be
- tailored by setting up the self.entitydefs mapping appropriately.
+ As an alternative to overriding this method, one can tailor the
+ results by setting up the self.entitydefs mapping appropriately.
"""
table = self.entitydefs
if name in table:
- self.handle_data(table[name])
+ return table[name]
else:
- self.unknown_entityref(name)
return
+    def handle_entityref(self, name):
+        """Handle entity references, no need to override.
+
+        The name is converted with convert_entityref().  A result of
+        None is reported through unknown_entityref(); any other result
+        is passed to handle_data().
+        """
+        replacement = self.convert_entityref(name)
+        if replacement is None:
+            self.unknown_entityref(name)
+        else:
+            # Reuse the value computed above rather than converting the
+            # entity a second time.
+            self.handle_data(replacement)
+
# Example -- handle data, should be overridden
def handle_data(self, data):
pass