summary | refs | log | tree | commit | diff | stats
path: root/Lib/sgmllib.py
diff options
context:
space:
mode:
author: Fred Drake <fdrake@acm.org>, 2006-06-29 00:51:53 (GMT)
committer: Fred Drake <fdrake@acm.org>, 2006-06-29 00:51:53 (GMT)
commit: a136210a9fe9aff79dd3457a53eae35a823f97fc (patch)
tree: 27a8c98aefe6b85b90ee2cb47cb2cad610ea7091 /Lib/sgmllib.py
parent: 960a3f88e5f49474f046d7e9bb9133f0ecbda5a5 (diff)
download: cpython-a136210a9fe9aff79dd3457a53eae35a823f97fc.zip
cpython-a136210a9fe9aff79dd3457a53eae35a823f97fc.tar.gz
cpython-a136210a9fe9aff79dd3457a53eae35a823f97fc.tar.bz2
SF bug #1504333: sgmllib should allow angle brackets in quoted values
(modified patch by Sam Ruby; changed to use separate REs for start and end tags to reduce matching cost for end tags; extended tests; updated to avoid breaking previous changes to support IPv6 addresses in unquoted attribute values)
Diffstat (limited to 'Lib/sgmllib.py')
-rw-r--r-- Lib/sgmllib.py 19
1 files changed, 10 insertions, 9 deletions
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py
index 3ab57c2..3020d11 100644
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@@ -29,7 +29,12 @@ starttagopen = re.compile('<[>a-zA-Z]')
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
piclose = re.compile('>')
-endbracket = re.compile('[<>]')
+starttag = re.compile(r'<[a-zA-Z][-_.:a-zA-Z0-9]*\s*('
+ r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
+ r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]'
+ r'[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?'
+ r')*\s*/?\s*(?=[<>])')
+endtag = re.compile(r'</?[a-zA-Z][-_.:a-zA-Z0-9]*\s*/?\s*(?=[<>])')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
@@ -249,14 +254,10 @@ class SGMLParser(markupbase.ParserBase):
self.finish_shorttag(tag, data)
self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
return k
- # XXX The following should skip matching quotes (' or ")
- # As a shortcut way to exit, this isn't so bad, but shouldn't
- # be used to locate the actual end of the start tag since the
- # < or > characters may be embedded in an attribute value.
- match = endbracket.search(rawdata, i+1)
+ match = starttag.match(rawdata, i)
if not match:
return -1
- j = match.start(0)
+ j = match.end(0)
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
if rawdata[i:i+2] == '<>':
@@ -305,10 +306,10 @@ class SGMLParser(markupbase.ParserBase):
# Internal -- parse endtag
def parse_endtag(self, i):
rawdata = self.rawdata
- match = endbracket.search(rawdata, i+1)
+ match = endtag.match(rawdata, i)
if not match:
return -1
- j = match.start(0)
+ j = match.end(0)
tag = rawdata[i+2:j].strip().lower()
if rawdata[j] == '>':
j = j+1