summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Fred Drake <fdrake@acm.org> 2001-09-24 20:15:51 (GMT)
committer Fred Drake <fdrake@acm.org> 2001-09-24 20:15:51 (GMT)
commit a3bae3369cca83ee485ed3293e2ad4c53691ba1a (patch)
tree e6fbd7fcc8376e81eb8752ecab218ef498843771
parent bfc8fea1e0c46bc0a337237c32b8c1a32985c144 (diff)
download cpython-a3bae3369cca83ee485ed3293e2ad4c53691ba1a.zip
cpython-a3bae3369cca83ee485ed3293e2ad4c53691ba1a.tar.gz
cpython-a3bae3369cca83ee485ed3293e2ad4c53691ba1a.tar.bz2
Re-factor the SGMLParser class to use the new markupbase.ParserBase class.
Use a new internal method, error(), consistently to raise parse errors; the new base class also uses this. Adjust the parse_comment() method to return the new offset into the buffer instead of the number of characters scanned; this was the only helper method that did it this way, so we have better consistency now. Required to share the new base class. This fixes SF bug #448482 and #453706.
-rw-r--r-- Lib/sgmllib.py 109
1 files changed, 34 insertions, 75 deletions
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py
index f2a3020..2de7492 100644
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@@ -9,6 +9,7 @@
# not supported at all.
+import markupbase
import re
__all__ = ["SGMLParser"]
@@ -27,24 +28,14 @@ charref = re.compile('&#([0-9]+)[^0-9]')
starttagopen = re.compile('<[>a-zA-Z]')
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
-piopen = re.compile('<\?')
piclose = re.compile('>')
-endtagopen = re.compile('</[<>a-zA-Z]')
endbracket = re.compile('[<>]')
-special = re.compile('<![^<>]*>')
-commentopen = re.compile('<!--')
commentclose = re.compile(r'--\s*>')
-declopen = re.compile('<!')
-declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
-declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~\'"]*))?')
-decldata = re.compile(r'[^>\'\"]+')
-declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
-
class SGMLParseError(RuntimeError):
"""Exception raised for all parse errors."""
@@ -62,7 +53,7 @@ class SGMLParseError(RuntimeError):
# chunks). Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
-class SGMLParser:
+class SGMLParser(markupbase.ParserBase):
def __init__(self, verbose=0):
"""Initialize and reset this instance."""
@@ -76,6 +67,7 @@ class SGMLParser:
self.lasttag = '???'
self.nomoretags = 0
self.literal = 0
+ markupbase.ParserBase.reset(self)
def setnomoretags(self):
"""Enter literal mode (CDATA) till EOF.
@@ -106,6 +98,9 @@ class SGMLParser:
"""Handle the remaining data."""
self.goahead(1)
+ def error(self, message):
+ raise SGMLParseError(message)
+
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
@@ -119,9 +114,10 @@ class SGMLParser:
i = n
break
match = interesting.search(rawdata, i)
- if match: j = match.start(0)
+ if match: j = match.start()
else: j = n
- if i < j: self.handle_data(rawdata[i:j])
+ if i < j:
+ self.handle_data(rawdata[i:j])
i = j
if i == n: break
if rawdata[i] == '<':
@@ -134,36 +130,31 @@ class SGMLParser:
if k < 0: break
i = k
continue
- if endtagopen.match(rawdata, i):
+ if rawdata.startswith("</", i):
k = self.parse_endtag(i)
if k < 0: break
- i = k
+ i = k
self.literal = 0
continue
- if commentopen.match(rawdata, i):
- if self.literal:
- self.handle_data(rawdata[i])
+ if self.literal:
+ if n > (i + 1):
+ self.handle_data("<")
i = i+1
- continue
+ else:
+ # incomplete
+ break
+ continue
+ if rawdata.startswith("<!--", i):
k = self.parse_comment(i)
if k < 0: break
- i = i+k
+ i = k
continue
- if piopen.match(rawdata, i):
- if self.literal:
- self.handle_data(rawdata[i])
- i = i+1
- continue
+ if rawdata.startswith("<?", i):
k = self.parse_pi(i)
if k < 0: break
i = i+k
continue
- match = special.match(rawdata, i)
- if match:
- if self.literal:
- self.handle_data(rawdata[i])
- i = i+1
- continue
+ if rawdata.startswith("<!", i):
# This is some sort of declaration; in "HTML as
# deployed," this should only be the document type
# declaration ("<!DOCTYPE html...>").
@@ -191,7 +182,7 @@ class SGMLParser:
if rawdata[i-1] != ';': i = i-1
continue
else:
- raise SGMLParseError('neither < nor & ??')
+ self.error('neither < nor & ??')
# We get here only if incomplete matches but
# nothing else
match = incomplete.match(rawdata, i)
@@ -212,59 +203,26 @@ class SGMLParser:
# XXX if end: check for empty stack
# Internal -- parse comment, return new offset into the buffer or -1 if not terminated
- def parse_comment(self, i):
+ def parse_comment(self, i, report=1):
rawdata = self.rawdata
if rawdata[i:i+4] != '<!--':
- raise SGMLParseError('unexpected call to parse_comment()')
+ self.error('unexpected call to parse_comment()')
match = commentclose.search(rawdata, i+4)
if not match:
return -1
- j = match.start(0)
- self.handle_comment(rawdata[i+4: j])
- j = match.end(0)
- return j-i
+ if report:
+ j = match.start(0)
+ self.handle_comment(rawdata[i+4: j])
+ return match.end(0)
- # Internal -- parse declaration.
- def parse_declaration(self, i):
- # This is some sort of declaration; in "HTML as
- # deployed," this should only be the document type
- # declaration ("<!DOCTYPE html...>").
- rawdata = self.rawdata
- j = i + 2
- assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
- if rawdata[j:j+1] in ("-", ""):
- # Start of comment followed by buffer boundary,
- # or just a buffer boundary.
- return -1
- # in practice, this should look like: ((name|stringlit) S*)+ '>'
- n = len(rawdata)
- while j < n:
- c = rawdata[j]
- if c == ">":
- # end of declaration syntax
- self.handle_decl(rawdata[i+2:j])
- return j + 1
- if c in "\"'":
- m = declstringlit.match(rawdata, j)
- if not m:
- return -1 # incomplete
- j = m.end()
- elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
- m = declname.match(rawdata, j)
- if not m:
- return -1 # incomplete
- j = m.end()
- else:
- raise SGMLParseError(
- "unexpected char in declaration: %s" % `rawdata[j]`)
- # end of buffer between tokens
- return -1
+ # Extensions for the DOCTYPE scanner:
+ _decl_otherchars = '='
# Internal -- parse processing instr, return length or -1 if not terminated
def parse_pi(self, i):
rawdata = self.rawdata
if rawdata[i:i+2] != '<?':
- raise SGMLParseError('unexpected call to parse_pi()')
+ self.error('unexpected call to parse_pi()')
match = piclose.search(rawdata, i+2)
if not match:
return -1
@@ -311,7 +269,7 @@ class SGMLParser:
else:
match = tagfind.match(rawdata, i+1)
if not match:
- raise SGMLParseError('unexpected call to parse_starttag')
+ self.error('unexpected call to parse_starttag')
k = match.end(0)
tag = rawdata[i+1:k].lower()
self.lasttag = tag
@@ -465,6 +423,7 @@ class SGMLParser:
def unknown_endtag(self, tag): pass
def unknown_charref(self, ref): pass
def unknown_entityref(self, ref): pass
+ def unknown_decl(self, data): pass
class TestSGMLParser(SGMLParser):