Diffstat (limited to 'Lib/html')
-rw-r--r--  Lib/html/entities.py |   3
-rw-r--r--  Lib/html/parser.py   | 114
2 files changed, 16 insertions, 101 deletions
diff --git a/Lib/html/entities.py b/Lib/html/entities.py
index e891ad6..cbf4f76 100644
--- a/Lib/html/entities.py
+++ b/Lib/html/entities.py
@@ -1,5 +1,8 @@
 """HTML character entity references."""
 
+__all__ = ['html5', 'name2codepoint', 'codepoint2name', 'entitydefs']
+
+
 # maps the HTML entity name to the Unicode codepoint
 name2codepoint = {
     'AElig': 0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index a650d5e..390d4cc 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -29,35 +29,15 @@ starttagopen = re.compile('<[a-zA-Z]')
 piclose = re.compile('>')
 commentclose = re.compile(r'--\s*>')
 # Note:
-# 1) the strict attrfind isn't really strict, but we can't make it
-#    correctly strict without breaking backward compatibility;
-# 2) if you change tagfind/attrfind remember to update locatestarttagend too;
-# 3) if you change tagfind/attrfind and/or locatestarttagend the parser will
+# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
+# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
 #    explode, so don't do it.
-tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
 # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
 # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
 tagfind_tolerant = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
-attrfind = re.compile(
-    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
-    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
 attrfind_tolerant = re.compile(
     r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
     r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
-locatestarttagend = re.compile(r"""
-  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
-  (?:\s+                             # whitespace before attribute name
-    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
-      (?:\s*=\s*                     # value indicator
-        (?:'[^']*'                   # LITA-enclosed value
-          |\"[^\"]*\"                # LIT-enclosed value
-          |[^'\">\s]+                # bare value
-         )
-       )?
-     )
-   )*
-  \s*                                # trailing whitespace
-""", re.VERBOSE)
 locatestarttagend_tolerant = re.compile(r"""
   <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
   (?:[\s/]*                          # optional whitespace before attribute name
@@ -79,25 +59,6 @@ endendtag = re.compile('>')
 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
 
 
-class HTMLParseError(Exception):
-    """Exception raised for all parse errors."""
-
-    def __init__(self, msg, position=(None, None)):
-        assert msg
-        self.msg = msg
-        self.lineno = position[0]
-        self.offset = position[1]
-
-    def __str__(self):
-        result = self.msg
-        if self.lineno is not None:
-            result = result + ", at line %d" % self.lineno
-        if self.offset is not None:
-            result = result + ", column %d" % (self.offset + 1)
-        return result
-
-
-_default_sentinel = object()
 class HTMLParser(_markupbase.ParserBase):
     """Find tags and other markup and call handler functions.
@@ -123,27 +84,12 @@ class HTMLParser(_markupbase.ParserBase):
 
     CDATA_CONTENT_ELEMENTS = ("script", "style")
 
-    def __init__(self, strict=_default_sentinel, *,
-                 convert_charrefs=_default_sentinel):
+    def __init__(self, *, convert_charrefs=True):
         """Initialize and reset this instance.
 
-        If convert_charrefs is True (default: False), all character references
+        If convert_charrefs is True (the default), all character references
         are automatically converted to the corresponding Unicode characters.
 
-        If strict is set to False (the default) the parser will parse invalid
-        markup, otherwise it will raise an error.  Note that the strict mode
-        and argument are deprecated.
""" - if strict is not _default_sentinel: - warnings.warn("The strict argument and mode are deprecated.", - DeprecationWarning, stacklevel=2) - else: - strict = False # default - self.strict = strict - if convert_charrefs is _default_sentinel: - convert_charrefs = False # default - warnings.warn("The value of convert_charrefs will become True in " - "3.5. You are encouraged to set the value explicitly.", - DeprecationWarning, stacklevel=2) self.convert_charrefs = convert_charrefs self.reset() @@ -168,11 +114,6 @@ class HTMLParser(_markupbase.ParserBase): """Handle any buffered data.""" self.goahead(1) - def error(self, message): - warnings.warn("The 'error' method is deprecated.", - DeprecationWarning, stacklevel=2) - raise HTMLParseError(message, self.getpos()) - __starttag_text = None def get_starttag_text(self): @@ -227,10 +168,7 @@ class HTMLParser(_markupbase.ParserBase): elif startswith("<?", i): k = self.parse_pi(i) elif startswith("<!", i): - if self.strict: - k = self.parse_declaration(i) - else: - k = self.parse_html_declaration(i) + k = self.parse_html_declaration(i) elif (i + 1) < n: self.handle_data("<") k = i + 1 @@ -239,8 +177,6 @@ class HTMLParser(_markupbase.ParserBase): if k < 0: if not end: break - if self.strict: - self.error("EOF in middle of construct") k = rawdata.find('>', i + 1) if k < 0: k = rawdata.find('<', i + 1) @@ -282,13 +218,10 @@ class HTMLParser(_markupbase.ParserBase): if match: # match.group() will contain at least 2 chars if end and match.group() == rawdata[i:]: - if self.strict: - self.error("EOF in middle of entity or char ref") - else: - k = match.end() - if k <= i: - k = n - i = self.updatepos(i, i + 1) + k = match.end() + if k <= i: + k = n + i = self.updatepos(i, i + 1) # incomplete break elif (i + 1) < n: @@ -367,18 +300,12 @@ class HTMLParser(_markupbase.ParserBase): # Now parse the data between i+1 and j into a tag and attrs attrs = [] - if self.strict: - match = tagfind.match(rawdata, i+1) - else: - match = tagfind_tolerant.match(rawdata, i+1) + match = tagfind_tolerant.match(rawdata, i+1) assert match, 'unexpected call to parse_starttag()' k = match.end() self.lasttag = tag = match.group(1).lower() while k < endpos: - if self.strict: - m = attrfind.match(rawdata, k) - else: - m = attrfind_tolerant.match(rawdata, k) + m = attrfind_tolerant.match(rawdata, k) if not m: break attrname, rest, attrvalue = m.group(1, 2, 3) @@ -401,9 +328,6 @@ class HTMLParser(_markupbase.ParserBase): - self.__starttag_text.rfind("\n") else: offset = offset + len(self.__starttag_text) - if self.strict: - self.error("junk characters in start tag: %r" - % (rawdata[k:endpos][:20],)) self.handle_data(rawdata[i:endpos]) return endpos if end.endswith('/>'): @@ -419,10 +343,7 @@ class HTMLParser(_markupbase.ParserBase): # or -1 if incomplete. 
     def check_for_whole_start_tag(self, i):
         rawdata = self.rawdata
-        if self.strict:
-            m = locatestarttagend.match(rawdata, i)
-        else:
-            m = locatestarttagend_tolerant.match(rawdata, i)
+        m = locatestarttagend_tolerant.match(rawdata, i)
         if m:
             j = m.end()
             next = rawdata[j:j+1]
@@ -435,9 +356,6 @@ class HTMLParser(_markupbase.ParserBase):
                     # buffer boundary
                     return -1
                 # else bogus input
-                if self.strict:
-                    self.updatepos(i, j + 1)
-                    self.error("malformed empty start tag")
                 if j > i:
                     return j
                 else:
@@ -450,9 +368,6 @@ class HTMLParser(_markupbase.ParserBase):
                 # end of input in or before attribute value, or we have the
                 # '/' from a '/>' ending
                 return -1
-            if self.strict:
-                self.updatepos(i, j)
-                self.error("malformed start tag")
             if j > i:
                 return j
             else:
@@ -472,8 +387,6 @@ class HTMLParser(_markupbase.ParserBase):
         if self.cdata_elem is not None:
             self.handle_data(rawdata[i:gtpos])
             return gtpos
-        if self.strict:
-            self.error("bad end tag: %r" % (rawdata[i:gtpos],))
         # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
         namematch = tagfind_tolerant.match(rawdata, i+2)
         if not namematch:
@@ -539,8 +452,7 @@ class HTMLParser(_markupbase.ParserBase):
         pass
 
     def unknown_decl(self, data):
-        if self.strict:
-            self.error("unknown declaration: %r" % (data,))
+        pass
 
     # Internal -- helper to remove special character quoting
     def unescape(self, s):
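
The entities.py hunk only adds an explicit __all__ to html.entities. A quick interpreter check of what that exposes (not part of the patch; the expected output is shown in comments):

    import html.entities

    # __all__ now names the public mappings explicitly
    print(html.entities.__all__)
    # ['html5', 'name2codepoint', 'codepoint2name', 'entitydefs']

    # the mappings themselves are unchanged, e.g. the first entry from the hunk above
    print(hex(html.entities.name2codepoint['AElig']))   # 0xc6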
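For the parser.py side, a minimal usage sketch against the post-patch API (the LinkCollector subclass and the sample markup are invented for illustration): HTMLParser() no longer accepts strict=, HTMLParseError and error() are gone, and convert_charrefs defaults to True, so character references reach handle_data() already decoded.

    from html.parser import HTMLParser

    class LinkCollector(HTMLParser):
        """Hypothetical subclass: collect href values and text content."""

        def __init__(self):
            super().__init__()          # convert_charrefs defaults to True now
            self.links = []
            self.text = []

        def handle_starttag(self, tag, attrs):
            if tag == "a":
                self.links.extend(value for name, value in attrs if name == "href")

        def handle_data(self, data):
            # "&amp;" already arrives as "&" because convert_charrefs is True
            self.text.append(data)

    p = LinkCollector()
    p.feed('<p>See <a href="https://example.com">this &amp; that</a></p>')
    p.close()
    print(p.links)          # ['https://example.com']
    print("".join(p.text))  # See this & that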