From 48766512a0b438b66e97dfdfcb933cd104baeffe Mon Sep 17 00:00:00 2001
From: Guido van Rossum
Date: Thu, 28 Mar 1996 18:45:04 +0000
Subject: Reformatted with 4-space tab stops.

Allow '=' and '~' in unquoted attribute values.

Added overridable methods handle_starttag(tag, method, attrs) and
handle_endtag(tag, method) so subclasses can decide whether they
really want to call the method (e.g. when suppressing some portion of
the document).

Added support for a number of SGML shortcuts:

    shorthand           full notation
    <tag>...<>...       <tag>...<tag>...
    <tag>...</>         <tag>...</tag>
    <tag/.../           <tag>...</tag>
    <tag1<tag2>         <tag1><tag2>
    </tag1</tag2>       </tag1></tag2>
    </tag1<tag2>        </tag1><tag2>

This required factoring out some common actions and rationalizing the
interface to parse_endtag(), so as to make the code more readable.

Fixed syntax for &entity and &#char references so the trailing
semicolon is optional; removed explicit support for trailing period
(which was a TBL mistake in HTML 0.0).

Generalized the test program.

Tried to speed things up a little. (More to come after the profile
results are in.)

Fix error recovery: call the end methods popped from the stack instead
of the one that triggers. (Plus some complications because of the way
HTML extensions are handled in Grail.)
--- Lib/sgmllib.py | 692 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 406 insertions(+), 286 deletions(-) diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index b46f829..304bbdb 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -14,16 +14,28 @@ import string # Regular expressions used for parsing -incomplete = regex.compile( - '') +interesting = regex.compile('[&<]') +incomplete = regex.compile('&\([a-zA-Z][a-zA-Z0-9]*\|#[0-9]*\)?\|' + '<\([a-zA-Z][^<>]*\|' + '/\([a-zA-Z][^<>]*\)?\|' + '![^<>]*\)?') + +entityref = regex.compile('&\([a-zA-Z][a-zA-Z0-9]*\)[^a-zA-Z0-9]') +charref = regex.compile('&#\([0-9]+\)[^0-9]') + +starttagopen = regex.compile('<[>a-zA-Z]') +shorttagopen = regex.compile('<[a-zA-Z][a-zA-Z0-9]*/') +shorttag = regex.compile('<\([a-zA-Z][a-zA-Z0-9]*\)/\([^/]*\)/') +endtagopen = regex.compile('</[<>a-zA-Z]') +endbracket = regex.compile('[<>]') special = regex.compile('<![^<>]*>') commentopen = regex.compile('