1 files changed, 55 insertions, 44 deletions
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py
index 3e85a91..3020d11 100644
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@@ -29,11 +29,16 @@ starttagopen = re.compile('<[>a-zA-Z]')
 shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
 shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
 piclose = re.compile('>')
-endbracket = re.compile('[<>]')
+starttag = re.compile(r'<(?:(?=>)|[a-zA-Z][-_.:a-zA-Z0-9]*\s*('  # '<' of SGML shorthand '<>', or '<' + tag name
+        r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
+        r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]'
+        r'[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?'
+    r')*\s*/?\s*(?=[<>]))')  # attributes, optional '/', stopping just before the closing '<' or '>'
+endtag = re.compile(r'</?[a-zA-Z][-_.:a-zA-Z0-9]*\s*/?\s*(?=[<>])')
 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
-    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
+    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
 
 
 class SGMLParseError(RuntimeError):
@@ -53,6 +58,10 @@ class SGMLParseError(RuntimeError):
 # self.handle_entityref() with the entity reference as argument.
 
 class SGMLParser(markupbase.ParserBase):
+    # Pattern for entity and character references -- derived classes may override
+    entity_or_charref = re.compile('&(?:'
+      '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
+      ')(;?)')
 
     def __init__(self, verbose=0):
         """Initialize and reset this instance."""
@@ -245,11 +254,10 @@ class SGMLParser(markupbase.ParserBase):
             self.finish_shorttag(tag, data)
             self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
             return k
-        # XXX The following should skip matching quotes (' or ")
-        match = endbracket.search(rawdata, i+1)
+        match = starttag.match(rawdata, i)
         if not match:
             return -1
-        j = match.start(0)
+        j = match.end(0)
         # Now parse the data between i+1 and j into a tag and attrs
         attrs = []
         if rawdata[i:i+2] == '<>':
@@ -274,32 +282,8 @@ class SGMLParser(markupbase.ParserBase):
                     attrvalue[:1] == '"' == attrvalue[-1:]):
                     # strip quotes
                     attrvalue = attrvalue[1:-1]
-                l = 0
-                new_attrvalue = ''
-                while l < len(attrvalue):
-                    av_match = entityref.match(attrvalue, l)
-                    if (av_match and av_match.group(1) in self.entitydefs and
-                        attrvalue[av_match.end(1)] == ';'):
-                        # only substitute entityrefs ending in ';' since
-                        # otherwise we may break <a href='?p=x&q=y'>
-                        # which is very common
-                        new_attrvalue += self.entitydefs[av_match.group(1)]
-                        l = av_match.end(0)
-                        continue
-                    ch_match = charref.match(attrvalue, l)
-                    if ch_match:
-                        try:
-                            char = chr(int(ch_match.group(1)))
-                            new_attrvalue += char
-                            l = ch_match.end(0)
-                            continue
-                        except ValueError:
-                            # invalid character reference, don't substitute
-                            pass
-                    # all other cases
-                    new_attrvalue += attrvalue[l]
-                    l += 1
-                attrvalue = new_attrvalue
+                attrvalue = self.entity_or_charref.sub(
+                    self._convert_ref, attrvalue)
             attrs.append((attrname.lower(), attrvalue))
             k = match.end(0)
         if rawdata[j] == '>':
@@ -308,13 +292,24 @@ class SGMLParser(markupbase.ParserBase):
         self.finish_starttag(tag, attrs)
         return j
 
+    # Internal -- re.sub callback: convert one entity or character reference, keeping the original text when the conversion returns a false value
+    def _convert_ref(self, match):
+        if match.group(2):  # character reference; group 2 holds the digits (default entity_or_charref)
+            return self.convert_charref(match.group(2)) or \
+                '&#%s%s' % match.groups()[1:]
+        elif match.group(3):  # entity reference terminated by ';'
+            return self.convert_entityref(match.group(1)) or \
+                '&%s;' % match.group(1)
+        else:  # entity name without ';' -- never converted, text is returned as matched
+            return '&%s' % match.group(1)
+
     # Internal -- parse endtag
     def parse_endtag(self, i):
         rawdata = self.rawdata
-        match = endbracket.search(rawdata, i+1)
+        match = endtag.match(rawdata, i)
         if not match:
             return -1
-        j = match.start(0)
+        j = match.end(0)
         tag = rawdata[i+2:j].strip().lower()
         if rawdata[j] == '>':
             j = j+1
@@ -391,35 +386,51 @@ class SGMLParser(markupbase.ParserBase):
             print '*** Unbalanced </' + tag + '>'
             print '*** Stack:', self.stack
 
-    def handle_charref(self, name):
-        """Handle character reference, no need to override."""
+    def convert_charref(self, name):
+        """Convert character reference, may be overridden."""
         try:
             n = int(name)
         except ValueError:
-            self.unknown_charref(name)
             return
         if not 0 <= n <= 255:
-            self.unknown_charref(name)
             return
-        self.handle_data(chr(n))
+        return self.convert_codepoint(n)
+
+    def convert_codepoint(self, codepoint):  # convert_charref only passes 0 <= codepoint <= 255
+        return chr(codepoint)
+
+    def handle_charref(self, name):
+        """Handle character reference, no need to override."""
+        replacement = self.convert_charref(name)
+        if replacement is None:
+            self.unknown_charref(name)
+        else:
+            self.handle_data(replacement)
 
     # Definition of entities -- derived classes may override
     entitydefs = \
             {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
 
-    def handle_entityref(self, name):
-        """Handle entity references.
+    def convert_entityref(self, name):
+        """Convert entity references.
 
-        There should be no need to override this method; it can be
-        tailored by setting up the self.entitydefs mapping appropriately.
+        As an alternative to overriding this method, one can tailor the
+        results by setting up the self.entitydefs mapping appropriately.
         """
         table = self.entitydefs
         if name in table:
-            self.handle_data(table[name])
+            return table[name]
         else:
-            self.unknown_entityref(name)
             return
 
+    def handle_entityref(self, name):
+        """Handle entity references, no need to override."""
+        replacement = self.convert_entityref(name)
+        if replacement is None:  # no conversion known for this entity name
+            self.unknown_entityref(name)
+        else:
+            self.handle_data(replacement)  # reuse the result; do not convert twice
+
     # Example -- handle data, should be overridden
     def handle_data(self, data):
         pass