Change some comments into docstrings.

Fix handling of hexadecimal character references (legal in XHTML) so that they are properly interpreted as character references. This fixes SF bug #445196.
author: Fred Drake <fdrake@acm.org> 2001-08-03 19:50:59 (GMT)
committer: Fred Drake <fdrake@acm.org> 2001-08-03 19:50:59 (GMT)
commit: 1d4601d30677ca8ac0fa70a3b57ea54d153ede4a (patch)
tree: b184d3a39a235c21cf55fbb5401d2f88af426845 /Lib
parent: a0a7706916b018630973a2b9b2f118048429882a (diff)
download: cpython-1d4601d30677ca8ac0fa70a3b57ea54d153ede4a.zip
cpython-1d4601d30677ca8ac0fa70a3b57ea54d153ede4a.tar.gz
cpython-1d4601d30677ca8ac0fa70a3b57ea54d153ede4a.tar.bz2
1 file changed, 31 insertions, 27 deletions
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index 339c132..39a5d82 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -1,4 +1,4 @@
-"""A parser for HTML."""
+"""A parser for HTML and XHTML."""
 
 # This file is based on sgmllib.py, but the API is slightly different.
 
@@ -18,7 +18,7 @@ interesting_cdata = re.compile(r'<(/|\Z)')
 incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?')
 
 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
-charref = re.compile('&#([0-9]+)[^0-9]')
+charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
 
 starttagopen = re.compile('<[a-zA-Z]')
 piopen = re.compile(r'<\?')
@@ -73,32 +73,35 @@ class HTMLParseError(Exception):
         return result
 
 
-# HTML parser class -- find tags and call handler functions.
-# Usage:
-#
-#     p = HTMLParser(); p.feed(data); ...; p.close()
-
-# Start tags are handled by calling self.handle_starttag() or
-# self.handle_startendtag(); end tags by self.handle_endtag().  The
-# data between tags is passed from the parser to the derived class by
-# calling self.handle_data() with the data as argument (the data may
-# be split up in arbitrary chunks).  Entity references are passed by
-# calling self.handle_entityref() with the entity reference as the
-# argument.  Numeric character references are passed to
-# self.handle_charref() with the string containing the reference as
-# the argument.
-
 class HTMLParser:
+    """Find tags and other markup and call handler functions.
+
+    Usage:
+        p = HTMLParser()
+        p.feed(data)
+        ...
+        p.close()
+
+    Start tags are handled by calling self.handle_starttag() or
+    self.handle_startendtag(); end tags by self.handle_endtag().  The
+    data between tags is passed from the parser to the derived class
+    by calling self.handle_data() with the data as argument (the data
+    may be split up in arbitrary chunks).  Entity references are
+    passed by calling self.handle_entityref() with the entity
+    reference as the argument.  Numeric character references are
+    passed to self.handle_charref() with the string containing the
+    reference as the argument.
+    """
 
     CDATA_CONTENT_ELEMENTS = ("script", "style")
 
 
-    # Interface -- initialize and reset this instance
     def __init__(self):
+        """Initialize and reset this instance."""
         self.reset()
 
-    # Interface -- reset this instance.  Loses all unprocessed data
     def reset(self):
+        """Reset this instance.  Loses all unprocessed data."""
         self.rawdata = ''
         self.stack = []
         self.lasttag = '???'
@@ -106,16 +109,17 @@ class HTMLParser:
         self.offset = 0
         self.interesting = interesting_normal
 
-    # Interface -- feed some data to the parser.  Call this as
-    # often as you want, with as little or as much text as you
-    # want (may include '\n').  (This just saves the text, all the
-    # processing is done by goahead().)
     def feed(self, data):
+        """Feed data to the parser.
+
+        Call this as often as you want, with as little or as much text
+        as you want (may include '\\n').
+        """
         self.rawdata = self.rawdata + data
         self.goahead(0)
 
-    # Interface -- handle the remaining data
     def close(self):
+        """Handle any buffered data."""
         self.goahead(1)
 
     # Internal -- update line number and offset.  This should be
@@ -135,14 +139,14 @@ class HTMLParser:
             self.offset = self.offset + j-i
         return j
 
-    # Interface -- return current line number and offset.
     def getpos(self):
+        """Return current line number and offset."""
         return self.lineno, self.offset
 
     __starttag_text = None
 
-    # Interface -- return full source of start tag: "<...>"
     def get_starttag_text(self):
+        """Return full source of start tag: '<...>'."""
         return self.__starttag_text
 
     def set_cdata_mode(self):
@@ -195,7 +199,7 @@ class HTMLParser:
             elif rawdata[i] == '&':
                 match = charref.match(rawdata, i)
                 if match:
-                    name = match.group(1)
+                    name = match.group()[2:-1]
                     self.handle_charref(name)
                     k = match.end()
                     if rawdata[k-1] != ';':
author	Fred Drake <fdrake@acm.org>	2001-08-03 19:50:59 (GMT)
committer	Fred Drake <fdrake@acm.org>	2001-08-03 19:50:59 (GMT)
commit	1d4601d30677ca8ac0fa70a3b57ea54d153ede4a (patch)
tree	b184d3a39a235c21cf55fbb5401d2f88af426845 /Lib
parent	a0a7706916b018630973a2b9b2f118048429882a (diff)
download	cpython-1d4601d30677ca8ac0fa70a3b57ea54d153ede4a.zip cpython-1d4601d30677ca8ac0fa70a3b57ea54d153ede4a.tar.gz cpython-1d4601d30677ca8ac0fa70a3b57ea54d153ede4a.tar.bz2