3 files changed, 163 insertions, 6 deletions
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 4d7fcec..99a6977 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -156,9 +156,8 @@ class _NetlocResultMixinBase(object):
         port = self._hostinfo[1]
         if port is not None:
             port = int(port, 10)
-            # Return None on an illegal port
             if not ( 0 <= port <= 65535):
-                return None
+                raise ValueError("Port out of range 0-65535")
         return port
 
 
@@ -225,8 +224,71 @@ class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
 from collections import namedtuple
 
 _DefragResultBase = namedtuple('DefragResult', 'url fragment')
-_SplitResultBase = namedtuple('SplitResult', 'scheme netloc path query fragment')
-_ParseResultBase = namedtuple('ParseResult', 'scheme netloc path params query fragment')
+_SplitResultBase = namedtuple(
+    'SplitResult', 'scheme netloc path query fragment')
+_ParseResultBase = namedtuple(
+    'ParseResult', 'scheme netloc path params query fragment')
+
+_DefragResultBase.__doc__ = """
+DefragResult(url, fragment)
+
+A 2-tuple that contains the url without fragment identifier and the fragment
+identifier as a separate argument.
+"""
+
+_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""
+
+_DefragResultBase.fragment.__doc__ = """
+Fragment identifier separated from URL, that allows indirect identification of a
+secondary resource by reference to a primary resource and additional identifying
+information.
+"""
+
+_SplitResultBase.__doc__ = """
+SplitResult(scheme, netloc, path, query, fragment)
+
+A 5-tuple that contains the different components of a URL. Similar to
+ParseResult, but does not split params.
+"""
+
+_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""
+
+_SplitResultBase.netloc.__doc__ = """
+Network location where the request is made to.
+"""
+
+_SplitResultBase.path.__doc__ = """
+The hierarchical path, such as the path to a file to download.
+"""
+
+_SplitResultBase.query.__doc__ = """
+The query component, which contains non-hierarchical data that, along with
+data in the path component, identifies a resource in the scope of the URI's
+scheme and network location.
+"""
+
+_SplitResultBase.fragment.__doc__ = """
+Fragment identifier, that allows indirect identification of a secondary resource
+by reference to a primary resource and additional identifying information.
+"""
+
+_ParseResultBase.__doc__ = """
+ParseResult(scheme, netloc, path, params, query, fragment)
+
+A 6-tuple that contains components of a parsed URL.
+"""
+
+_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
+_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
+_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
+_ParseResultBase.params.__doc__ = """
+Parameters for last path element used to dereference the URI in order to provide
+access to perform some operation on the resource.
+"""
+
+_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
+_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__
+
 
 # For backwards compatibility, alias _NetlocResultMixinStr
 # ResultBase is no longer part of the documented API, but it is
diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py
index 4c16518..4c2b9fe 100644
--- a/Lib/urllib/request.py
+++ b/Lib/urllib/request.py
@@ -138,6 +138,66 @@ __version__ = sys.version[:3]
 _opener = None
 def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
             *, cafile=None, capath=None, cadefault=False, context=None):
+    '''Open the URL url, which can be either a string or a Request object.
+
+    *data* must be a bytes object specifying additional data to be sent to the
+    server, or None if no such data is needed. data may also be an iterable
+    object and in that case Content-Length value must be specified in the
+    headers. Currently HTTP requests are the only ones that use data; the HTTP
+    request will be a POST instead of a GET when the data parameter is
+    provided.
+
+    *data* should be a buffer in the standard application/x-www-form-urlencoded
+    format. The urllib.parse.urlencode() function takes a mapping or sequence
+    of 2-tuples and returns an ASCII text string in this format. It should be
+    encoded to bytes before being used as the data parameter.
+
+    urllib.request module uses HTTP/1.1 and includes a "Connection:close"
+    header in its HTTP requests.
+
+    The optional *timeout* parameter specifies a timeout in seconds for
+    blocking operations like the connection attempt (if not specified, the
+    global default timeout setting will be used). This only works for HTTP,
+    HTTPS and FTP connections.
+
+    If *context* is specified, it must be a ssl.SSLContext instance describing
+    the various SSL options. See HTTPSConnection for more details.
+
+    The optional *cafile* and *capath* parameters specify a set of trusted CA
+    certificates for HTTPS requests. cafile should point to a single file
+    containing a bundle of CA certificates, whereas capath should point to a
+    directory of hashed certificate files. More information can be found in
+    ssl.SSLContext.load_verify_locations().
+
+    The *cadefault* parameter is ignored.
+
+    For http and https urls, this function returns a http.client.HTTPResponse
+    object (see the HTTPResponse Objects documentation for its methods).
+
+    For ftp, file, and data urls and requests explicitly handled by legacy
+    URLopener and FancyURLopener classes, this function returns a
+    urllib.response.addinfourl object which can work as context manager and has
+    methods such as:
+
+    * geturl() - return the URL of the resource retrieved, commonly used to
+      determine if a redirect was followed
+
+    * info() - return the meta-information of the page, such as headers, in the
+      form of an email.message_from_string() instance (see Quick Reference to
+      HTTP Headers)
+
+    * getcode() - return the HTTP status code of the response.  Raises URLError
+      on errors.
+
+    Note that *None* may be returned if no handler handles the request (though
+    the default installed global OpenerDirector uses UnknownHandler to ensure
+    this never happens).
+
+    In addition, if proxy settings are detected (for example, when a *_proxy
+    environment variable like http_proxy is set), ProxyHandler is installed
+    by default and makes sure the requests are handled through the proxy.
+
+    '''
     global _opener
     if cafile or capath or cadefault:
         if context is not None:
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 4fbb0cb..4ac553a 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -10,7 +10,9 @@
     http://www.robotstxt.org/norobots-rfc.txt
 """
 
-import urllib.parse, urllib.request
+import collections
+import urllib.parse
+import urllib.request
 
 __all__ = ["RobotFileParser"]
 
@@ -120,10 +122,29 @@ class RobotFileParser:
                     if state != 0:
                         entry.rulelines.append(RuleLine(line[1], True))
                         state = 2
+                elif line[0] == "crawl-delay":
+                    if state != 0:
+                        # before trying to convert to int we need to make
+                        # sure that robots.txt has valid syntax otherwise
+                        # it will crash
+                        if line[1].strip().isdigit():
+                            entry.delay = int(line[1])
+                        state = 2
+                elif line[0] == "request-rate":
+                    if state != 0:
+                        numbers = line[1].split('/')
+                        # check if all values are sane
+                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
+                            and numbers[1].strip().isdigit()):
+                            req_rate = collections.namedtuple('req_rate',
+                                                              'requests seconds')
+                            # store an instance, never the namedtuple class
+                            entry.req_rate = req_rate(int(numbers[0]),
+                                                      int(numbers[1]))
+                        state = 2
         if state == 2:
             self._add_entry(entry)
 
-
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
         if self.disallow_all:
@@ -153,6 +174,18 @@ class RobotFileParser:
         # agent not found ==> access granted
         return True
 
+    def crawl_delay(self, useragent):
+        """Return the delay of the first entry that applies to useragent."""
+        # next() falls back to None when no entry applies to useragent
+        hits = (e.delay for e in self.entries if e.applies_to(useragent))
+        return next(hits, None)
+
+    def request_rate(self, useragent):
+        """Return the req_rate of the first entry that applies to useragent."""
+        # next() falls back to None when no entry applies to useragent
+        hits = (e.req_rate for e in self.entries if e.applies_to(useragent))
+        return next(hits, None)
+
     def __str__(self):
         return ''.join([str(entry) + "\n" for entry in self.entries])
 
@@ -180,6 +213,8 @@ class Entry:
     def __init__(self):
         self.useragents = []
         self.rulelines = []
+        self.delay = None  # int from a "crawl-delay" line, else None
+        self.req_rate = None  # parsed "request-rate" value, else None
 
     def __str__(self):
         ret = []