diff options
Diffstat (limited to 'Lib/urllib')
-rw-r--r-- | Lib/urllib/parse.py | 70 | ||||
-rw-r--r-- | Lib/urllib/request.py | 60 | ||||
-rw-r--r-- | Lib/urllib/robotparser.py | 39 |
3 files changed, 163 insertions, 6 deletions
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 4d7fcec..99a6977 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -156,9 +156,8 @@ class _NetlocResultMixinBase(object): port = self._hostinfo[1] if port is not None: port = int(port, 10) - # Return None on an illegal port if not ( 0 <= port <= 65535): - return None + raise ValueError("Port out of range 0-65535") return port @@ -225,8 +224,71 @@ class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes): from collections import namedtuple _DefragResultBase = namedtuple('DefragResult', 'url fragment') -_SplitResultBase = namedtuple('SplitResult', 'scheme netloc path query fragment') -_ParseResultBase = namedtuple('ParseResult', 'scheme netloc path params query fragment') +_SplitResultBase = namedtuple( + 'SplitResult', 'scheme netloc path query fragment') +_ParseResultBase = namedtuple( + 'ParseResult', 'scheme netloc path params query fragment') + +_DefragResultBase.__doc__ = """ +DefragResult(url, fragment) + +A 2-tuple that contains the url without fragment identifier and the fragment +identifier as a separate argument. +""" + +_DefragResultBase.url.__doc__ = """The URL with no fragment identifier.""" + +_DefragResultBase.fragment.__doc__ = """ +Fragment identifier separated from URL, that allows indirect identification of a +secondary resource by reference to a primary resource and additional identifying +information. +""" + +_SplitResultBase.__doc__ = """ +SplitResult(scheme, netloc, path, query, fragment) + +A 5-tuple that contains the different components of a URL. Similar to +ParseResult, but does not split params. +""" + +_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request.""" + +_SplitResultBase.netloc.__doc__ = """ +Network location where the request is made to. +""" + +_SplitResultBase.path.__doc__ = """ +The hierarchical path, such as the path to a file to download. 
+""" + +_SplitResultBase.query.__doc__ = """ +The query component, that contains non-hierarchical data, that along with data +in path component, identifies a resource in the scope of URI's scheme and +network location. +""" + +_SplitResultBase.fragment.__doc__ = """ +Fragment identifier, that allows indirect identification of a secondary resource +by reference to a primary resource and additional identifying information. +""" + +_ParseResultBase.__doc__ = """ +ParseResult(scheme, netloc, path, params, query, fragment) + +A 6-tuple that contains components of a parsed URL. +""" + +_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__ +_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__ +_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__ +_ParseResultBase.params.__doc__ = """ +Parameters for last path element used to dereference the URI in order to provide +access to perform some operation on the resource. +""" + +_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__ +_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__ + # For backwards compatibility, alias _NetlocResultMixinStr # ResultBase is no longer part of the documented API, but it is diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index 4c16518..4c2b9fe 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -138,6 +138,66 @@ __version__ = sys.version[:3] _opener = None def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, *, cafile=None, capath=None, cadefault=False, context=None): + '''Open the URL url, which can be either a string or a Request object. + + *data* must be a bytes object specifying additional data to be sent to the + server, or None if no such data is needed. data may also be an iterable + object and in that case Content-Length value must be specified in the + headers. 
Currently HTTP requests are the only ones that use data; the HTTP + request will be a POST instead of a GET when the data parameter is + provided. + + *data* should be a buffer in the standard application/x-www-form-urlencoded + format. The urllib.parse.urlencode() function takes a mapping or sequence + of 2-tuples and returns an ASCII text string in this format. It should be + encoded to bytes before being used as the data parameter. + + urllib.request module uses HTTP/1.1 and includes a "Connection:close" + header in its HTTP requests. + + The optional *timeout* parameter specifies a timeout in seconds for + blocking operations like the connection attempt (if not specified, the + global default timeout setting will be used). This only works for HTTP, + HTTPS and FTP connections. + + If *context* is specified, it must be a ssl.SSLContext instance describing + the various SSL options. See HTTPSConnection for more details. + + The optional *cafile* and *capath* parameters specify a set of trusted CA + certificates for HTTPS requests. cafile should point to a single file + containing a bundle of CA certificates, whereas capath should point to a + directory of hashed certificate files. More information can be found in + ssl.SSLContext.load_verify_locations(). + + The *cadefault* parameter is ignored. + + For http and https urls, this function returns a http.client.HTTPResponse + object which has the following HTTPResponse Objects methods. 
+ + For ftp, file, and data urls and requests explicitly handled by legacy + URLopener and FancyURLopener classes, this function returns a + urllib.response.addinfourl object which can work as context manager and has + methods such as: + + * geturl() - return the URL of the resource retrieved, commonly used to + determine if a redirect was followed + + * info() - return the meta-information of the page, such as headers, in the + form of an email.message_from_string() instance (see Quick Reference to + HTTP Headers) + + * getcode() - return the HTTP status code of the response. Raises URLError + on errors. + + Note that *None* may be returned if no handler handles the request (though + the default installed global OpenerDirector uses UnknownHandler to ensure + this never happens). + + In addition, if proxy settings are detected (for example, when a *_proxy + environment variable like http_proxy is set), ProxyHandler is default + installed and makes sure the requests are handled through the proxy. 
+ + ''' global _opener if cafile or capath or cadefault: if context is not None: diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index 4fbb0cb..4ac553a 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -10,7 +10,9 @@ http://www.robotstxt.org/norobots-rfc.txt """ -import urllib.parse, urllib.request +import collections +import urllib.parse +import urllib.request __all__ = ["RobotFileParser"] @@ -120,10 +122,29 @@ class RobotFileParser: if state != 0: entry.rulelines.append(RuleLine(line[1], True)) state = 2 + elif line[0] == "crawl-delay": + if state != 0: + # before trying to convert to int we need to make + # sure that robots.txt has valid syntax otherwise + # it will crash + if line[1].strip().isdigit(): + entry.delay = int(line[1]) + state = 2 + elif line[0] == "request-rate": + if state != 0: + numbers = line[1].split('/') + # check if all values are sane + if (len(numbers) == 2 and numbers[0].strip().isdigit() + and numbers[1].strip().isdigit()): + req_rate = collections.namedtuple('req_rate', + 'requests seconds') + entry.req_rate = req_rate + entry.req_rate.requests = int(numbers[0]) + entry.req_rate.seconds = int(numbers[1]) + state = 2 if state == 2: self._add_entry(entry) - def can_fetch(self, useragent, url): """using the parsed robots.txt decide if useragent can fetch url""" if self.disallow_all: @@ -153,6 +174,18 @@ class RobotFileParser: # agent not found ==> access granted return True + def crawl_delay(self, useragent): + for entry in self.entries: + if entry.applies_to(useragent): + return entry.delay + return None + + def request_rate(self, useragent): + for entry in self.entries: + if entry.applies_to(useragent): + return entry.req_rate + return None + def __str__(self): return ''.join([str(entry) + "\n" for entry in self.entries]) @@ -180,6 +213,8 @@ class Entry: def __init__(self): self.useragents = [] self.rulelines = [] + self.delay = None + self.req_rate = None def __str__(self): ret = [] |