summaryrefslogtreecommitdiffstats
path: root/Lib/urllib
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/urllib')
-rw-r--r-- Lib/urllib/parse.py 70
-rw-r--r-- Lib/urllib/request.py 60
-rw-r--r-- Lib/urllib/robotparser.py 39
3 files changed, 163 insertions, 6 deletions
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 4d7fcec..99a6977 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -156,9 +156,8 @@ class _NetlocResultMixinBase(object):
port = self._hostinfo[1]
if port is not None:
port = int(port, 10)
- # Return None on an illegal port
if not ( 0 <= port <= 65535):
- return None
+ raise ValueError("Port out of range 0-65535")
return port
@@ -225,8 +224,71 @@ class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
from collections import namedtuple
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
-_SplitResultBase = namedtuple('SplitResult', 'scheme netloc path query fragment')
-_ParseResultBase = namedtuple('ParseResult', 'scheme netloc path params query fragment')
+_SplitResultBase = namedtuple(
+ 'SplitResult', 'scheme netloc path query fragment')
+_ParseResultBase = namedtuple(
+ 'ParseResult', 'scheme netloc path params query fragment')
+
+_DefragResultBase.__doc__ = """
+DefragResult(url, fragment)
+
+A 2-tuple that contains the url without fragment identifier and the fragment
+identifier as a separate argument.
+"""
+
+_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""
+
+_DefragResultBase.fragment.__doc__ = """
+Fragment identifier separated from URL, that allows indirect identification of a
+secondary resource by reference to a primary resource and additional identifying
+information.
+"""
+
+_SplitResultBase.__doc__ = """
+SplitResult(scheme, netloc, path, query, fragment)
+
+A 5-tuple that contains the different components of a URL. Similar to
+ParseResult, but does not split params.
+"""
+
+_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""
+
+_SplitResultBase.netloc.__doc__ = """
+Network location where the request is made to.
+"""
+
+_SplitResultBase.path.__doc__ = """
+The hierarchical path, such as the path to a file to download.
+"""
+
+_SplitResultBase.query.__doc__ = """
+The query component, that contains non-hierarchical data, that along with data
+in path component, identifies a resource in the scope of URI's scheme and
+network location.
+"""
+
+_SplitResultBase.fragment.__doc__ = """
+Fragment identifier, that allows indirect identification of a secondary resource
+by reference to a primary resource and additional identifying information.
+"""
+
+_ParseResultBase.__doc__ = """
+ParseResult(scheme, netloc, path, params, query, fragment)
+
+A 6-tuple that contains components of a parsed URL.
+"""
+
+_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
+_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
+_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
+_ParseResultBase.params.__doc__ = """
+Parameters for last path element used to dereference the URI in order to provide
+access to perform some operation on the resource.
+"""
+
+_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
+_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__
+
# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py
index 4c16518..4c2b9fe 100644
--- a/Lib/urllib/request.py
+++ b/Lib/urllib/request.py
@@ -138,6 +138,66 @@ __version__ = sys.version[:3]
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
*, cafile=None, capath=None, cadefault=False, context=None):
+ '''Open the URL url, which can be either a string or a Request object.
+
+ *data* must be a bytes object specifying additional data to be sent to the
+ server, or None if no such data is needed. data may also be an iterable
+ object and in that case Content-Length value must be specified in the
+ headers. Currently HTTP requests are the only ones that use data; the HTTP
+ request will be a POST instead of a GET when the data parameter is
+ provided.
+
+ *data* should be a buffer in the standard application/x-www-form-urlencoded
+ format. The urllib.parse.urlencode() function takes a mapping or sequence
+ of 2-tuples and returns an ASCII text string in this format. It should be
+ encoded to bytes before being used as the data parameter.
+
+ The urllib.request module uses HTTP/1.1 and includes a "Connection: close"
+ header in its HTTP requests.
+
+ The optional *timeout* parameter specifies a timeout in seconds for
+ blocking operations like the connection attempt (if not specified, the
+ global default timeout setting will be used). This only works for HTTP,
+ HTTPS and FTP connections.
+
+ If *context* is specified, it must be a ssl.SSLContext instance describing
+ the various SSL options. See HTTPSConnection for more details.
+
+ The optional *cafile* and *capath* parameters specify a set of trusted CA
+ certificates for HTTPS requests. cafile should point to a single file
+ containing a bundle of CA certificates, whereas capath should point to a
+ directory of hashed certificate files. More information can be found in
+ ssl.SSLContext.load_verify_locations().
+
+ The *cadefault* parameter is ignored.
+
+ For http and https urls, this function returns a http.client.HTTPResponse
+ object which has the methods documented under "HTTPResponse Objects".
+
+ For ftp, file, and data urls and requests explicitly handled by legacy
+ URLopener and FancyURLopener classes, this function returns a
+ urllib.response.addinfourl object which can work as context manager and has
+ methods such as:
+
+ * geturl() - return the URL of the resource retrieved, commonly used to
+ determine if a redirect was followed
+
+ * info() - return the meta-information of the page, such as headers, in the
+ form of an email.message_from_string() instance (see Quick Reference to
+ HTTP Headers)
+
+ * getcode() - return the HTTP status code of the response. Raises URLError
+ on errors.
+
+ Note that *None* may be returned if no handler handles the request (though
+ the default installed global OpenerDirector uses UnknownHandler to ensure
+ this never happens).
+
+ In addition, if proxy settings are detected (for example, when a *_proxy
+ environment variable like http_proxy is set), ProxyHandler is installed by
+ default and makes sure the requests are handled through the proxy.
+
+ '''
global _opener
if cafile or capath or cadefault:
if context is not None:
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 4fbb0cb..4ac553a 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -10,7 +10,9 @@
http://www.robotstxt.org/norobots-rfc.txt
"""
-import urllib.parse, urllib.request
+import collections
+import urllib.parse
+import urllib.request
__all__ = ["RobotFileParser"]
@@ -120,10 +122,29 @@ class RobotFileParser:
if state != 0:
entry.rulelines.append(RuleLine(line[1], True))
state = 2
+ elif line[0] == "crawl-delay":
+ if state != 0:
+ # before trying to convert to int we need to make
+ # sure that robots.txt has valid syntax otherwise
+ # it will crash
+ if line[1].strip().isdigit():
+ entry.delay = int(line[1])
+ state = 2
+ elif line[0] == "request-rate":
+ if state != 0:
+ numbers = line[1].split('/')
+ # check if all values are sane
+ if (len(numbers) == 2 and numbers[0].strip().isdigit()
+ and numbers[1].strip().isdigit()):
+ req_rate = collections.namedtuple('req_rate',
+ 'requests seconds')
+ entry.req_rate = req_rate
+ entry.req_rate.requests = int(numbers[0])
+ entry.req_rate.seconds = int(numbers[1])
+ state = 2
if state == 2:
self._add_entry(entry)
-
def can_fetch(self, useragent, url):
"""using the parsed robots.txt decide if useragent can fetch url"""
if self.disallow_all:
@@ -153,6 +174,18 @@ class RobotFileParser:
# agent not found ==> access granted
return True
+ def crawl_delay(self, useragent):
+ for entry in self.entries:
+ if entry.applies_to(useragent):
+ return entry.delay
+ return None
+
+ def request_rate(self, useragent):
+ for entry in self.entries:
+ if entry.applies_to(useragent):
+ return entry.req_rate
+ return None
+
def __str__(self):
return ''.join([str(entry) + "\n" for entry in self.entries])
@@ -180,6 +213,8 @@ class Entry:
def __init__(self):
self.useragents = []
self.rulelines = []
+ self.delay = None
+ self.req_rate = None
def __str__(self):
ret = []