summaryrefslogtreecommitdiffstats
path: root/Lib/urllib.py
diff options
context:
space:
mode:
authorGuido van Rossum <guido@python.org>1994-03-22 12:05:32 (GMT)
committerGuido van Rossum <guido@python.org>1994-03-22 12:05:32 (GMT)
commit7c6ebb572ea73cc7873f5d8b3277fa43529953f5 (patch)
tree56ae4486e8c64532d4ed73586ca3c950db42cb9d /Lib/urllib.py
parentcc32ac9704128c799170e1cd7bdbfb3a90da43c1 (diff)
downloadcpython-7c6ebb572ea73cc7873f5d8b3277fa43529953f5.zip
cpython-7c6ebb572ea73cc7873f5d8b3277fa43529953f5.tar.gz
cpython-7c6ebb572ea73cc7873f5d8b3277fa43529953f5.tar.bz2
Renamed urlopen.py to urllib.py.
Diffstat (limited to 'Lib/urllib.py')
-rw-r--r--Lib/urllib.py454
1 files changed, 454 insertions, 0 deletions
diff --git a/Lib/urllib.py b/Lib/urllib.py
new file mode 100644
index 0000000..7350de6
--- /dev/null
+++ b/Lib/urllib.py
@@ -0,0 +1,454 @@
+# Open an arbitrary URL
+#
+# See the following document for a tentative description of URLs:
+# Uniform Resource Locators Tim Berners-Lee
+# INTERNET DRAFT CERN
+# IETF URL Working Group 14 July 1993
+# draft-ietf-uri-url-01.txt
+#
+# The object returned by URLopener().open(file) will differ per
+# protocol. All you know is that it has methods read(), readline(),
+# readlines(), fileno(), close() and info(). The read*(), fileno()
+# and close() methods work like those of open files.
+# The info() method returns an rfc822.Message object which can be
+# used to query various info about the object, if available.
+# (rfc822.Message objects are queried with the getheader() method.)
+
+import socket
+import regex
+
+
+# This really consists of two pieces:
+# (1) a class which handles opening of all sorts of URLs
+# (plus assorted utilities etc.)
+# (2) a set of functions for parsing URLs
+# XXX Should these be separated out into different modules?
+
+
+# Shortcut for basic usage
+# The shortcut functions share one module-level URLopener instance,
+# which is created the first time urlopen() or urlretrieve() is called.
+_urlopener = None
+# Open url through the shared URLopener and return whatever its open()
+# method returns for that URL.
+def urlopen(url):
+ global _urlopener
+ if not _urlopener:
+ _urlopener = URLopener()
+ return _urlopener.open(url)
+# Fetch url through the shared URLopener (created here if it does not
+# exist yet) and return the result of its retrieve() method, i.e. a
+# (filename, headers) pair.
+def urlretrieve(url):
+ global _urlopener
+ if not _urlopener:
+ _urlopener = URLopener()
+ return _urlopener.retrieve(url)
+# Call cleanup() on the shared URLopener to remove its temporary files.
+# Does nothing when no shared URLopener has been created.
+def urlcleanup():
+ if _urlopener:
+ _urlopener.cleanup()
+
+
+# Class to open URLs.
+# This is a class rather than just a subroutine because we may need
+# more than one set of global protocol-specific options.
+# ftpcache is the module-level cache of FTP connections: open_ftp()
+# keys it by (host, port) and stores ftpwrapper objects in it. Every
+# URLopener refers to this same dictionary unless its .ftpcache
+# attribute is reassigned.
+ftpcache = {}
+class URLopener:
+
+ # Constructor
+ # addheaders: list of argument tuples, each later passed to putheader()
+ # tempcache: maps a URL to the (filename, headers) pair stored by retrieve()
+ def __init__(self):
+ self.addheaders = []
+ self.tempcache = {}
+ self.ftpcache = ftpcache
+ # Undocumented feature: you can use a different
+ # ftp cache by assigning to the .ftpcache member;
+ # in case you want logically independent URL openers
+
+ # Destructor: same effect as close()
+ def __del__(self):
+ self.close()
+
+ # close() only calls cleanup()
+ def close(self):
+ self.cleanup()
+
+ # Delete every temporary file recorded in tempcache and drop its entry.
+ # An os.error from unlink() is ignored; this also covers a file that
+ # retrieve() listed under two keys and that was already removed.
+ def cleanup(self):
+ import os
+ for url in self.tempcache.keys():
+ try:
+ os.unlink(self.tempcache[url][0])
+ except os.error:
+ pass
+ del self.tempcache[url]
+
+ # Add a header to be used by the HTTP interface only
+ # e.g. u.addheader('Accept', 'sound/basic')
+ # The arguments are stored as one tuple and applied to putheader()
+ # by open_http().
+ def addheader(self, *args):
+ self.addheaders.append(args)
+
+ # External interface
+ # Use URLopener().open(file) instead of open(file, 'r')
+ # The URL is unwrapped and its type split off; a URL without a type
+ # is treated as type 'file'. The call is dispatched to the method
+ # named open_<type>, with any '-' in the name replaced by '_'.
+ # Raises IOError when no such method exists, and converts a
+ # socket.error raised by the method into an IOError.
+ def open(self, url):
+ type, url = splittype(unwrap(url))
+ if not type: type = 'file'
+ name = 'open_' + type
+ if '-' in name:
+ import regsub
+ name = regsub.gsub('-', '_', name)
+ if not hasattr(self, name):
+ raise IOError, ('url error', 'unknown url type', type)
+ try:
+ return getattr(self, name)(url)
+ except socket.error, msg:
+ raise IOError, ('socket error', msg)
+
+ # External interface
+ # retrieve(url) returns (filename, None) for a local object
+ # or (tempfilename, headers) for a remote object
+ # Remote results are remembered in tempcache keyed by the URL as
+ # given; a URL whose unwrapped form is already cached is added as a
+ # second key for the same entry. Local files are not cached.
+ def retrieve(self, url):
+ if self.tempcache.has_key(url):
+ return self.tempcache[url]
+ url1 = unwrap(url)
+ if self.tempcache.has_key(url1):
+ self.tempcache[url] = self.tempcache[url1]
+ return self.tempcache[url1]
+ type, url1 = splittype(url1)
+ if not type or type == 'file':
+ # Only checks that the file can be opened locally; the
+ # file object itself is discarded and the path returned.
+ try:
+ fp = self.open_local_file(url1)
+ del fp
+ return splithost(url1)[1], None
+ except IOError, msg:
+ pass
+ fp = self.open(url)
+ headers = fp.info()
+ import tempfile
+ tfn = tempfile.mktemp()
+ # The entry is recorded before the data is copied: cleanup() will
+ # remove the file even if the copy below raises, but the entry
+ # also stays cached in that case.
+ self.tempcache[url] = result = tfn, headers
+ # NOTE(review): opened in text mode ('w'); presumably this should
+ # be binary on platforms that distinguish the two -- confirm
+ tfp = open(tfn, 'w')
+ bs = 1024*8
+ block = fp.read(bs)
+ while block:
+ tfp.write(block)
+ block = fp.read(bs)
+ # No explicit close() calls: presumably relies on the objects
+ # being closed when their last reference goes away -- confirm
+ del fp
+ del tfp
+ return result
+
+ # Each method named open_<type> knows how to open that type of URL
+
+ # Use HTTP protocol
+ # Sends a GET request for the selector with every header registered
+ # through addheader(). Returns the reply file wrapped in addinfo
+ # when the reply code is 200; any other code raises IOError.
+ # NOTE(review): nothing here explicitly ends the header section
+ # before getreply() is called -- confirm httplib.HTTP handles that
+ def open_http(self, url):
+ import httplib
+ host, selector = splithost(url)
+ h = httplib.HTTP(host)
+ h.putrequest('GET', selector)
+ for args in self.addheaders: apply(h.putheader, args)
+ errcode, errmsg, headers = h.getreply()
+ if errcode == 200: return addinfo(h.getfile(), headers)
+ else: raise IOError, ('http error', errcode, errmsg, headers)
+
+ # Use Gopher protocol
+ # The gopher type character is split off the selector but not used
+ # afterwards. A selector with a '?query' part is sent with
+ # send_query(), otherwise with send_selector(). The result carries
+ # empty headers.
+ def open_gopher(self, url):
+ import gopherlib
+ host, selector = splithost(url)
+ type, selector = splitgophertype(selector)
+ selector, query = splitquery(selector)
+ if query: fp = gopherlib.send_query(selector, query, host)
+ else: fp = gopherlib.send_selector(selector, host)
+ return addinfo(fp, noheaders())
+
+ # Use local file or FTP depending on form of URL
+ # The local file is tried first; any IOError from that attempt makes
+ # this fall back to open_ftp().
+ def open_file(self, url):
+ try:
+ return self.open_local_file(url)
+ except IOError:
+ return self.open_ftp(url)
+
+ # Use local file
+ # A URL without a host part is opened directly. A URL with a host
+ # is opened only when it has no port and the host resolves to the
+ # address of localhost() or thishost(); otherwise IOError is raised.
+ def open_local_file(self, url):
+ host, file = splithost(url)
+ if not host: return addinfo(open(file, 'r'), noheaders())
+ host, port = splitport(host)
+ if not port and socket.gethostbyname(host) in (
+ localhost(), thishost()):
+ return addinfo(open(file, 'r'), noheaders())
+ raise IOError, ('local file error', 'not on local host')
+
+ # Use FTP protocol
+ # Requires a host part. The host name is resolved to an address and
+ # the port defaults to ftplib.FTP_PORT; the (address, port) pair is
+ # the key into self.ftpcache, where one ftpwrapper per key is kept
+ # and reused. Errors listed by ftperrors() are re-raised as IOError.
+ def open_ftp(self, url):
+ host, file = splithost(url)
+ if not host: raise IOError, ('ftp error', 'no host given')
+ host, port = splitport(host)
+ host = socket.gethostbyname(host)
+ if not port:
+ import ftplib
+ port = ftplib.FTP_PORT
+ key = (host, port)
+ try:
+ if not self.ftpcache.has_key(key):
+ self.ftpcache[key] = ftpwrapper(host, port)
+ return addinfo(self.ftpcache[key].retrfile(file),
+ noheaders())
+ except ftperrors(), msg:
+ raise IOError, ('ftp error', msg)
+
+
+# Utility functions
+
+# Return the IP address of the magic hostname 'localhost'
+# The lookup is done on the first call only; the result is kept in the
+# module-level variable _localhost and returned by later calls.
+_localhost = None
+def localhost():
+ global _localhost
+ if not _localhost:
+ _localhost = socket.gethostbyname('localhost')
+ return _localhost
+
+# Return the IP address of the current host
+# The name from socket.gethostname() is resolved on the first call only;
+# the result is kept in the module-level variable _thishost.
+_thishost = None
+def thishost():
+ global _thishost
+ if not _thishost:
+ _thishost = socket.gethostbyname(socket.gethostname())
+ return _thishost
+
+# Return the set of errors raised by the FTP class
+# The result is a tuple of four ftplib exceptions, suitable for use in
+# an except clause. It is built on the first call (which is also when
+# ftplib gets imported) and kept in the module-level _ftperrors.
+_ftperrors = None
+def ftperrors():
+ global _ftperrors
+ if not _ftperrors:
+ import ftplib
+ _ftperrors = (ftplib.error_reply,
+ ftplib.error_temp,
+ ftplib.error_perm,
+ ftplib.error_proto)
+ return _ftperrors
+
+# Return an empty rfc822.Message object
+# The object is created on the first call by parsing '/dev/null' and is
+# then shared by all callers through the module-level _noheaders.
+# NOTE(review): depends on '/dev/null' being openable -- presumably
+# limits this to Unix-like systems; confirm
+_noheaders = None
+def noheaders():
+ global _noheaders
+ if not _noheaders:
+ import rfc822
+ _noheaders = rfc822.Message(open('/dev/null', 'r'))
+ _noheaders.fp.close() # Recycle file descriptor
+ return _noheaders
+
+
+# Utility classes
+
+# Class used by open_ftp() for cache of open FTP connections
+# An instance remembers host and port so that it can log in again when
+# the existing connection turns out to be unusable.
+class ftpwrapper:
+ # Store the connection parameters and connect right away
+ def __init__(self, host, port):
+ self.host = host
+ self.port = port
+ self.init()
+ # (Re)create the FTP object, connect to host/port and log in with
+ # the default arguments of ftplib's login()
+ def init(self):
+ import ftplib
+ self.ftp = ftplib.FTP()
+ self.ftp.connect(self.host, self.port)
+ self.ftp.login()
+ # Start a transfer for file and return a file-like object for it.
+ # First 'RETR file' is tried; when that is refused with a permission
+ # error starting with '550', or when file is empty, a 'LIST' of the
+ # name is requested instead. Other permission errors on RETR are
+ # raised as IOError.
+ def retrfile(self, file):
+ import ftplib
+ # Select binary transfer type; if this command fails with any
+ # ftplib error, reconnect once and issue it again.
+ try:
+ self.ftp.voidcmd('TYPE I')
+ except ftplib.all_errors:
+ self.init()
+ self.ftp.voidcmd('TYPE I')
+ conn = None
+ if file:
+ try:
+ cmd = 'RETR ' + file
+ conn = self.ftp.transfercmd(cmd)
+ except ftplib.error_perm, reason:
+ # reason is sliced like a string here; presumably
+ # it is the server's reply text -- confirm
+ if reason[:3] != '550':
+ raise IOError, ('ftp error', reason)
+ if not conn:
+ # Try a directory listing
+ if file: cmd = 'LIST ' + file
+ else: cmd = 'LIST'
+ conn = self.ftp.transfercmd(cmd)
+ # voidresp is installed as the close hook, so it is called when
+ # the returned object is closed.
+ return addclosehook(conn.makefile('r'), self.ftp.voidresp)
+
+# Base class for addinfo and addclosehook
+# Wraps a file-like object fp and exposes its read, readline, readlines
+# and fileno methods as attributes of the wrapper.
+class addbase:
+ # Keep fp and copy its bound methods onto this instance
+ def __init__(self, fp):
+ self.fp = fp
+ self.read = self.fp.read
+ self.readline = self.fp.readline
+ self.readlines = self.fp.readlines
+ self.fileno = self.fp.fileno
+ # Show the class name, the id of this object and the repr of fp
+ def __repr__(self):
+ return '<%s at %s whose fp = %s>' % (
+ self.__class__.__name__, `id(self)`, `self.fp`)
+ def __del__(self):
+ self.close()
+ # Drop all references to fp and its methods. Note that this does
+ # not call fp.close() itself; it only lets go of the object.
+ def close(self):
+ self.read = None
+ self.readline = None
+ self.readlines = None
+ self.fileno = None
+ self.fp = None
+
+# Class to add a close hook to an open file
+# closehook is called with hookargs the first time close() is called.
+class addclosehook(addbase):
+ def __init__(self, fp, closehook, *hookargs):
+ addbase.__init__(self, fp)
+ self.closehook = closehook
+ self.hookargs = hookargs
+ # Call the hook (at most once, since it is cleared afterwards) and
+ # then do the base class close()
+ def close(self):
+ if self.closehook:
+ apply(self.closehook, self.hookargs)
+ self.closehook = None
+ self.hookargs = None
+ addbase.close(self)
+
+# Class to add an info() method to an open file
+# headers is stored as given and handed back unchanged by info().
+class addinfo(addbase):
+ def __init__(self, fp, headers):
+ addbase.__init__(self, fp)
+ self.headers = headers
+ # Return the headers object passed to the constructor
+ def info(self):
+ return self.headers
+
+
+# Utility to combine a URL with a base URL to form a new URL
+
+# basejoin(base, url) returns url unchanged when it already has a type.
+# Otherwise the type comes from base (or 'file' when base has none),
+# the host from url or else from base, and a path that does not start
+# with '/' is appended to the directory part of base's path. The tag
+# and query of base are split off and discarded.
+def basejoin(base, url):
+ type, path = splittype(url)
+ if type: return url
+ host, path = splithost(path)
+ basetype, basepath = splittype(base)
+ basehost, basepath = splithost(basepath)
+ basepath, basetag = splittag(basepath)
+ basepath, basequery = splitquery(basepath)
+ type = basetype or 'file'
+ if path[:1] != '/':
+ # Relative path: keep base's path up to and including its
+ # last '/', or use '/' when it contains none
+ import string
+ i = string.rfind(basepath, '/')
+ if i < 0: basepath = '/'
+ else: basepath = basepath[:i+1]
+ path = basepath + path
+ if not host: host = basehost
+ if host: return type + '://' + host + path
+ else: return type + ':' + path
+
+
+# Utilities to parse URLs:
+# unwrap('<URL:type://host/path>') --> 'type://host/path'
+# splittype('type:opaquestring') --> 'type', 'opaquestring'
+# splithost('//host[:port]/path') --> 'host[:port]', '/path'
+# splitport('host:port') --> 'host', 'port'
+# splitquery('/path?query') --> '/path', 'query'
+# splittag('/path#tag') --> '/path', 'tag'
+# splitgophertype('/Xselector') --> 'X', 'selector'
+# unquote('abc%20def') --> 'abc def'
+# quote('abc def') --> 'abc%20def'
+
+# Strip surrounding whitespace, then an enclosing '<' ... '>' pair if
+# present, then a leading 'URL:' prefix if present, and return the rest.
+# Whitespace is stripped again after each of these steps.
+def unwrap(url):
+ import string
+ url = string.strip(url)
+ if url[:1] == '<' and url[-1:] == '>':
+ url = string.strip(url[1:-1])
+ if url[:4] == 'URL:': url = string.strip(url[4:])
+ return url
+
+# Pattern: a non-empty type without '/' or ':', then ':', then the rest
+_typeprog = regex.compile('^\([^/:]+\):\(.*\)$')
+# Return (type, rest) when url matches the pattern above; otherwise
+# return (None, url) with url unchanged.
+def splittype(url):
+ if _typeprog.match(url) >= 0: return _typeprog.group(1, 2)
+ return None, url
+
+# Pattern: '//', then a non-empty host part without '/', then the rest
+_hostprog = regex.compile('^//\([^/]+\)\(.*\)$')
+# Return (host, rest) when url starts with '//host'; otherwise return
+# (None, url) with url unchanged. The host part may still include a
+# ':port' suffix.
+def splithost(url):
+ if _hostprog.match(url) >= 0: return _hostprog.group(1, 2)
+ return None, url
+
+# Pattern: anything, then ':', then one or more digits at the very end
+_portprog = regex.compile('^\(.*\):\([0-9]+\)$')
+# Return (host, port) when host ends in ':digits'; the port is returned
+# as a string of digits. Otherwise return (host, None).
+def splitport(host):
+ if _portprog.match(host) >= 0: return _portprog.group(1, 2)
+ return host, None
+
+# Pattern: anything, then '?', then a tail that contains no '?'
+# (so the split happens at the last '?' in the string)
+_queryprog = regex.compile('^\(.*\)\?\([^?]*\)$')
+# Return (path, query) when url contains a '?'; otherwise (url, None).
+def splitquery(url):
+ if _queryprog.match(url) >= 0: return _queryprog.group(1, 2)
+ return url, None
+
+# Pattern: anything, then '#', then a tail that contains no '#'
+# (so the split happens at the last '#' in the string)
+_tagprog = regex.compile('^\(.*\)#\([^#]*\)$')
+# Return (path, tag) when url contains a '#'; otherwise (url, None).
+def splittag(url):
+ if _tagprog.match(url) >= 0: return _tagprog.group(1, 2)
+ return url, None
+
+# When selector starts with '/' followed by at least one more character,
+# return that character and the remainder after it. Otherwise return
+# (None, selector) with selector unchanged.
+def splitgophertype(selector):
+ if selector[:1] == '/' and selector[1:2]:
+ return selector[1], selector[2:]
+ return None, selector
+
+# Pattern: '%' followed by exactly two hexadecimal digits
+_quoteprog = regex.compile('%[0-9a-fA-F][0-9a-fA-F]')
+# Return s with every '%XX' escape replaced by the character whose code
+# is the hexadecimal number XX. A '%' that is not followed by two hex
+# digits does not match the pattern and is copied through unchanged.
+def unquote(s):
+ # NOTE(review): 'string' is imported here but not used below
+ import string
+ i = 0
+ n = len(s)
+ res = ''
+ while 0 <= i < n:
+ j = _quoteprog.search(s, i)
+ if j < 0:
+ # No further escape: copy the remainder and stop
+ res = res + s[i:]
+ break
+ # j is the position of the '%'; eval() only ever sees '0x'
+ # plus the two characters the pattern matched as hex digits
+ res = res + (s[i:j] + chr(eval('0x' + s[j+1:j+3])))
+ i = j+3
+ return res
+
+# Characters that quote() copies through unchanged: ASCII letters,
+# digits and the four characters '-', '.', '_' and '@'
+_acceptable = \
+ 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._@'
+# Return s with every character outside _acceptable replaced by '%'
+# followed by its character code as two lowercase hexadecimal digits.
+def quote(s):
+ res = ''
+ for c in s:
+ if c in _acceptable: res = res + c
+ else: res = res + '%%%02x' % ord(c)
+ return res
+
+# Test and time quote() and unquote()
+# The test string holds all 256 character codes, repeated four times.
+# It is quoted and unquoted again; if the round trip does not give back
+# the original, the three strings are printed. The elapsed time for
+# both calls together is always printed.
+def test1():
+ import time
+ s = ''
+ for i in range(256): s = s + chr(i)
+ s = s*4
+ t0 = time.time()
+ qs = quote(s)
+ uqs = unquote(qs)
+ t1 = time.time()
+ if uqs != s:
+ print 'Wrong!'
+ print `s`
+ print `qs`
+ print `uqs`
+ print round(t1 - t0, 3), 'sec'
+
+
+# Test program
+# Retrieves each URL given on the command line, or a built-in list of
+# sample URLs when none is given. For every URL it prints the file name
+# and headers returned by urlretrieve(), then the headers one per line
+# when there are any, then the retrieved data with '\r' characters
+# removed. urlcleanup() is called at the end, also after an error.
+def test():
+ import sys
+ import regsub
+ args = sys.argv[1:]
+ if not args:
+ # Defaults: one URL for each form handled by URLopener
+ args = [
+ '/etc/passwd',
+ 'file:/etc/passwd',
+ 'file://localhost/etc/passwd',
+ 'ftp://ftp.cwi.nl/etc/passwd',
+ 'gopher://gopher.cwi.nl/11/',
+ 'http://www.cwi.nl/index.html',
+ ]
+ try:
+ for url in args:
+ print '-'*10, url, '-'*10
+ fn, h = urlretrieve(url)
+ print fn, h
+ if h:
+ print '======'
+ for k in h.keys(): print k + ':', h[k]
+ print '======'
+ fp = open(fn, 'r')
+ data = fp.read()
+ del fp
+ print regsub.gsub('\r', '', data)
+ fn, h = None, None
+ print '-'*40
+ finally:
+ urlcleanup()
+
+# Run test program when run as a script
+# test1() exercises quote()/unquote(); test() then retrieves URLs
+if __name__ == '__main__':
+ test1()
+ test()