diff options
Diffstat (limited to 'Lib/urlopen.py')
-rwxr-xr-x | Lib/urlopen.py | 222 |
1 files changed, 222 insertions, 0 deletions
diff --git a/Lib/urlopen.py b/Lib/urlopen.py new file mode 100755 index 0000000..c24b3f9 --- /dev/null +++ b/Lib/urlopen.py @@ -0,0 +1,222 @@ +# Open an arbitrary URL +# +# See the following document for a tentative description of URLs: +# Uniform Resource Locators Tim Berners-Lee +# INTERNET DRAFT CERN +# IETF URL Working Group 14 July 1993 +# draft-ietf-uri-url-01.txt +# +# The object returned by urlopen() will differ per protocol. +# All you know is that is has methods read(), fileno(), close() and info(). +# The read(), fileno() and close() methods work like those of open files. +# The info() method returns an rfc822.Message object which can be +# used to query various info about the object, if available. +# (rfc822.Message objects are queried with the getheader() method.) + +import socket +import regex +import regsub +import string +import rfc822 +import ftplib + + +# External interface -- use urlopen(file) as if it were open(file, 'r') +def urlopen(url): + url = string.strip(url) + if url[:1] == '<' and url[-1:] == '>': url = string.strip(url[1:-1]) + if url[:4] == 'URL:': url = string.strip(url[4:]) + type, url = splittype(url) + if not type: type = 'file' + type = regsub.gsub('-', '_', type) + try: + func = eval('open_' + type) + except NameError: + raise IOError, ('url error', 'unknown url type', type) + try: + return func(url) + except socket.error, msg: + raise IOError, ('socket error', msg) + + +# Each routine of the form open_<type> knows how to open that type of URL + +# Use HTTP protocol +def open_http(url): + import httplib + host, selector = splithost(url) + h = httplib.HTTP(host) + h.putrequest('GET', selector) + errcode, errmsg, headers = h.getreply() + if errcode == 200: return makefile(h.getfile(), headers) + else: raise IOError, ('http error', errcode, errmsg, headers) + +# Empty rfc822.Message object +noheaders = rfc822.Message(open('/dev/null', 'r')) +noheaders.fp.close() # Recycle file descriptor + +# Use Gopher protocol +def open_gopher(url): + import gopherlib + host, selector = splithost(url) + type, selector = splitgophertype(selector) + selector, query = splitquery(selector) + if query: fp = gopherlib.send_query(selector, query, host) + else: fp = gopherlib.send_selector(selector, host) + return makefile(fp, noheaders) + +# Use local file or FTP depending on form of URL +localhost = socket.gethostbyname('localhost') +thishost = socket.gethostbyname(socket.gethostname()) +def open_file(url): + host, file = splithost(url) + if not host: return makefile(open(file, 'r'), noheaders) + host, port = splitport(host) + if not port and socket.gethostbyname(host) in (localhost, thishost): + try: fp = open(file, 'r') + except IOError: fp = None + if fp: return makefile(fp, noheaders) + return open_ftp(url) + +# Use FTP protocol +ftpcache = {} +ftperrors = (ftplib.error_reply, + ftplib.error_temp, + ftplib.error_perm, + ftplib.error_proto) +def open_ftp(url): + host, file = splithost(url) + host, port = splitport(host) + host = socket.gethostbyname(host) + if not port: port = ftplib.FTP_PORT + key = (host, port) + try: + if not ftpcache.has_key(key): + ftpcache[key] = ftpwrapper(host, port) + return makefile(ftpcache[key].retrfile(file), noheaders) + except ftperrors, msg: + raise IOError, ('ftp error', msg) + + +# Utility classes + +# Class used to add an info() method to a file object +class makefile: + def __init__(self, fp, headers): + self.fp = fp + self.headers = headers + self.read = self.fp.read + self.fileno = self.fp.fileno + self.close = self.fp.close + def info(self): + return self.headers + +# Class used by open_ftp() for cache of open FTP connections +class ftpwrapper: + def __init__(self, host, port): + self.host = host + self.port = port + self.init() + def init(self): + self.ftp = ftplib.FTP() + self.ftp.connect(self.host, self.port) + self.ftp.login() + def retrfile(self, file): + try: + self.ftp.voidcmd('TYPE I') + except ftplib.all_errors: + self.init() + self.ftp.voidcmd('TYPE I') + conn = None + if file: + try: + cmd = 'RETR ' + file + conn = self.ftp.transfercmd(cmd) + except ftplib.error_perm, reason: + if reason[:3] != '550': + raise IOError, ('ftp error', reason) + if not conn: + # Try a directory listing + if file: cmd = 'NLST ' + file + else: cmd = 'NLST' + conn = self.ftp.transfercmd(cmd) + return fakefile(self.ftp, conn) + +# Class used by ftpwrapper to handle response when transfer is complete +class fakefile: + def __init__(self, ftp, conn): + self.ftp = ftp + self.conn = conn + self.fp = self.conn.makefile('r') + self.read = self.fp.read + self.fileno = self.fp.fileno + def __del__(self): + self.close() + def close(self): + self.conn = None + self.fp = None + self.read = None + if self.ftp: self.ftp.voidresp() + self.ftp = None + + +# Utilities to split url parts into components: +# splittype('type:opaquestring') --> 'type', 'opaquestring' +# splithost('//host[:port]/path') --> 'host[:port]', '/path' +# splitport('host:port') --> 'host', 'port' +# splitquery('/path?query') --> '/path', 'query' +# splittag('/path#tag') --> '/path', 'tag' +# splitgophertype('/Xselector') --> 'X', 'selector' + +typeprog = regex.compile('^\([^/:]+\):\(.*\)$') +def splittype(url): + if typeprog.match(url) >= 0: return typeprog.group(1, 2) + return None, url + +hostprog = regex.compile('^//\([^/]+\)\(.*\)$') +def splithost(url): + if hostprog.match(url) >= 0: return hostprog.group(1, 2) + return None, url + +portprog = regex.compile('^\(.*\):\([0-9]+\)$') +def splitport(host): + if portprog.match(host) >= 0: return portprog.group(1, 2) + return host, None + +queryprog = regex.compile('^\(.*\)\?\([^?]*\)$') +def splitquery(url): + if queryprog.match(url) >= 0: return queryprog.group(1, 2) + return url, None + +tagprog = regex.compile('^\(.*\)#\([^#]*\)$') +def splittag(url): + if tagprog.match(url) >= 0: return tagprog.group(1, 2) + return url, None + +def splitgophertype(selector): + if selector[:1] == '/' and selector[1:2]: + return selector[1], selector[2:] + return None, selector + + +# Test program +def test(): + import sys + args = sys.argv[1:] + if not args: + args = [ + '/etc/passwd', + 'file:/etc/passwd', + 'file://localhost/etc/passwd', + 'ftp://ftp.cwi.nl/etc/passwd', + 'gopher://gopher.cwi.nl/11/', + 'http://www.cwi.nl/index.html', + ] + for arg in args: + print '-'*10, arg, '-'*10 + print regsub.gsub('\r', '', urlopen(arg).read()) + print '-'*40 + +# Run test program when run as a script +if __name__ == '__main__': + test() |