-rwxr-xr-x  Lib/urlopen.py  410
1 file changed, 0 insertions, 410 deletions
diff --git a/Lib/urlopen.py b/Lib/urlopen.py
deleted file mode 100755
index c43b7f4..0000000
--- a/Lib/urlopen.py
+++ /dev/null
@@ -1,410 +0,0 @@
-# Open an arbitrary URL
-#
-# See the following document for a tentative description of URLs:
-# Uniform Resource Locators Tim Berners-Lee
-# INTERNET DRAFT CERN
-# IETF URL Working Group 14 July 1993
-# draft-ietf-uri-url-01.txt
-#
-# The object returned by URLopener().open(file) will differ per
-# protocol. All you know is that it has methods read(), readline(),
-# readlines(), fileno(), close() and info(). The read*(), fileno()
-# and close() methods work like those of open files.
-# The info() method returns an rfc822.Message object which can be
-# used to query various info about the object, if available.
-# (rfc822.Message objects are queried with the getheader() method.)
-
-import socket
-import regex
-
-
-# This really consists of two pieces:
-# (1) a class which handles opening of all sorts of URLs
-# (plus assorted utilities etc.)
-# (2) a set of functions for parsing URLs
-# XXX Should these be separated out into different modules?
-
-
-# Shortcut for basic usage
-_urlopener = None
-def urlopen(url):
- global _urlopener
- if not _urlopener:
- _urlopener = URLopener()
- return _urlopener.open(url)
-def urlretrieve(url):
- global _urlopener
- if not _urlopener:
- _urlopener = URLopener()
- return _urlopener.retrieve(url)
-def urlcleanup():
- if _urlopener:
- _urlopener.cleanup()
-
-
-# Class to open URLs.
-# This is a class rather than just a subroutine because we may need
-# more than one set of global protocol-specific options.
-ftpcache = {}
-class URLopener:
-
- # Constructor
- def __init__(self):
- self.addheaders = []
- self.tempcache = {}
- self.ftpcache = ftpcache
- # Undocumented feature: you can use a different
- # ftp cache by assigning to the .ftpcache member;
- # in case you want logically independent URL openers
-
- def __del__(self):
- self.close()
-
- def close(self):
- self.cleanup()
-
- def cleanup(self):
- import os
- for url in self.tempcache.keys():
- try:
- os.unlink(self.tempcache[url][0])
- except os.error:
- pass
- del self.tempcache[url]
-
- # Add a header to be used by the HTTP interface only
- # e.g. u.addheader('Accept', 'sound/basic')
- def addheader(self, *args):
- self.addheaders.append(args)
-
- # External interface
- # Use URLopener().open(file) instead of open(file, 'r')
- def open(self, url):
- type, url = splittype(unwrap(url))
- if not type: type = 'file'
- name = 'open_' + type
- if '-' in name:
- import regsub
- name = regsub.gsub('-', '_', name)
- if not hasattr(self, name):
- raise IOError, ('url error', 'unknown url type', type)
- try:
- return getattr(self, name)(url)
- except socket.error, msg:
- raise IOError, ('socket error', msg)
-
- # External interface
- # retrieve(url) returns (filename, None) for a local object
- # or (tempfilename, headers) for a remote object
- def retrieve(self, url):
- if self.tempcache.has_key(url):
- return self.tempcache[url]
- url1 = unwrap(url)
- if self.tempcache.has_key(url1):
- self.tempcache[url] = self.tempcache[url1]
- return self.tempcache[url1]
- type, url1 = splittype(url1)
- if not type or type == 'file':
- try:
- fp = self.open_local_file(url1)
- del fp
- return splithost(url1)[1], None
- except IOError, msg:
- pass
- fp = self.open(url)
- headers = fp.info()
- import tempfile
- tfn = tempfile.mktemp()
- self.tempcache[url] = result = tfn, headers
- tfp = open(tfn, 'w')
- bs = 1024*8
- block = fp.read(bs)
- while block:
- tfp.write(block)
- block = fp.read(bs)
- del fp
- del tfp
- return result
-
- # Each method named open_<type> knows how to open that type of URL
-
- # Use HTTP protocol
- def open_http(self, url):
- import httplib
- host, selector = splithost(url)
- h = httplib.HTTP(host)
- h.putrequest('GET', selector)
- for args in self.addheaders: apply(h.putheader, args)
- errcode, errmsg, headers = h.getreply()
- if errcode == 200: return addinfo(h.getfile(), headers)
- else: raise IOError, ('http error', errcode, errmsg, headers)
-
- # Use Gopher protocol
- def open_gopher(self, url):
- import gopherlib
- host, selector = splithost(url)
- type, selector = splitgophertype(selector)
- selector, query = splitquery(selector)
- if query: fp = gopherlib.send_query(selector, query, host)
- else: fp = gopherlib.send_selector(selector, host)
- return addinfo(fp, noheaders())
-
- # Use local file or FTP depending on form of URL
- def open_file(self, url):
- try:
- return self.open_local_file(url)
- except IOError:
- return self.open_ftp(url)
-
- # Use local file
- def open_local_file(self, url):
- host, file = splithost(url)
- if not host: return addinfo(open(file, 'r'), noheaders())
- host, port = splitport(host)
- if not port and socket.gethostbyname(host) in (
- localhost(), thishost()):
- return addinfo(open(file, 'r'), noheaders())
- raise IOError, ('local file error', 'not on local host')
-
- # Use FTP protocol
- def open_ftp(self, url):
- host, file = splithost(url)
- if not host: raise IOError, ('ftp error', 'no host given')
- host, port = splitport(host)
- host = socket.gethostbyname(host)
- if not port:
- import ftplib
- port = ftplib.FTP_PORT
- key = (host, port)
- try:
- if not self.ftpcache.has_key(key):
- self.ftpcache[key] = ftpwrapper(host, port)
- return addinfo(self.ftpcache[key].retrfile(file),
- noheaders())
- except ftperrors(), msg:
- raise IOError, ('ftp error', msg)
-
-
-# Utility functions
-
-# Return the IP address of the magic hostname 'localhost'
-_localhost = None
-def localhost():
- global _localhost
- if not _localhost:
- _localhost = socket.gethostbyname('localhost')
- return _localhost
-
-# Return the IP address of the current host
-_thishost = None
-def thishost():
- global _thishost
- if not _thishost:
- _thishost = socket.gethostbyname(socket.gethostname())
- return _thishost
-
-# Return the set of errors raised by the FTP class
-_ftperrors = None
-def ftperrors():
- global _ftperrors
- if not _ftperrors:
- import ftplib
- _ftperrors = (ftplib.error_reply,
- ftplib.error_temp,
- ftplib.error_perm,
- ftplib.error_proto)
- return _ftperrors
-
-# Return an empty rfc822.Message object
-_noheaders = None
-def noheaders():
- global _noheaders
- if not _noheaders:
- import rfc822
- _noheaders = rfc822.Message(open('/dev/null', 'r'))
- _noheaders.fp.close() # Recycle file descriptor
- return _noheaders
-
-
-# Utility classes
-
-# Class used by open_ftp() for cache of open FTP connections
-class ftpwrapper:
- def __init__(self, host, port):
- self.host = host
- self.port = port
- self.init()
- def init(self):
- import ftplib
- self.ftp = ftplib.FTP()
- self.ftp.connect(self.host, self.port)
- self.ftp.login()
- def retrfile(self, file):
- import ftplib
- try:
- self.ftp.voidcmd('TYPE I')
- except ftplib.all_errors:
- self.init()
- self.ftp.voidcmd('TYPE I')
- conn = None
- if file:
- try:
- cmd = 'RETR ' + file
- conn = self.ftp.transfercmd(cmd)
- except ftplib.error_perm, reason:
- if reason[:3] != '550':
- raise IOError, ('ftp error', reason)
- if not conn:
- # Try a directory listing
- if file: cmd = 'LIST ' + file
- else: cmd = 'LIST'
- conn = self.ftp.transfercmd(cmd)
- return addclosehook(conn.makefile('r'), self.ftp.voidresp)
-
-# Base class for addinfo and addclosehook
-class addbase:
- def __init__(self, fp):
- self.fp = fp
- self.read = self.fp.read
- self.readline = self.fp.readline
- self.readlines = self.fp.readlines
- self.fileno = self.fp.fileno
- def __repr__(self):
- return '<%s at %s whose fp = %s>' % (
- self.__class__.__name__, `id(self)`, `self.fp`)
- def __del__(self):
- self.close()
- def close(self):
- self.read = None
- self.readline = None
- self.readlines = None
- self.fileno = None
- self.fp = None
-
-# Class to add a close hook to an open file
-class addclosehook(addbase):
- def __init__(self, fp, closehook, *hookargs):
- addbase.__init__(self, fp)
- self.closehook = closehook
- self.hookargs = hookargs
- def close(self):
- if self.closehook:
- apply(self.closehook, self.hookargs)
- self.closehook = None
- self.hookargs = None
- addbase.close(self)
-
-# class to add an info() method to an open file
-class addinfo(addbase):
- def __init__(self, fp, headers):
- addbase.__init__(self, fp)
- self.headers = headers
- def info(self):
- return self.headers
-
-
-# Utility to combine a URL with a base URL to form a new URL
-
-def basejoin(base, url):
- type, path = splittype(url)
- if type: return url
- host, path = splithost(path)
- basetype, basepath = splittype(base)
- basehost, basepath = splithost(basepath)
- basepath, basetag = splittag(basepath)
- basepath, basequery = splitquery(basepath)
- type = basetype or 'file'
- if path[:1] != '/':
- import string
- i = string.rfind(basepath, '/')
- if i < 0: basepath = '/'
- else: basepath = basepath[:i+1]
- path = basepath + path
- if not host: host = basehost
- if host: return type + '://' + host + path
- else: return type + ':' + path
-
-
-# Utilities to parse URLs:
-# unwrap('<URL:type//host/path>') --> 'type//host/path'
-# splittype('type:opaquestring') --> 'type', 'opaquestring'
-# splithost('//host[:port]/path') --> 'host[:port]', '/path'
-# splitport('host:port') --> 'host', 'port'
-# splitquery('/path?query') --> '/path', 'query'
-# splittag('/path#tag') --> '/path', 'tag'
-# splitgophertype('/Xselector') --> 'X', 'selector'
-
-def unwrap(url):
- import string
- url = string.strip(url)
- if url[:1] == '<' and url[-1:] == '>':
- url = string.strip(url[1:-1])
- if url[:4] == 'URL:': url = string.strip(url[4:])
- return url
-
-_typeprog = regex.compile('^\([^/:]+\):\(.*\)$')
-def splittype(url):
- if _typeprog.match(url) >= 0: return _typeprog.group(1, 2)
- return None, url
-
-_hostprog = regex.compile('^//\([^/]+\)\(.*\)$')
-def splithost(url):
- if _hostprog.match(url) >= 0: return _hostprog.group(1, 2)
- return None, url
-
-_portprog = regex.compile('^\(.*\):\([0-9]+\)$')
-def splitport(host):
- if _portprog.match(host) >= 0: return _portprog.group(1, 2)
- return host, None
-
-_queryprog = regex.compile('^\(.*\)\?\([^?]*\)$')
-def splitquery(url):
- if _queryprog.match(url) >= 0: return _queryprog.group(1, 2)
- return url, None
-
-_tagprog = regex.compile('^\(.*\)#\([^#]*\)$')
-def splittag(url):
- if _tagprog.match(url) >= 0: return _tagprog.group(1, 2)
- return url, None
-
-def splitgophertype(selector):
- if selector[:1] == '/' and selector[1:2]:
- return selector[1], selector[2:]
- return None, selector
-
-
-# Test program
-def test():
- import sys
- import regsub
- args = sys.argv[1:]
- if not args:
- args = [
- '/etc/passwd',
- 'file:/etc/passwd',
- 'file://localhost/etc/passwd',
- 'ftp://ftp.cwi.nl/etc/passwd',
- 'gopher://gopher.cwi.nl/11/',
- 'http://www.cwi.nl/index.html',
- ]
- try:
- for url in args:
- print '-'*10, url, '-'*10
- fn, h = urlretrieve(url)
- print fn, h
- if h:
- print '======'
- for k in h.keys(): print k + ':', h[k]
- print '======'
- fp = open(fn, 'r')
- data = fp.read()
- del fp
- print regsub.gsub('\r', '', data)
- fn, h = None, None
- print '-'*40
- finally:
- urlcleanup()
-
-# Run test program when run as a script
-if __name__ == '__main__':
- test()
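
For reference, the removed module's own comments and its test() routine describe a small public interface: urlopen() returns a file-like object whose info() method yields an rfc822.Message, urlretrieve() fetches to a (temporary) local file, and urlcleanup() removes the temporaries. The sketch below is only an illustration of that documented interface; it assumes a Python interpreter of the same vintage with the pre-deletion urlopen.py still importable, and reuses a URL from the module's own test list.

    # Minimal usage sketch of the deleted module's documented interface.
    import urlopen

    # Open a URL and treat the result like an ordinary open file.
    fp = urlopen.urlopen('http://www.cwi.nl/index.html')
    data = fp.read()
    headers = fp.info()                       # rfc822.Message (empty for local files)
    print headers.getheader('Content-Type')
    fp.close()

    # Retrieve to a file instead: (filename, None) for a local object,
    # (tempfilename, headers) for a remote one; then clean up temp files.
    fn, h = urlopen.urlretrieve('http://www.cwi.nl/index.html')
    print fn
    urlopen.urlcleanup()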