From 23acc9590c89d1e230ba6f8faaba55f7250a0b88 Mon Sep 17 00:00:00 2001
From: Guido van Rossum
Date: Mon, 21 Feb 1994 16:36:04 +0000
Subject: Towards a standard access mechanism for URLs.

---
 Lib/gopherlib.py | 195 ++++++++++++++++++++++++++++++++++++++++++++++++
 Lib/httplib.py | 129 ++++++++++++++++++++++++++++++++
 Lib/urlopen.py | 222 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 546 insertions(+)
 create mode 100644 Lib/gopherlib.py
 create mode 100644 Lib/httplib.py
 create mode 100755 Lib/urlopen.py

diff --git a/Lib/gopherlib.py b/Lib/gopherlib.py
new file mode 100644
index 0000000..71413ba
--- /dev/null
+++ b/Lib/gopherlib.py
@@ -0,0 +1,195 @@
+# Gopher protocol client interface
+
+import string
+
+# Default selector, host and port
+DEF_SELECTOR = '1/'
+DEF_HOST = 'gopher.micro.umn.edu'
+DEF_PORT = 70
+
+# Recognized file types (one-character codes; see type_to_name() below)
+A_TEXT = '0'
+A_MENU = '1'
+A_CSO = '2'
+A_ERROR = '3'
+A_MACBINHEX = '4'
+A_PCBINHEX = '5'
+A_UUENCODED = '6'
+A_INDEX = '7'
+A_TELNET = '8'
+A_BINARY = '9'
+A_DUPLICATE = '+'
+A_SOUND = 's'
+A_EVENT = 'e'
+A_CALENDAR = 'c'
+A_HTML = 'h'
+A_TN3270 = 'T'
+A_MIME = 'M'
+A_IMAGE = 'I'
+A_WHOIS = 'w'
+A_QUERY = 'q'
+A_GIF = 'g'
+A_HTML = 'h' # HTML file (repeats the A_HTML assignment above)
+A_WWW = 'w' # WWW address (same code as A_WHOIS, so only one name maps back)
+A_PLUS_IMAGE = ':'
+A_PLUS_MOVIE = ';'
+A_PLUS_SOUND = '<'
+
+
+# Map a type code to its A_* name minus the prefix; unknown codes give TYPE='x'
+_names = dir() # names defined in this module so far, i.e. including all A_*
+_type_to_name_map = {} # filled on the first call; must be a dict, not None
+def type_to_name(gtype):
+    global _type_to_name_map
+    if not _type_to_name_map: # still empty: build the reverse map once
+        for name in _names:
+            if name[:2] == 'A_':
+                _type_to_name_map[eval(name)] = name[2:] # eval sees only our own names
+    if _type_to_name_map.has_key(gtype):
+        return _type_to_name_map[gtype]
+    return 'TYPE=' + `gtype`
+
+# Names for characters and strings
+CRLF = '\r\n'
+TAB = '\t'
+
+# Send a selector to host (optional 3rd arg: port), return a file with the reply
+def send_selector(selector, host, *args):
+    import socket
+    import string
+    if args:
+        if args[1:]: raise TypeError, 'too many args'
+        port = args[0]
+    else:
+        port = None
+        i = string.find(host, ':') # no explicit port: accept 'host:port'
+        if i >= 0:
+            host, port = host[:i], string.atoi(host[i+1:])
+    if not port:
+        port = DEF_PORT
+    elif type(port) == type(''): # port was passed as a string
+        port = string.atoi(port)
+    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    s.connect(host, port)
+    s.send(selector + CRLF)
+    s.shutdown(1) # nothing more to send; the reply is read from the file
+    return s.makefile('r')
+
+# Send a selector and a query string (joined by a TAB), return the reply file
+def send_query(selector, query, host, *args):
+    return apply(send_selector, (selector + '\t' + query, host) + args)
+
+# The following functions interpret the data returned by the gopher
+# server according to the expected type, e.g. textfile or directory
+
+# Get a directory in the form of a list of entries (each a list of fields)
+def get_directory(f):
+    import string
+    list = []
+    while 1:
+        line = f.readline()
+        if not line:
+            print '(Unexpected EOF from server)'
+            break
+        if line[-2:] == CRLF:
+            line = line[:-2]
+        elif line[-1:] in CRLF: # lone CR or LF terminator
+            line = line[:-1]
+        if line == '.': # a single dot ends the listing
+            break
+        if not line:
+            print '(Empty line from server)'
+            continue
+        gtype = line[0]
+        parts = string.splitfields(line[1:], TAB)
+        if len(parts) < 4:
+            print '(Bad line from server:', `line`, ')'
+            continue
+        if len(parts) > 4:
+            if parts[4:] != ['+']:
+                print '(Extra info from server:', parts[4:], ')'
+        else:
+            parts.append('') # exactly four fields: pad with an empty fifth
+        parts.insert(0, gtype) # entry = [type, field, field, ...]
+        list.append(parts)
+    return list
+
+# Get a text file as a list of lines, with trailing CRLF stripped
+def get_textfile(f):
+    list = []
+    get_alt_textfile(f, list.append) # collect every line via the callback form
+    return list
+
+# Get a text file and pass each line to a function, with trailing CRLF stripped
+def get_alt_textfile(f, func):
+    while 1:
+        line = f.readline()
+        if not line:
+            print '(Unexpected EOF from server)'
+            break
+        if line[-2:] == CRLF:
+            line = line[:-2]
+        elif line[-1:] in CRLF: # lone CR or LF terminator
+            line = line[:-1]
+        if line == '.': # a single dot ends the text
+            break
+        if line[:2] == '..': # drop one dot of a leading pair
+            line = line[1:]
+        func(line)
+
+# Get a binary file as one solid data block
+def get_binary(f):
+    data = f.read() # reads until end of file
+    return data
+
+# Get a binary file
and pass each block to a function
+def get_alt_binary(f, func, blocksize):
+    while 1:
+        data = f.read(blocksize)
+        if not data: # empty read: no more data
+            break
+        func(data)
+
+# Trivial test program; arguments override the default host and selector
+def test():
+    import sys
+    import getopt
+    opts, args = getopt.getopt(sys.argv[1:], '') # no options are defined
+    selector = DEF_SELECTOR
+    type = selector[0] # first character of the selector is the type code
+    host = DEF_HOST
+    port = DEF_PORT # not referenced again in this function
+    if args:
+        host = args[0]
+        args = args[1:]
+    if args:
+        type = args[0]
+        args = args[1:]
+        if len(type) > 1: # a full selector was given; its first char is the type
+            type, selector = type[0], type
+        else: # NOTE(review): nesting below rebuilt from a flattened listing -- confirm
+            selector = ''
+            if args:
+                selector = args[0]
+                args = args[1:]
+        query = ''
+        if args:
+            query = args[0]
+            args = args[1:]
+    if type == A_INDEX: # index searches carry a query string
+        f = send_query(selector, query, host)
+    else:
+        f = send_selector(selector, host)
+    if type == A_TEXT:
+        list = get_textfile(f)
+        for item in list: print item
+    elif type in (A_MENU, A_INDEX):
+        list = get_directory(f)
+        for item in list: print item
+    else:
+        data = get_binary(f)
+        print 'binary data:', len(data), 'bytes:', `data[:100]`[:40]
+
+# Run the test when run as script
+if __name__ == '__main__':
+    test()
diff --git a/Lib/httplib.py b/Lib/httplib.py
new file mode 100644
index 0000000..ea6e565
--- /dev/null
+++ b/Lib/httplib.py
@@ -0,0 +1,129 @@
+# HTTP client class
+#
+# See the following document for a tentative protocol description:
+#     Hypertext Transfer Protocol (HTTP)        Tim Berners-Lee, CERN
+#     Internet Draft                                       5 Nov 1993
+#     draft-ietf-iiir-http-00.txt                  Expires 5 May 1994
+#
+# Example:
+#
+# >>> from httplib import HTTP
+# >>> h = HTTP('www.cwi.nl')
+# >>> h.putrequest('GET', '/index.html')
+# >>> h.putheader('Accept', 'text/html')
+# >>> h.putheader('Accept', 'text/plain')
+# >>> errcode, errmsg, headers = h.getreply()
+# >>> if errcode == 200:
+# ...     f = h.getfile()
+# ...     print f.read() # Print the raw HTML
+# ...
+# Home Page of CWI, Amsterdam
+# [...many more lines...]
+# >>>
+#
+# Note that an HTTP object is used for a single request -- to issue a
+# second request to the same server, you create a new HTTP object.
+# (This is in accordance with the protocol, which uses a new TCP
+# connection for each request.)
+
+
+import os
+import socket
+import string
+import regex
+import regsub
+import rfc822
+
+HTTP_VERSION = 'HTTP/1.0'
+HTTP_PORT = 80
+
+replypat = regsub.gsub('\\.', '\\\\.', HTTP_VERSION) + \
+          '[ \t]+\([0-9][0-9][0-9]\)\(.*\)'
+replyprog = regex.compile(replypat) # group 1: status code, group 2: rest of line
+
+class HTTP:
+
+    def __init__(self, *args): # any arguments are handed on to connect()
+        self.debuglevel = 0
+        if args: apply(self.connect, args)
+
+    def set_debuglevel(self, debuglevel): # > 0 makes the methods print a trace
+        self.debuglevel = debuglevel
+
+    def connect(self, host, *args): # host may be 'name:port'; optional arg: port
+        if args[1:]: raise TypeError, 'too many args'
+        port = None # always bound, so a bare host name falls back to HTTP_PORT
+        if args: port = args[0]
+        else:
+            i = string.find(host, ':')
+            if i >= 0:
+                host, port = host[:i], host[i+1:]
+                try: port = string.atoi(port)
+                except string.atoi_error: port = None # non-numeric: use default
+        if not port: port = HTTP_PORT
+        self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        if self.debuglevel > 0: print 'connect:', (host, port)
+        self.sock.connect(host, port)
+
+    def send(self, str): # write raw text to the server socket
+        if self.debuglevel > 0: print 'send:', `str`
+        self.sock.send(str)
+
+    def putrequest(self, request, selector): # sends e.g. 'GET /x HTTP/1.0'
+        str = '%s %s %s\r\n' % (request, selector, HTTP_VERSION)
+        self.send(str)
+
+    def putheader(self, header, *args): # extra args become continuation lines
+        str = '%s: %s\r\n' % (header, string.joinfields(args,'\r\n\t'))
+        self.send(str)
+
+    def endheaders(self): # blank line terminating the header block
+        self.send('\r\n')
+
+    def endrequest(self): # stop sending; the socket stays open for the reply
+        if self.debuglevel > 0: print 'shutdown: 1'
+        self.sock.shutdown(1)
+
+    def getreply(self): # returns (errcode, errmsg, headers)
+        self.endrequest()
+        self.file = self.sock.makefile('r')
+        line = self.file.readline()
+        if self.debuglevel > 0: print 'reply:', `line`
+        if replyprog.match(line) < 0: # not a status line for HTTP_VERSION
+            self.headers = None
+            return -1, line, self.headers
+        errcode, errmsg = replyprog.group(1, 2)
+        errcode = string.atoi(errcode)
+        errmsg = string.strip(errmsg)
+        self.headers = rfc822.Message(self.file) # consumes the reply headers
+        return errcode, errmsg, self.headers
+
+    def getfile(self): # file set up by getreply(); call that first
+        return self.file
+
+
+def test(): # usage: httplib.py [-d] [host[:port] [selector]]
+    import sys
+    import getopt
+    opts, args = getopt.getopt(sys.argv[1:], 'd')
+    dl = 0
+    for o, a in opts:
+        if o == '-d': dl = dl + 1 # each -d raises the debug level
+    host = 'www.cwi.nl:80'
+    selector = '/index.html'
+    if args[0:]: host = args[0]
+    if args[1:]: selector = args[1]
+    h = HTTP()
+    h.set_debuglevel(dl)
+    h.connect(host)
+    h.putrequest('GET', selector)
+    errcode, errmsg, headers = h.getreply()
+    print 'errcode =', errcode
+    print 'headers =', headers
+    print 'errmsg =', errmsg
+    if headers:
+        for header in headers.headers: print string.strip(header)
+    print h.getfile().read()
+
+if __name__ == '__main__':
+    test()
diff --git a/Lib/urlopen.py b/Lib/urlopen.py
new file mode 100755
index 0000000..c24b3f9
--- /dev/null
+++ b/Lib/urlopen.py
@@ -0,0 +1,222 @@
+# Open an arbitrary URL
+#
+# See the following document for a tentative description of URLs:
+#     Uniform Resource Locators              Tim Berners-Lee
+#     INTERNET DRAFT                                    CERN
+#     IETF URL Working Group                    14 July 1993
+#     draft-ietf-uri-url-01.txt
+#
+# The object returned by urlopen() will differ per protocol.
+# All you know is that it has methods read(), fileno(), close() and info().
+# The read(), fileno() and close() methods work like those of open files.
+# The info() method returns an rfc822.Message object which can be
+# used to query various info about the object, if available.
+# (rfc822.Message objects are queried with the getheader() method.)
+
+import socket
+import regex
+import regsub
+import string
+import rfc822
+import ftplib
+
+
+# External interface -- use urlopen(file) as if it were open(file, 'r')
+def urlopen(url):
+    url = string.strip(url)
+    if url[:1] == '<' and url[-1:] == '>': url = string.strip(url[1:-1])
+    if url[:4] == 'URL:': url = string.strip(url[4:])
+    type, url = splittype(url)
+    if not type: type = 'file' # no 'type:' prefix: treat as a file name
+    type = regsub.gsub('-', '_', type) # make the type usable in a function name
+    try:
+        func = eval('open_' + type) # NOTE(review): eval() of text taken from the URL
+    except NameError:
+        raise IOError, ('url error', 'unknown url type', type)
+    try:
+        return func(url)
+    except socket.error, msg: # report socket failures as IOError
+        raise IOError, ('socket error', msg)
+
+
+# Each routine of the form open_<type> knows how to open that type of URL
+
+# Use HTTP protocol (only a 200 reply yields a file; anything else raises IOError)
+def open_http(url):
+    import httplib
+    host, selector = splithost(url)
+    h = httplib.HTTP(host)
+    h.putrequest('GET', selector)
+    errcode, errmsg, headers = h.getreply()
+    if errcode == 200: return makefile(h.getfile(), headers)
+    else: raise IOError, ('http error', errcode, errmsg, headers)
+
+# Empty rfc822.Message object, returned by info() when there are no headers
+noheaders = rfc822.Message(open('/dev/null', 'r'))
+noheaders.fp.close() # Recycle file descriptor
+
+# Use Gopher protocol (a '?query' part of the selector is sent as a query)
+def open_gopher(url):
+    import gopherlib
+    host, selector = splithost(url)
+    type, selector = splitgophertype(selector) # type is split off but not used
+    selector, query = splitquery(selector)
+    if query: fp = gopherlib.send_query(selector, query, host)
+    else: fp = gopherlib.send_selector(selector, host)
+    return makefile(fp, noheaders)
+
+# Use local file or FTP depending on form of URL
+localhost = socket.gethostbyname('localhost') # resolved once, at import time
+thishost = socket.gethostbyname(socket.gethostname())
+def open_file(url):
+    host, file = splithost(url)
+    if not host: return makefile(open(file, 'r'), noheaders)
+    host, port = splitport(host)
+    if not port and socket.gethostbyname(host) in (localhost, thishost):
+        try: fp = open(file, 'r')
+        except IOError: fp = None # not readable locally: fall through to FTP
+        if fp: return makefile(fp, noheaders)
+    return open_ftp(url)
+
+# Use FTP
protocol
+ftpcache = {} # ftpwrapper objects keyed by (host address, port)
+ftperrors = (ftplib.error_reply,
+             ftplib.error_temp,
+             ftplib.error_perm,
+             ftplib.error_proto)
+def open_ftp(url):
+    host, file = splithost(url)
+    host, port = splitport(host)
+    host = socket.gethostbyname(host) # cache by address, not by alias
+    if port: port = string.atoi(port) # splitport() hands back the digits as a string
+    else: port = ftplib.FTP_PORT
+    key = (host, port)
+    try:
+        if not ftpcache.has_key(key): ftpcache[key] = ftpwrapper(host, port)
+        return makefile(ftpcache[key].retrfile(file), noheaders)
+    except ftperrors, msg: # report FTP failures as IOError
+        raise IOError, ('ftp error', msg)
+
+
+# Utility classes
+
+# Class used to add an info() method to a file object
+class makefile:
+    def __init__(self, fp, headers):
+        self.fp = fp
+        self.headers = headers
+        self.read = self.fp.read # expose the wrapped object's own methods
+        self.fileno = self.fp.fileno
+        self.close = self.fp.close
+    def info(self):
+        return self.headers
+
+# Class used by open_ftp() for cache of open FTP connections
+class ftpwrapper:
+    def __init__(self, host, port):
+        self.host = host
+        self.port = port
+        self.init()
+    def init(self): # (re)connect and log in with ftplib's default login
+        self.ftp = ftplib.FTP()
+        self.ftp.connect(self.host, self.port)
+        self.ftp.login()
+    def retrfile(self, file): # returns a fakefile reading the data connection
+        try:
+            self.ftp.voidcmd('TYPE I')
+        except ftplib.all_errors: # cached connection unusable: reconnect once
+            self.init()
+            self.ftp.voidcmd('TYPE I')
+        conn = None
+        if file:
+            try:
+                cmd = 'RETR ' + file
+                conn = self.ftp.transfercmd(cmd)
+            except ftplib.error_perm, reason:
+                if reason[:3] != '550': # a 550 reply falls through to NLST below
+                    raise IOError, ('ftp error', reason)
+        if not conn:
+            # Try a directory listing
+            if file: cmd = 'NLST ' + file
+            else: cmd = 'NLST'
+            conn = self.ftp.transfercmd(cmd)
+        return fakefile(self.ftp, conn)
+
+# Class used by ftpwrapper to handle response when transfer is complete
+class fakefile:
+    def __init__(self, ftp, conn):
+        self.ftp = ftp
+        self.conn = conn
+        self.fp = self.conn.makefile('r')
+        self.read = self.fp.read
+        self.fileno = self.fp.fileno
+    def __del__(self):
+        self.close()
+    def close(self): # safe to call twice: self.ftp is cleared the first time
+        self.conn = None
+        self.fp = None
+        self.read = self.fileno = None # bound methods would keep the file alive
+        if self.ftp: self.ftp.voidresp() # read the server's end-of-transfer reply
+        self.ftp = None
+
+
+# Utilities to
split url parts into components:
+# splittype('type:opaquestring') --> 'type', 'opaquestring'
+# splithost('//host[:port]/path') --> 'host[:port]', '/path'
+# splitport('host:port') --> 'host', 'port'
+# splitquery('/path?query') --> '/path', 'query'
+# splittag('/path#tag') --> '/path', 'tag'
+# splitgophertype('/Xselector') --> 'X', 'selector'
+
+typeprog = regex.compile('^\([^/:]+\):\(.*\)$')
+def splittype(url):
+    if typeprog.match(url) >= 0: return typeprog.group(1, 2)
+    return None, url # no 'type:' prefix
+
+hostprog = regex.compile('^//\([^/]+\)\(.*\)$')
+def splithost(url):
+    if hostprog.match(url) >= 0: return hostprog.group(1, 2)
+    return None, url # does not start with '//host'
+
+portprog = regex.compile('^\(.*\):\([0-9]+\)$')
+def splitport(host):
+    if portprog.match(host) >= 0: return portprog.group(1, 2)
+    return host, None # no trailing ':digits'; the port, if any, is a string
+
+queryprog = regex.compile('^\(.*\)\?\([^?]*\)$')
+def splitquery(url):
+    if queryprog.match(url) >= 0: return queryprog.group(1, 2)
+    return url, None # no '?' in url
+
+tagprog = regex.compile('^\(.*\)#\([^#]*\)$')
+def splittag(url):
+    if tagprog.match(url) >= 0: return tagprog.group(1, 2)
+    return url, None # no '#' in url
+
+def splitgophertype(selector):
+    if selector[:1] == '/' and selector[1:2]: # '/' followed by at least one char
+        return selector[1], selector[2:]
+    return None, selector
+
+
+# Test program: opens each URL given as argument (or a built-in sample list)
+def test():
+    import sys
+    args = sys.argv[1:]
+    if not args:
+        args = [
+            '/etc/passwd',
+            'file:/etc/passwd',
+            'file://localhost/etc/passwd',
+            'ftp://ftp.cwi.nl/etc/passwd',
+            'gopher://gopher.cwi.nl/11/',
+            'http://www.cwi.nl/index.html',
+            ]
+    for arg in args:
+        print '-'*10, arg, '-'*10
+        print regsub.gsub('\r', '', urlopen(arg).read()) # strip CRs for display
+        print '-'*40
+
+# Run test program when run as a script
+if __name__ == '__main__':
+    test()
-- 
cgit v0.12