summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Guido van Rossum <guido@python.org> 1997-11-22 21:49:19 (GMT)
committer: Guido van Rossum <guido@python.org> 1997-11-22 21:49:19 (GMT)
commit: be7c45eec4abe3cbf12c007284360e8fd885e7cf (patch)
tree: 3f419f8661a98ae49fa409270f3435e059dfd5fc
parent: e6c128f428de3e6cb44549983644fa610082cb52 (diff)
download: cpython-be7c45eec4abe3cbf12c007284360e8fd885e7cf.zip
cpython-be7c45eec4abe3cbf12c007284360e8fd885e7cf.tar.gz
cpython-be7c45eec4abe3cbf12c007284360e8fd885e7cf.tar.bz2
New address parser by Ben Escoto replaces
Sjoerd Mullender's parseaddr()
-rw-r--r-- Lib/rfc822.py 372
1 files changed, 247 insertions, 125 deletions
diff --git a/Lib/rfc822.py b/Lib/rfc822.py
index 70ac1f8..d0e5bcb 100644
--- a/Lib/rfc822.py
+++ b/Lib/rfc822.py
@@ -221,27 +221,24 @@ class Message:
# ('Guido van Rossum', 'guido@cwi.nl').
def getaddr(self, name):
- try:
- data = self[name]
- except KeyError:
- return None, None
- return parseaddr(data)
+ # New, by Ben Escoto
+ alist = self.getaddrlist(name)
+ if alist:
+ return alist[0]
+ else:
+ return (None, None)
# Retrieve a list of addresses from a header, where each
# address is a tuple as returned by getaddr().
def getaddrlist(self, name):
- # XXX This function is not really correct. The split
- # on ',' might fail in the case of commas within
- # quoted strings.
+ # New, by Ben Escoto
try:
data = self[name]
except KeyError:
return []
- data = string.splitfields(data, ',')
- for i in range(len(data)):
- data[i] = parseaddr(data[i])
- return data
+ a = AddrlistClass(data)
+ return a.getaddrlist()
# Retrieve a date field from a header as a tuple compatible
# with time.mktime().
@@ -291,7 +288,7 @@ class Message:
# Utility functions
# -----------------
-# XXX Should fix these to be really conformant.
+# XXX Should fix unquote() and quote() to be really conformant.
# XXX The inverses of the parse functions may also be useful.
@@ -306,12 +303,7 @@ def unquote(str):
return str
-# Parse an address into (name, address) tuple
-# (By Sjoerd Mullender)
-
-error = 'parseaddr.error'
-
-specials = re.compile(r'[][()<>,.;:@\" \000-\037\177-\377]')
+# Add quotes around a string.
def quote(str):
return '"%s"' % string.join(
@@ -322,114 +314,244 @@ def quote(str):
'"'),
'\\"')
+
+# External interface to parse an address
+
def parseaddr(address):
- token = [] # the current token
- tokens = [] # the list of tokens
- backslash = 0
- dquote = 0
- was_quoted = 0
- space = 0
- paren = 0
- for c in address:
- if backslash:
- token.append(c)
- backslash = 0
- if c == '\\':
- backslash = 1
- was_quoted = 1
- continue
- if dquote:
- if c == '"':
- dquote = 0
- else:
- token.append(c)
- continue
- if c == '"':
- dquote = 1
- was_quoted = 1
- continue
- if paren:
- if c == '(':
- paren = paren + 1
- elif c == ')':
- paren = paren - 1
- if paren == 0:
- token = string.join(token, '')
- tokens.append((2, token))
- token = []
- continue
- token.append(c)
- continue
- if c == '(':
- paren = 1
- token = string.join(token, '')
- tokens.append((was_quoted, token))
- was_quoted = 0
- token = []
- continue
- if c in string.whitespace:
- space = 1
- continue
- if c in '<>@,;:.[]':
- token = string.join(token, '')
- tokens.append((was_quoted, token))
- was_quoted = 0
- token = []
- tokens.append((0, c))
- space = 0
- continue
- if space:
- token = string.join(token, '')
- tokens.append((was_quoted, token))
- was_quoted = 0
- token = []
- space = 0
- token.append(c)
- token = string.join(token, '')
- tokens.append((was_quoted, token))
- if (0, '<') in tokens:
- name = []
- addr = []
- cur = name
- for token in tokens:
- if token[1] == '':
- continue
- if token == (0, '<'):
- if addr:
- raise error, 'syntax error'
- cur = addr
- elif token == (0, '>'):
- if cur is not addr:
- raise error, 'syntax error'
- cur = name
- elif token[0] == 2:
- if cur is name:
- name.append('(' + token[1] + ')')
- else:
- name.append(token[1])
- elif token[0] == 1 and cur is addr:
- if specials.search(token[1]):
- cur.append(quote(token[1]))
- else:
- cur.append(token[1])
- else:
- cur.append(token[1])
+ a = AddrlistClass(address)
+ list = a.getaddrlist()
+ if not list:
+ return (None, None)
+ else:
+ return list[0]
+
+
+# Address parser class by Ben Escoto
+
+class AddrlistClass:
+
+ def __init__(self, field):
+
+ self.specials = '()<>@,:;.\"[]'
+ self.pos = 0
+ self.LWS = ' \t'
+ self.CR = '\r'
+ self.atomends = self.specials + self.LWS + self.CR
+
+ self.field = field
+ self.commentlist = []
+
+
+ def gotonext(self):
+
+ while self.pos < len(self.field):
+ if self.field[self.pos] in self.LWS + '\n\r':
+ self.pos = self.pos + 1
+ elif self.field[self.pos] == '(':
+ self.commentlist.append(self.getcomment())
+ else: break
+
+ def getaddrlist(self):
+
+ ad = self.getaddress()
+ if ad:
+ return ad + self.getaddrlist()
+ else: return []
+
+ def getaddress(self):
+ self.commentlist = []
+ self.gotonext()
+
+ oldpos = self.pos
+ oldcl = self.commentlist
+ plist = self.getphraselist()
+
+ self.gotonext()
+ returnlist = []
+
+ if self.pos >= len(self.field):
+ # Bad email address technically, no domain.
+ if plist:
+ returnlist = [(string.join(self.commentlist), plist[0])]
+
+ elif self.field[self.pos] in '.@':
+ # email address is just an addrspec
+ # this isn't very efficient since we start over
+ self.pos = oldpos
+ self.commentlist = oldcl
+ addrspec = self.getaddrspec()
+ returnlist = [(string.join(self.commentlist), addrspec)]
+
+ elif self.field[self.pos] == ':':
+ # address is a group
+ returnlist = []
+
+ self.pos = self.pos + 1
+ while self.pos < len(self.field):
+ self.gotonext()
+ if self.field[self.pos] == ';':
+ self.pos = self.pos + 1
+ break
+ returnlist = returnlist + self.getaddress()
+
+ elif self.field[self.pos] == '<':
+ # Address is a phrase then a route addr
+ routeaddr = self.getrouteaddr()
+
+ if self.commentlist:
+ returnlist = [(string.join(plist) + ' (' + \
+ string.join(self.commentlist) + ')', routeaddr)]
+ else: returnlist = [(string.join(plist), routeaddr)]
+
else:
- name = []
- addr = []
- for token in tokens:
- if token[1] == '':
- continue
- if token[0] == 2:
- name.append(token[1])
- elif token[0] == 1:
- if specials.search(token[1]):
- addr.append(quote(token[1]))
- else:
- addr.append(token[1])
- else:
- addr.append(token[1])
- return string.join(name, ' '), string.join(addr, '')
+ if plist:
+ returnlist = [(string.join(self.commentlist), plist[0])]
+
+ self.gotonext()
+ if self.pos < len(self.field) and self.field[self.pos] == ',':
+ self.pos = self.pos + 1
+ return returnlist
+
+
+ def getrouteaddr(self):
+ # This just skips all the route stuff and returns the addrspec
+ if self.field[self.pos] != '<':
+ return
+
+ expectroute = 0
+ self.pos = self.pos + 1
+ self.gotonext()
+ while self.pos < len(self.field):
+ if expectroute:
+ self.getdomain()
+ expectroute = 0
+ elif self.field[self.pos] == '>':
+ self.pos = self.pos + 1
+ break
+ elif self.field[self.pos] == '@':
+ self.pos = self.pos + 1
+ expectroute = 1
+ elif self.field[self.pos] == ':':
+ self.pos = self.pos + 1
+ expectaddrspec = 1
+ else:
+ adlist = self.getaddrspec()
+ self.pos = self.pos + 1
+ break
+ self.gotonext()
+
+ return adlist
+
+
+ def getaddrspec(self):
+
+ aslist = []
+
+ self.gotonext()
+ while self.pos < len(self.field):
+ if self.field[self.pos] == '.':
+ aslist.append('.')
+ self.pos = self.pos + 1
+ elif self.field[self.pos] == '"':
+ aslist.append(self.getquote())
+ elif self.field[self.pos] in self.atomends:
+ break
+ else: aslist.append(self.getatom())
+ self.gotonext()
+
+ if self.pos >= len(self.field) or self.field[self.pos] != '@':
+ return string.join(aslist, '')
+
+ aslist.append('@')
+ self.pos = self.pos + 1
+ self.gotonext()
+ return string.join(aslist, '') + self.getdomain()
+
+
+ def getdomain(self):
+
+ sdlist = []
+ while self.pos < len(self.field):
+ if self.field[self.pos] in self.LWS:
+ self.pos = self.pos + 1
+ elif self.field[self.pos] == '(':
+ self.commentlist.append(self.getcomment())
+ elif self.field[self.pos] == '[':
+ sdlist.append(self.getdomainliteral())
+ elif self.field[self.pos] == '.':
+ self.pos = self.pos + 1
+ sdlist.append('.')
+ elif self.field[self.pos] in self.atomends:
+ break
+ else: sdlist.append(self.getatom())
+
+ return string.join(sdlist, '')
+
+
+ def getdelimited(self, beginchar, endchars, allowcomments = 1):
+
+ if self.field[self.pos] != beginchar:
+ return ''
+
+ slist = ['']
+ quote = 0
+ self.pos = self.pos + 1
+ while self.pos < len(self.field):
+ if quote == 1:
+ slist.append(self.field[self.pos])
+ quote = 0
+ elif self.field[self.pos] in endchars:
+ self.pos = self.pos + 1
+ break
+ elif allowcomments and self.field[self.pos] == '(':
+ slist.append(self.getcomment())
+ elif self.field[self.pos] == '\\':
+ quote = 1
+ else:
+ slist.append(self.field[self.pos])
+ self.pos = self.pos + 1
+
+ return string.join(slist, '')
+
+ def getquote(self):
+ return self.getdelimited('"', '"\r', 0)
+
+ def getcomment(self):
+ return self.getdelimited('(', ')\r', 1)
+
+ def getdomainliteral(self):
+ return self.getdelimited('[', ']\r', 0)
+
+
+ def getatom(self):
+
+ atomlist = ['']
+
+ while self.pos < len(self.field):
+ if self.field[self.pos] in self.atomends:
+ break
+ else: atomlist.append(self.field[self.pos])
+ self.pos = self.pos + 1
+
+ return string.join(atomlist, '')
+
+
+ def getphraselist(self):
+
+ plist = []
+
+ while self.pos < len(self.field):
+ if self.field[self.pos] in self.LWS:
+ self.pos = self.pos + 1
+ elif self.field[self.pos] == '"':
+ plist.append(self.getquote())
+ elif self.field[self.pos] == '(':
+ self.commentlist.append(self.getcomment())
+ elif self.field[self.pos] in self.atomends:
+ break
+ else: plist.append(self.getatom())
+
+ return plist
# Parse a date field