diff options
-rw-r--r-- | Doc/lib/libstring.tex | 150 | ||||
-rw-r--r-- | Lib/sre.py | 9 | ||||
-rw-r--r-- | Lib/sre_constants.py | 3 | ||||
-rw-r--r-- | Lib/sre_parse.py | 34 | ||||
-rw-r--r-- | Lib/string.py | 140 | ||||
-rw-r--r-- | Lib/test/test_pep292.py | 84 | ||||
-rw-r--r-- | Misc/NEWS | 2 |
7 files changed, 329 insertions, 93 deletions
diff --git a/Doc/lib/libstring.tex b/Doc/lib/libstring.tex index 48d7fc4..2824aeb 100644 --- a/Doc/lib/libstring.tex +++ b/Doc/lib/libstring.tex @@ -4,11 +4,23 @@ \declaremodule{standard}{string} \modulesynopsis{Common string operations.} +The \module{string} package contains a number of useful constants and classes, +as well as some deprecated legacy functions that are also available as methods +on strings. See the module \refmodule{re}\refstmodindex{re} for string +functions based on regular expressions. -This module defines some constants useful for checking character -classes and some useful string functions. See the module -\refmodule{re}\refstmodindex{re} for string functions based on regular -expressions. +In general, all of these objects are exposed directly in the \module{string} +package so users need only import the \module{string} package to begin using +these constants, classes, and functions. + +\begin{notice} +Starting with Python 2.4, the traditional \module{string} module was turned +into a package, however backward compatibility with existing code has been +retained. Code using the \module{string} module that worked prior to Python +2.4 should continue to work unchanged. +\end{notice} + +\subsection{String constants} The constants defined in this module are: @@ -86,11 +98,113 @@ The constants defined in this module are: is undefined. \end{datadesc} +\subsection{Template strings} + +Templates are Unicode strings that can be used to provide string substitutions +as described in \pep{292}. There is a \class{Template} class that is a +subclass of \class{unicode}, overriding the default \method{__mod__()} method. +Instead of the normal \samp{\%}-based substitutions, Template strings support +\samp{\$}-based substitutions, using the following rules: + +\begin{itemize} +\item \samp{\$\$} is an escape; it is replaced with a single \samp{\$}. + +\item \samp{\$identifier} names a substitution placeholder matching a mapping + key of "identifier". 
By default, "identifier" must spell a Python + identifier. The first non-identifier character after the \samp{\$} + character terminates this placeholder specification. + +\item \samp{\$\{identifier\}} is equivalent to \samp{\$identifier}. It is + required when valid identifier characters follow the placeholder but are + not part of the placeholder, e.g. "\$\{noun\}ification". +\end{itemize} + +Any other appearance of \samp{\$} in the string will result in a +\exception{ValueError} being raised. + +Template strings are used just like normal strings, in that the modulus +operator is used to interpolate a dictionary of values into a Template string, +e.g.: + +\begin{verbatim} +>>> from string import Template +>>> s = Template('$who likes $what') +>>> print s % dict(who='tim', what='kung pao') +tim likes kung pao +>>> Template('Give $who $100') % dict(who='tim') +Traceback (most recent call last): +[...] +ValueError: Invalid placeholder at index 10 +\end{verbatim} + +There is also a \class{SafeTemplate} class, derived from \class{Template} +which acts the same as \class{Template}, except that if placeholders are +missing in the interpolation dictionary, no \exception{KeyError} will be +raised. Instead the original placeholder (with or without the braces, as +appropriate) will be used: + +\begin{verbatim} +>>> from string import SafeTemplate +>>> s = SafeTemplate('$who likes $what for ${meal}') +>>> print s % dict(who='tim') +tim likes $what for ${meal} +\end{verbatim} + +The values in the mapping will automatically be converted to Unicode strings, +using the built-in \function{unicode()} function, which will be called without +optional arguments \var{encoding} or \var{errors}. + +Advanced usage: you can derive subclasses of \class{Template} or +\class{SafeTemplate} to use application-specific placeholder rules. To do +this, you override the class attribute \member{pattern}; the value must be a +compiled regular expression object with four named capturing groups. 
The +capturing groups correspond to the rules given above, along with the invalid +placeholder rule: + +\begin{itemize} +\item \var{escaped} -- This group matches the escape sequence, i.e. \samp{\$\$} + in the default pattern. +\item \var{named} -- This group matches the unbraced placeholder name; it + should not include the \samp{\$} in the capturing group. +\item \var{braced} -- This group matches the brace delimited placeholder name; + it should not include either the \samp{\$} or braces in the capturing + group. +\item \var{bogus} -- This group matches any other \samp{\$}. It usually just + matches a single \samp{\$} and should appear last. +\end{itemize} + +\subsection{String functions} + +The following functions are available to operate on string and Unicode +objects. They are not available as string methods. + +\begin{funcdesc}{capwords}{s} + Split the argument into words using \function{split()}, capitalize + each word using \function{capitalize()}, and join the capitalized + words using \function{join()}. Note that this replaces runs of + whitespace characters by a single space, and removes leading and + trailing whitespace. +\end{funcdesc} + +\begin{funcdesc}{maketrans}{from, to} + Return a translation table suitable for passing to + \function{translate()} or \function{regex.compile()}, that will map + each character in \var{from} into the character at the same position + in \var{to}; \var{from} and \var{to} must have the same length. + + \warning{Don't use strings derived from \constant{lowercase} + and \constant{uppercase} as arguments; in some locales, these don't have + the same length. For case conversions, always use + \function{lower()} and \function{upper()}.} +\end{funcdesc} -Many of the functions provided by this module are also defined as -methods of string and Unicode objects; see ``String Methods'' (section -\ref{string-methods}) for more information on those. 
-The functions defined in this module are: +\subsection{Deprecated string functions} + +The following functions are also defined as methods of string and +Unicode objects; see ``String Methods'' (section +\ref{string-methods}) for more information on those. You should consider +these functions as deprecated, although they will not be removed until Python +3.0. The functions defined in this module are: \begin{funcdesc}{atof}{s} \deprecated{2.0}{Use the \function{float()} built-in function.} @@ -138,14 +252,6 @@ The functions defined in this module are: Return a copy of \var{word} with only its first character capitalized. \end{funcdesc} -\begin{funcdesc}{capwords}{s} - Split the argument into words using \function{split()}, capitalize - each word using \function{capitalize()}, and join the capitalized - words using \function{join()}. Note that this replaces runs of - whitespace characters by a single space, and removes leading and - trailing whitespace. -\end{funcdesc} - \begin{funcdesc}{expandtabs}{s\optional{, tabsize}} Expand tabs in a string, i.e.\ replace them by one or more spaces, depending on the current column and the given tab size. The column @@ -188,18 +294,6 @@ The functions defined in this module are: lower case. \end{funcdesc} -\begin{funcdesc}{maketrans}{from, to} - Return a translation table suitable for passing to - \function{translate()} or \function{regex.compile()}, that will map - each character in \var{from} into the character at the same position - in \var{to}; \var{from} and \var{to} must have the same length. - - \warning{Don't use strings derived from \constant{lowercase} - and \constant{uppercase} as arguments; in some locales, these don't have - the same length. For case conversions, always use - \function{lower()} and \function{upper()}.} -\end{funcdesc} - \begin{funcdesc}{split}{s\optional{, sep\optional{, maxsplit}}} Return a list of the words of the string \var{s}. 
If the optional second argument \var{sep} is absent or \code{None}, the words are @@ -105,9 +105,6 @@ __all__ = [ "match", "search", "sub", "subn", "split", "findall", __version__ = "2.2.1" -# this module works under 1.5.2 and later. don't use string methods -import string - # flags I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale @@ -201,7 +198,7 @@ def escape(pattern): s[i] = "\\000" else: s[i] = "\\" + c - return _join(s, pattern) + return pattern[:0].join(s) # -------------------------------------------------------------------- # internals @@ -213,10 +210,6 @@ _pattern_type = type(sre_compile.compile("", 0)) _MAXCACHE = 100 -def _join(seq, sep): - # internal: join into string having the same type as sep - return string.join(seq, sep[:0]) - def _compile(*key): # internal: compile pattern cachekey = (type(key[0]),) + key diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index 002b195..1863f48 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -217,12 +217,11 @@ SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix) SRE_INFO_CHARSET = 4 # pattern starts with character from given set if __name__ == "__main__": - import string def dump(f, d, prefix): items = d.items() items.sort(key=lambda a: a[1]) for k, v in items: - f.write("#define %s_%s %s\n" % (prefix, string.upper(k), v)) + f.write("#define %s_%s %s\n" % (prefix, k.upper(), v)) f = open("sre_constants.h", "w") f.write("""\ /* diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 94d526d..5c4298a 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -12,8 +12,7 @@ # XXX: show string offset and offending character for all errors -# this module works under 1.5.2 and later. 
don't use string methods -import string, sys +import sys from sre_constants import * @@ -63,13 +62,6 @@ FLAGS = { "u": SRE_FLAG_UNICODE, } -# figure out best way to convert hex/octal numbers to integers -try: - int("10", 8) - atoi = int # 2.0 and later -except TypeError: - atoi = string.atoi # 1.5.2 - class Pattern: # master pattern object. keeps track of global attributes def __init__(self): @@ -233,7 +225,7 @@ def isname(name): def _group(escape, groups): # check if the escape string represents a valid group try: - gid = atoi(escape[1:]) + gid = int(escape[1:]) if gid and gid < groups: return gid except ValueError: @@ -256,13 +248,13 @@ def _class_escape(source, escape): escape = escape[2:] if len(escape) != 2: raise error, "bogus escape: %s" % repr("\\" + escape) - return LITERAL, atoi(escape, 16) & 0xff + return LITERAL, int(escape, 16) & 0xff elif escape[1:2] in OCTDIGITS: # octal escape (up to three digits) while source.next in OCTDIGITS and len(escape) < 5: escape = escape + source.get() escape = escape[1:] - return LITERAL, atoi(escape, 8) & 0xff + return LITERAL, int(escape, 8) & 0xff if len(escape) == 2: return LITERAL, ord(escape[1]) except ValueError: @@ -284,12 +276,12 @@ def _escape(source, escape, state): escape = escape + source.get() if len(escape) != 4: raise ValueError - return LITERAL, atoi(escape[2:], 16) & 0xff + return LITERAL, int(escape[2:], 16) & 0xff elif escape[1:2] == "0": # octal escape while source.next in OCTDIGITS and len(escape) < 4: escape = escape + source.get() - return LITERAL, atoi(escape[1:], 8) & 0xff + return LITERAL, int(escape[1:], 8) & 0xff elif escape[1:2] in DIGITS: # octal escape *or* decimal group reference (sigh) if source.next in DIGITS: @@ -298,7 +290,7 @@ def _escape(source, escape, state): source.next in OCTDIGITS): # got three octal digits; this is an octal escape escape = escape + source.get() - return LITERAL, atoi(escape[1:], 8) & 0xff + return LITERAL, int(escape[1:], 8) & 0xff # got at least one decimal 
digit; this is a group reference group = _group(escape, state.groups) if group: @@ -503,9 +495,9 @@ def _parse(source, state): source.seek(here) continue if lo: - min = atoi(lo) + min = int(lo) if hi: - max = atoi(hi) + max = int(hi) if max < min: raise error, "bad repeat interval" else: @@ -617,7 +609,7 @@ def _parse(source, state): raise error, "unknown group name" else: try: - condgroup = atoi(condname) + condgroup = int(condname) except ValueError: raise error, "bad character in group name" else: @@ -730,7 +722,7 @@ def parse_template(source, pattern): if not name: raise error, "bad group name" try: - index = atoi(name) + index = int(name) except ValueError: if not isname(name): raise error, "bad character in group name" @@ -754,7 +746,7 @@ def parse_template(source, pattern): break if not code: this = this[1:] - code = LITERAL, makechar(atoi(this[-6:], 8) & 0xff) + code = LITERAL, makechar(int(this[-6:], 8) & 0xff) if code[0] is LITERAL: literal(code[1]) else: @@ -793,4 +785,4 @@ def expand_template(template, match): raise IndexError except IndexError: raise error, "empty group" - return string.join(literals, sep) + return sep.join(literals) diff --git a/Lib/string.py b/Lib/string.py index bc10c20..d166f38 100644 --- a/Lib/string.py +++ b/Lib/string.py @@ -35,10 +35,116 @@ printable = digits + letters + punctuation + whitespace # Case conversion helpers # Use str to convert Unicode literal in case of -U +# Note that Cookie.py bogusly uses _idmap :( l = map(chr, xrange(256)) _idmap = str('').join(l) del l +# Functions which aren't available as string methods. + +# Capitalize the words in a string, e.g. " aBc dEf " -> "Abc Def". +# See also regsub.capwords(). +def capwords(s, sep=None): + """capwords(s, [sep]) -> string + + Split the argument into words using split, capitalize each + word using capitalize, and join the capitalized words using + join. Note that this replaces runs of whitespace characters by + a single space. 
+ + """ + return (sep or ' ').join([x.capitalize() for x in s.split(sep)]) + + +# Construct a translation string +_idmapL = None +def maketrans(fromstr, tostr): + """maketrans(frm, to) -> string + + Return a translation table (a string of 256 bytes long) + suitable for use in string.translate. The strings frm and to + must be of the same length. + + """ + if len(fromstr) != len(tostr): + raise ValueError, "maketrans arguments must have same length" + global _idmapL + if not _idmapL: + _idmapL = map(None, _idmap) + L = _idmapL[:] + fromstr = map(ord, fromstr) + for i in range(len(fromstr)): + L[fromstr[i]] = tostr[i] + return ''.join(L) + + + +import re as _re + +class Template(unicode): + """A string class for supporting $-substitutions.""" + __slots__ = [] + + # Search for $$, $identifier, ${identifier}, and any bare $'s + pattern = _re.compile(r""" +# Match exactly two $'s -- this is the escape sequence +(?P<escaped>\${2})| +# Match a $ followed by a Python identifier +\$(?P<named>[_a-z][_a-z0-9]*)| +# Match a $ followed by a brace delimited identifier +\${(?P<braced>[_a-z][_a-z0-9]*)}| +# Match any other $'s +(?P<bogus>\$) +""", _re.IGNORECASE | _re.VERBOSE) + + def __mod__(self, mapping): + def convert(mo): + groups = mo.groupdict() + if groups.get('escaped') is not None: + return '$' + if groups.get('bogus') is not None: + raise ValueError('Invalid placeholder at index %d' % + mo.start('bogus')) + val = mapping[groups.get('named') or groups.get('braced')] + return unicode(val) + return self.pattern.sub(convert, self) + + +class SafeTemplate(Template): + """A string class for supporting $-substitutions. + + This class is 'safe' in the sense that you will never get KeyErrors if + there are placeholders missing from the interpolation dictionary. In that + case, you will get the original placeholder in the value string. 
+ """ + __slots__ = [] + + def __mod__(self, mapping): + def convert(mo): + groups = mo.groupdict() + if groups.get('escaped') is not None: + return '$' + if groups.get('bogus') is not None: + raise ValueError('Invalid placeholder at index %d' % + mo.start('bogus')) + named = groups.get('named') + if named is not None: + try: + return unicode(mapping[named]) + except KeyError: + return '$' + named + braced = groups.get('braced') + try: + return unicode(mapping[braced]) + except KeyError: + return '${' + braced + '}' + return self.pattern.sub(convert, self) + + + +# NOTE: Everything below here is deprecated. Use string methods instead. +# This stuff will go away in Python 3.0. + # Backward compatible names for exceptions index_error = ValueError atoi_error = ValueError @@ -336,40 +442,6 @@ def capitalize(s): """ return s.capitalize() -# Capitalize the words in a string, e.g. " aBc dEf " -> "Abc Def". -# See also regsub.capwords(). -def capwords(s, sep=None): - """capwords(s, [sep]) -> string - - Split the argument into words using split, capitalize each - word using capitalize, and join the capitalized words using - join. Note that this replaces runs of whitespace characters by - a single space. - - """ - return join(map(capitalize, s.split(sep)), sep or ' ') - -# Construct a translation string -_idmapL = None -def maketrans(fromstr, tostr): - """maketrans(frm, to) -> string - - Return a translation table (a string of 256 bytes long) - suitable for use in string.translate. The strings frm and to - must be of the same length. 
- - """ - if len(fromstr) != len(tostr): - raise ValueError, "maketrans arguments must have same length" - global _idmapL - if not _idmapL: - _idmapL = map(None, _idmap) - L = _idmapL[:] - fromstr = map(ord, fromstr) - for i in range(len(fromstr)): - L[fromstr[i]] = tostr[i] - return join(L, "") - # Substring replacement (global) def replace(s, old, new, maxsplit=-1): """replace (str, old, new[, maxsplit]) -> string diff --git a/Lib/test/test_pep292.py b/Lib/test/test_pep292.py new file mode 100644 index 0000000..7eff309 --- /dev/null +++ b/Lib/test/test_pep292.py @@ -0,0 +1,84 @@ +# Copyright (C) 2004 Python Software Foundation +# Author: barry@python.org (Barry Warsaw) +# License: http://www.opensource.org/licenses/PythonSoftFoundation.php + +import unittest +from string import Template, SafeTemplate + +class TestTemplate(unittest.TestCase): + + def test_regular_templates(self): + s = Template('$who likes to eat a bag of $what worth $$100') + self.assertEqual(s % dict(who='tim', what='ham'), + 'tim likes to eat a bag of ham worth $100') + self.assertRaises(KeyError, lambda s, d: s % d, s, dict(who='tim')) + + def test_regular_templates_with_braces(self): + s = Template('$who likes ${what} for ${meal}') + self.assertEqual(s % dict(who='tim', what='ham', meal='dinner'), + 'tim likes ham for dinner') + self.assertRaises(KeyError, lambda s, d: s % d, + s, dict(who='tim', what='ham')) + + def test_escapes(self): + eq = self.assertEqual + s = Template('$who likes to eat a bag of $$what worth $$100') + eq(s % dict(who='tim', what='ham'), + 'tim likes to eat a bag of $what worth $100') + s = Template('$who likes $$') + eq(s % dict(who='tim', what='ham'), 'tim likes $') + + def test_percents(self): + s = Template('%(foo)s $foo ${foo}') + self.assertEqual(s % dict(foo='baz'), '%(foo)s baz baz') + s = SafeTemplate('%(foo)s $foo ${foo}') + self.assertEqual(s % dict(foo='baz'), '%(foo)s baz baz') + + def test_stringification(self): + s = Template('tim has eaten $count bags of 
ham today') + self.assertEqual(s % dict(count=7), + 'tim has eaten 7 bags of ham today') + s = SafeTemplate('tim has eaten $count bags of ham today') + self.assertEqual(s % dict(count=7), + 'tim has eaten 7 bags of ham today') + s = SafeTemplate('tim has eaten ${count} bags of ham today') + self.assertEqual(s % dict(count=7), + 'tim has eaten 7 bags of ham today') + + def test_SafeTemplate(self): + eq = self.assertEqual + s = SafeTemplate('$who likes ${what} for ${meal}') + eq(s % dict(who='tim'), + 'tim likes ${what} for ${meal}') + eq(s % dict(what='ham'), + '$who likes ham for ${meal}') + eq(s % dict(what='ham', meal='dinner'), + '$who likes ham for dinner') + eq(s % dict(who='tim', what='ham'), + 'tim likes ham for ${meal}') + eq(s % dict(who='tim', what='ham', meal='dinner'), + 'tim likes ham for dinner') + + def test_invalid_placeholders(self): + raises = self.assertRaises + s = Template('$who likes $') + raises(ValueError, lambda s, d: s % d, s, dict(who='tim')) + s = Template('$who likes ${what)') + raises(ValueError, lambda s, d: s % d, s, dict(who='tim')) + s = Template('$who likes $100') + raises(ValueError, lambda s, d: s % d, s, dict(who='tim')) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestTemplate)) + return suite + + +def test_main(): + from test import test_support + test_support.run_suite(suite()) + + +if __name__ == '__main__': + unittest.main() @@ -57,6 +57,8 @@ Extension modules Library ------- +- PEP 292 classes Template and SafeTemplate are added to the string module. + - tarfile now generates GNU tar files by default. - HTTPResponse has now a getheaders method. |