summary | refs | log | tree | commit | diff | stats
path: root/Lib/encodings
diff options
context:
space:
mode:
author: Marc-André Lemburg <mal@egenix.com> 2003-05-16 17:07:51 (GMT)
committer: Marc-André Lemburg <mal@egenix.com> 2003-05-16 17:07:51 (GMT)
commit: 282012593510a285fec5b8b5e42b04fef3ffffe0 (patch)
tree: d95d49ec6f9a35e2bf7f5d6d3137929c4c4358cd /Lib/encodings
parent: 813cec9a620e164d749ae9d36b72d0efd9260c07 (diff)
download: cpython-282012593510a285fec5b8b5e42b04fef3ffffe0.zip
cpython-282012593510a285fec5b8b5e42b04fef3ffffe0.tar.gz
cpython-282012593510a285fec5b8b5e42b04fef3ffffe0.tar.bz2
Remove usage of re module from encodings package search function.
Diffstat (limited to 'Lib/encodings')
-rw-r--r-- Lib/encodings/__init__.py 23
1 files changed, 19 insertions, 4 deletions
diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py
index 66bea5c..666afad 100644
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -27,12 +27,17 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
"""#"
-import codecs, exceptions, re
+import codecs, exceptions, types
_cache = {}
_unknown = '--unknown--'
_import_tail = ['*']
-_norm_encoding_RE = re.compile('[^a-zA-Z0-9.]')
+_norm_encoding_map = (' . '
+ '0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ '
+ ' abcdefghijklmnopqrstuvwxyz '
+ ' '
+ ' '
+ ' ')
class CodecRegistryError(exceptions.LookupError,
exceptions.SystemError):
@@ -45,10 +50,20 @@ def normalize_encoding(encoding):
Normalization works as follows: all non-alphanumeric
characters except the dot used for Python package names are
collapsed and replaced with a single underscore, e.g. ' -;#'
- becomes '_'.
+ becomes '_'. Leading and trailing underscores are removed.
+
+ Note that encoding names should be ASCII only; if they do use
+ non-ASCII characters, these must be Latin-1 compatible.
"""
- return '_'.join(_norm_encoding_RE.split(encoding))
+ # Make sure we have an 8-bit string, because .translate() works
+ # differently for Unicode strings.
+ if type(encoding) is types.UnicodeType:
+ # Note that .encode('latin-1') does *not* use the codec
+ # registry, so this call doesn't recurse. (See unicodeobject.c
+ # PyUnicode_AsEncodedString() for details)
+ encoding = encoding.encode('latin-1')
+ return '_'.join(encoding.translate(_norm_encoding_map).split())
def search_function(encoding):