diff options
-rw-r--r-- | Lib/encodings/__init__.py | 23 |
1 files changed, 19 insertions, 4 deletions
diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py
index 66bea5c..666afad 100644
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -27,12 +27,17 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
 
 """#"
 
-import codecs, exceptions, re
+import codecs, exceptions, types
 
 _cache = {}
 _unknown = '--unknown--'
 _import_tail = ['*']
-_norm_encoding_RE = re.compile('[^a-zA-Z0-9.]')
+_norm_encoding_map = ('                                              . '
+                      '0123456789       ABCDEFGHIJKLMNOPQRSTUVWXYZ     '
+                      ' abcdefghijklmnopqrstuvwxyz                     '
+                      '                                                '
+                      '                                                '
+                      '                ')
 
 class CodecRegistryError(exceptions.LookupError,
                          exceptions.SystemError):
@@ -45,10 +50,20 @@ def normalize_encoding(encoding):
     Normalization works as follows: all non-alphanumeric
     characters except the dot used for Python package names are
     collapsed and replaced with a single underscore, e.g. ' -;#'
-    becomes '_'.
+    becomes '_'. Leading and trailing underscores are removed.
+
+    Note that encoding names should be ASCII only; if they do use
+    non-ASCII characters, these must be Latin-1 compatible.
 
     """
-    return '_'.join(_norm_encoding_RE.split(encoding))
+    # Make sure we have an 8-bit string, because .translate() works
+    # differently for Unicode strings.
+    if type(encoding) is types.UnicodeType:
+        # Note that .encode('latin-1') does *not* use the codec
+        # registry, so this call doesn't recurse. (See unicodeobject.c
+        # PyUnicode_AsEncodedString() for details)
+        encoding = encoding.encode('latin-1')
+    return '_'.join(encoding.translate(_norm_encoding_map).split())
 
 def search_function(encoding):
 