summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com>, 2013-12-19 19:21:40 (GMT)
committer: Serhiy Storchaka <storchaka@gmail.com>, 2013-12-19 19:21:40 (GMT)
commit: 16f02d2b06b8741259eb75070c4181a47fb6cfe6 (patch)
tree: e594ab5fc255b5d5e1711ba1523eaf361df94b1f
parent: ed0b87d73c59f0cd1b02cdf5404898f9ca03cdfc (diff)
download: cpython-16f02d2b06b8741259eb75070c4181a47fb6cfe6.zip
          cpython-16f02d2b06b8741259eb75070c4181a47fb6cfe6.tar.gz
          cpython-16f02d2b06b8741259eb75070c4181a47fb6cfe6.tar.bz2
Issue #5815: Fixed support for locales with modifiers. Fixed support for
locale encodings with hyphens.
-rw-r--r-- Lib/locale.py 118
-rw-r--r-- Lib/test/test_locale.py 58
2 files changed, 133 insertions, 43 deletions
diff --git a/Lib/locale.py b/Lib/locale.py
index 2e82c95..3651d74 100644
--- a/Lib/locale.py
+++ b/Lib/locale.py
@@ -336,6 +336,22 @@ def _test():
# overridden below)
_setlocale = setlocale
+def _replace_encoding(code, encoding):
+ if '.' in code:
+ langname = code[:code.index('.')]
+ else:
+ langname = code
+ # Convert the encoding to a C lib compatible encoding string
+ norm_encoding = encodings.normalize_encoding(encoding)
+ #print('norm encoding: %r' % norm_encoding)
+ norm_encoding = encodings.aliases.aliases.get(norm_encoding,
+ norm_encoding)
+ #print('aliased encoding: %r' % norm_encoding)
+ encoding = locale_encoding_alias.get(norm_encoding,
+ norm_encoding)
+ #print('found encoding %r' % encoding)
+ return langname + '.' + encoding
+
def normalize(localename):
""" Returns a normalized locale code for the given locale
@@ -352,55 +368,71 @@ def normalize(localename):
does.
"""
- # Normalize the locale name and extract the encoding
- fullname = localename.lower()
- if ':' in fullname:
+ # Normalize the locale name and extract the encoding and modifier
+ code = localename.lower()
+ if ':' in code:
# ':' is sometimes used as encoding delimiter.
- fullname = fullname.replace(':', '.')
- if '.' in fullname:
- langname, encoding = fullname.split('.')[:2]
- fullname = langname + '.' + encoding
+ code = code.replace(':', '.')
+ if '@' in code:
+ code, modifier = code.split('@', 1)
else:
- langname = fullname
+ modifier = ''
+ if '.' in code:
+ langname, encoding = code.split('.')[:2]
+ else:
+ langname = code
encoding = ''
- # First lookup: fullname (possibly with encoding)
- norm_encoding = encoding.replace('-', '')
- norm_encoding = norm_encoding.replace('_', '')
- lookup_name = langname + '.' + encoding
+ # First lookup: fullname (possibly with encoding and modifier)
+ lang_enc = langname
+ if encoding:
+ norm_encoding = encoding.replace('-', '')
+ norm_encoding = norm_encoding.replace('_', '')
+ lang_enc += '.' + norm_encoding
+ lookup_name = lang_enc
+ if modifier:
+ lookup_name += '@' + modifier
code = locale_alias.get(lookup_name, None)
if code is not None:
return code
- #print 'first lookup failed'
-
- # Second try: langname (without encoding)
- code = locale_alias.get(langname, None)
- if code is not None:
- #print 'langname lookup succeeded'
- if '.' in code:
- langname, defenc = code.split('.')
- else:
- langname = code
- defenc = ''
- if encoding:
- # Convert the encoding to a C lib compatible encoding string
- norm_encoding = encodings.normalize_encoding(encoding)
- #print 'norm encoding: %r' % norm_encoding
- norm_encoding = encodings.aliases.aliases.get(norm_encoding,
- norm_encoding)
- #print 'aliased encoding: %r' % norm_encoding
- encoding = locale_encoding_alias.get(norm_encoding,
- norm_encoding)
- else:
- encoding = defenc
- #print 'found encoding %r' % encoding
- if encoding:
- return langname + '.' + encoding
- else:
- return langname
-
- else:
- return localename
+ #print('first lookup failed')
+
+ if modifier:
+ # Second try: fullname without modifier (possibly with encoding)
+ code = locale_alias.get(lang_enc, None)
+ if code is not None:
+ #print('lookup without modifier succeeded')
+ if '@' not in code:
+ return code + '@' + modifier
+ if code.split('@', 1)[1].lower() == modifier:
+ return code
+ #print('second lookup failed')
+
+ if encoding:
+ # Third try: langname (without encoding, possibly with modifier)
+ lookup_name = langname
+ if modifier:
+ lookup_name += '@' + modifier
+ code = locale_alias.get(lookup_name, None)
+ if code is not None:
+ #print('lookup without encoding succeeded')
+ if '@' not in code:
+ return _replace_encoding(code, encoding)
+ code, modifier = code.split('@', 1)
+ return _replace_encoding(code, encoding) + '@' + modifier
+
+ if modifier:
+ # Fourth try: langname (without encoding and modifier)
+ code = locale_alias.get(langname, None)
+ if code is not None:
+ #print('lookup without modifier and encoding succeeded')
+ if '@' not in code:
+ return _replace_encoding(code, encoding) + '@' + modifier
+ code, defmod = code.split('@', 1)
+ if defmod.lower() == modifier:
+ return _replace_encoding(code, encoding) + '@' + defmod
+
+ return localename
def _parse_localename(localename):
@@ -419,7 +451,7 @@ def _parse_localename(localename):
code = normalize(localename)
if '@' in code:
# Deal with locale modifiers
- code, modifier = code.split('@')
+ code, modifier = code.split('@', 1)
if modifier == 'euro' and '.' not in code:
# Assume Latin-9 for @euro locales. This is bogus,
# since some systems may use other encodings for these
diff --git a/Lib/test/test_locale.py b/Lib/test/test_locale.py
index 48bf36d..76202df 100644
--- a/Lib/test/test_locale.py
+++ b/Lib/test/test_locale.py
@@ -365,6 +365,64 @@ class TestEnUSCollation(BaseLocalizedTest, TestCollation):
self.assertLess(locale.strxfrm('à'), locale.strxfrm('b'))
+class NormalizeTest(unittest.TestCase):
+ def check(self, localename, expected):
+ self.assertEqual(locale.normalize(localename), expected, msg=localename)
+
+ def test_locale_alias(self):
+ for localename, alias in locale.locale_alias.items():
+ with self.subTest(locale=(localename, alias)):
+ self.check(localename, alias)
+
+ def test_empty(self):
+ self.check('', '')
+
+ def test_c(self):
+ self.check('c', 'C')
+ self.check('posix', 'C')
+
+ def test_english(self):
+ self.check('en', 'en_US.ISO8859-1')
+ self.check('EN', 'en_US.ISO8859-1')
+ self.check('en_US', 'en_US.ISO8859-1')
+ self.check('en_us', 'en_US.ISO8859-1')
+ self.check('en_GB', 'en_GB.ISO8859-1')
+ self.check('en_US.UTF-8', 'en_US.UTF-8')
+ self.check('en_US.utf8', 'en_US.UTF-8')
+ self.check('en_US:UTF-8', 'en_US.UTF-8')
+ self.check('en_US.ISO8859-1', 'en_US.ISO8859-1')
+ self.check('en_US.US-ASCII', 'en_US.ISO8859-1')
+ self.check('english', 'en_EN.ISO8859-1')
+
+ def test_hyphenated_encoding(self):
+ self.check('az_AZ.iso88599e', 'az_AZ.ISO8859-9E')
+ self.check('az_AZ.ISO8859-9E', 'az_AZ.ISO8859-9E')
+ self.check('tt_RU.koi8c', 'tt_RU.KOI8-C')
+ self.check('tt_RU.KOI8-C', 'tt_RU.KOI8-C')
+ self.check('lo_LA.cp1133', 'lo_LA.IBM-CP1133')
+ self.check('lo_LA.ibmcp1133', 'lo_LA.IBM-CP1133')
+ self.check('lo_LA.IBM-CP1133', 'lo_LA.IBM-CP1133')
+ self.check('uk_ua.microsoftcp1251', 'uk_UA.CP1251')
+ self.check('uk_ua.microsoft-cp1251', 'uk_UA.CP1251')
+ self.check('ka_ge.georgianacademy', 'ka_GE.GEORGIAN-ACADEMY')
+ self.check('ka_GE.GEORGIAN-ACADEMY', 'ka_GE.GEORGIAN-ACADEMY')
+ self.check('cs_CZ.iso88592', 'cs_CZ.ISO8859-2')
+ self.check('cs_CZ.ISO8859-2', 'cs_CZ.ISO8859-2')
+
+ def test_euro_modifier(self):
+ self.check('de_DE@euro', 'de_DE.ISO8859-15')
+ self.check('en_US.ISO8859-15@euro', 'en_US.ISO8859-15')
+
+ def test_latin_modifier(self):
+ self.check('be_BY.UTF-8@latin', 'be_BY.UTF-8@latin')
+ self.check('sr_RS.UTF-8@latin', 'sr_RS.UTF-8@latin')
+
+ def test_valencia_modifier(self):
+ self.check('ca_ES.UTF-8@valencia', 'ca_ES.UTF-8@valencia')
+ self.check('ca_ES@valencia', 'ca_ES.ISO8859-1@valencia')
+ self.check('ca@valencia', 'ca_ES.ISO8859-1@valencia')
+
+
class TestMiscellaneous(unittest.TestCase):
def test_getpreferredencoding(self):
# Invoke getpreferredencoding to make sure it does not cause exceptions.